18ea73c5d628c43d555dcd9f3786139a1ca8bae3
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug verbosity for this file.  Set RT6_DEBUG to 3 to get tracing:
 * at level 3 and above RDBG() and RT6_TRACE() expand to printk calls,
 * below that they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 0,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route that reports
 * -EACCES and hands packets to the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= RTF_REJECT | RTF_NONEXTHOP,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" route entry: a reject route that reports
 * -EINVAL and drops packets through dst_discard in both directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_flags	= RTF_REJECT | RTF_NONEXTHOP,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the route's neighbour entry is not in a NUD_VALID state and at least
 * rtr_probe_interval jiffies have passed since it was last updated, send a
 * Neighbour Solicitation to the target's solicited-node multicast address.
 *
 * The original comment notes that Router Reachability Probes MUST be
 * rate-limited to no more than one per minute; the rate limit enforced
 * here is the per-interface rtr_probe_interval setting.
 *
 * @rt: route to probe; may be NULL, in which case nothing happens.
 *
 * neigh->updated is modified here, so neigh->lock is taken for writing:
 * holding only the read side would let several CPUs pass the interval
 * check and update the timestamp concurrently.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	/* Re-check the state under the lock and apply the rate limit. */
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		write_unlock_bh(&neigh->lock);
		goto out;
	}
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	/* The solicitation itself is sent without holding neigh->lock. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF, probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * A zero lifetime deletes an existing matching route; a non-zero lifetime
 * adds the route if missing, or refreshes the preference of an existing
 * one.  The expiry is then set from the lifetime (or cleared for an
 * infinite lifetime).
 *
 * Returns 0 on success and -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* Full 128-bit prefix present in the option; use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		/* Existing route: refresh the advertised preference. */
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
622
/*
 * BACKTRACK - climb the fib6 tree when the lookup produced the null entry.
 *
 * Must be expanded inside a function that has locals named `rt` and `fn`
 * and labels named `out` and `restart`.  When `rt` is the namespace's null
 * entry, the macro walks from `fn` towards the root: at each step it either
 * moves to the parent or, if the parent has a source subtree that `fn` is
 * not part of, looks `saddr` up in that subtree.  It jumps to `restart` as
 * soon as it reaches a node carrying RTN_RTINFO, and to `out` when it hits
 * a node flagged RTN_TL_ROOT.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn;				\
			else						\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
640
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642                                              struct fib6_table *table,
643                                              struct flowi6 *fl6, int flags)
644 {
645         struct fib6_node *fn;
646         struct rt6_info *rt;
647
648         read_lock_bh(&table->tb6_lock);
649         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
650 restart:
651         rt = fn->leaf;
652         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653         BACKTRACK(net, &fl6->saddr);
654 out:
655         dst_use(&rt->dst, jiffies);
656         read_unlock_bh(&table->tb6_lock);
657         return rt;
658
659 }
660
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662                             const struct in6_addr *saddr, int oif, int strict)
663 {
664         struct flowi6 fl6 = {
665                 .flowi6_oif = oif,
666                 .daddr = *daddr,
667         };
668         struct dst_entry *dst;
669         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
670
671         if (saddr) {
672                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673                 flags |= RT6_LOOKUP_F_HAS_SADDR;
674         }
675
676         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
677         if (dst->error == 0)
678                 return (struct rt6_info *) dst;
679
680         dst_release(dst);
681
682         return NULL;
683 }
684
685 EXPORT_SYMBOL(rt6_lookup);
686
687 /* ip6_ins_rt is called with FREE table->tb6_lock.
688    It takes new route entry, the addition fails by any reason the
689    route is freed. In any case, if caller does not hold it, it may
690    be destroyed.
691  */
692
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
694 {
695         int err;
696         struct fib6_table *table;
697
698         table = rt->rt6i_table;
699         write_lock_bh(&table->tb6_lock);
700         err = fib6_add(&table->tb6_root, rt, info);
701         write_unlock_bh(&table->tb6_lock);
702
703         return err;
704 }
705
706 int ip6_ins_rt(struct rt6_info *rt)
707 {
708         struct nl_info info = {
709                 .nl_net = dev_net(rt->rt6i_dev),
710         };
711         return __ip6_ins_rt(rt, &info);
712 }
713
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715                                       const struct in6_addr *daddr,
716                                       const struct in6_addr *saddr)
717 {
718         struct rt6_info *rt;
719
720         /*
721          *      Clone the route.
722          */
723
724         rt = ip6_rt_copy(ort, daddr);
725
726         if (rt) {
727                 struct neighbour *neigh;
728                 int attempts = !in_softirq();
729
730                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
731                         if (ort->rt6i_dst.plen != 128 &&
732                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733                                 rt->rt6i_flags |= RTF_ANYCAST;
734                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
735                 }
736
737                 rt->rt6i_flags |= RTF_CACHE;
738
739 #ifdef CONFIG_IPV6_SUBTREES
740                 if (rt->rt6i_src.plen && saddr) {
741                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
742                         rt->rt6i_src.plen = 128;
743                 }
744 #endif
745
746         retry:
747                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
748                 if (IS_ERR(neigh)) {
749                         struct net *net = dev_net(rt->rt6i_dev);
750                         int saved_rt_min_interval =
751                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752                         int saved_rt_elasticity =
753                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
754
755                         if (attempts-- > 0) {
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
758
759                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
760
761                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
762                                         saved_rt_elasticity;
763                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764                                         saved_rt_min_interval;
765                                 goto retry;
766                         }
767
768                         if (net_ratelimit())
769                                 printk(KERN_WARNING
770                                        "ipv6: Neighbour table overflow.\n");
771                         dst_free(&rt->dst);
772                         return NULL;
773                 }
774                 dst_set_neighbour(&rt->dst, neigh);
775
776         }
777
778         return rt;
779 }
780
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782                                         const struct in6_addr *daddr)
783 {
784         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
785
786         if (rt) {
787                 rt->rt6i_flags |= RTF_CACHE;
788                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
789         }
790         return rt;
791 }
792
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794                                       struct flowi6 *fl6, int flags)
795 {
796         struct fib6_node *fn;
797         struct rt6_info *rt, *nrt;
798         int strict = 0;
799         int attempts = 3;
800         int err;
801         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
802
803         strict |= flags & RT6_LOOKUP_F_IFACE;
804
805 relookup:
806         read_lock_bh(&table->tb6_lock);
807
808 restart_2:
809         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
810
811 restart:
812         rt = rt6_select(fn, oif, strict | reachable);
813
814         BACKTRACK(net, &fl6->saddr);
815         if (rt == net->ipv6.ip6_null_entry ||
816             rt->rt6i_flags & RTF_CACHE)
817                 goto out;
818
819         dst_hold(&rt->dst);
820         read_unlock_bh(&table->tb6_lock);
821
822         if (!dst_get_neighbour_raw(&rt->dst)
823             && !(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_LOCAL)))
824                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
825         else if (!(rt->dst.flags & DST_HOST))
826                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
827         else
828                 goto out2;
829
830         dst_release(&rt->dst);
831         rt = nrt ? : net->ipv6.ip6_null_entry;
832
833         dst_hold(&rt->dst);
834         if (nrt) {
835                 err = ip6_ins_rt(nrt);
836                 if (!err)
837                         goto out2;
838         }
839
840         if (--attempts <= 0)
841                 goto out2;
842
843         /*
844          * Race condition! In the gap, when table->tb6_lock was
845          * released someone could insert this route.  Relookup.
846          */
847         dst_release(&rt->dst);
848         goto relookup;
849
850 out:
851         if (reachable) {
852                 reachable = 0;
853                 goto restart_2;
854         }
855         dst_hold(&rt->dst);
856         read_unlock_bh(&table->tb6_lock);
857 out2:
858         rt->dst.lastuse = jiffies;
859         rt->dst.__use++;
860
861         return rt;
862 }
863
864 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
865                                             struct flowi6 *fl6, int flags)
866 {
867         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
868 }
869
870 void ip6_route_input(struct sk_buff *skb)
871 {
872         const struct ipv6hdr *iph = ipv6_hdr(skb);
873         struct net *net = dev_net(skb->dev);
874         int flags = RT6_LOOKUP_F_HAS_SADDR;
875         struct flowi6 fl6 = {
876                 .flowi6_iif = skb->dev->ifindex,
877                 .daddr = iph->daddr,
878                 .saddr = iph->saddr,
879                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
880                 .flowi6_mark = skb->mark,
881                 .flowi6_proto = iph->nexthdr,
882         };
883
884         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
885                 flags |= RT6_LOOKUP_F_IFACE;
886
887         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
888 }
889
890 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
891                                              struct flowi6 *fl6, int flags)
892 {
893         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
894 }
895
896 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
897                                     struct flowi6 *fl6)
898 {
899         int flags = 0;
900
901         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
902                 flags |= RT6_LOOKUP_F_IFACE;
903
904         if (!ipv6_addr_any(&fl6->saddr))
905                 flags |= RT6_LOOKUP_F_HAS_SADDR;
906         else if (sk)
907                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
908
909         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
910 }
911
912 EXPORT_SYMBOL(ip6_route_output);
913
914 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
915 {
916         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
917         struct dst_entry *new = NULL;
918
919         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
920         if (rt) {
921                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
922
923                 new = &rt->dst;
924
925                 new->__use = 1;
926                 new->input = dst_discard;
927                 new->output = dst_discard;
928
929                 if (dst_metrics_read_only(&ort->dst))
930                         new->_metrics = ort->dst._metrics;
931                 else
932                         dst_copy_metrics(new, &ort->dst);
933                 rt->rt6i_idev = ort->rt6i_idev;
934                 if (rt->rt6i_idev)
935                         in6_dev_hold(rt->rt6i_idev);
936                 rt->rt6i_expires = 0;
937
938                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
939                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
940                 rt->rt6i_metric = 0;
941
942                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
943 #ifdef CONFIG_IPV6_SUBTREES
944                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
945 #endif
946
947                 dst_free(new);
948         }
949
950         dst_release(dst_orig);
951         return new ? new : ERR_PTR(-ENOMEM);
952 }
953
954 /*
955  *      Destination cache support functions
956  */
957
958 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
959 {
960         struct rt6_info *rt;
961
962         rt = (struct rt6_info *) dst;
963
964         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
965                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
966                         if (!rt->rt6i_peer)
967                                 rt6_bind_peer(rt, 0);
968                         rt->rt6i_peer_genid = rt6_peer_genid();
969                 }
970                 return dst;
971         }
972         return NULL;
973 }
974
975 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
976 {
977         struct rt6_info *rt = (struct rt6_info *) dst;
978
979         if (rt) {
980                 if (rt->rt6i_flags & RTF_CACHE) {
981                         if (rt6_check_expired(rt)) {
982                                 ip6_del_rt(rt);
983                                 dst = NULL;
984                         }
985                 } else {
986                         dst_release(dst);
987                         dst = NULL;
988                 }
989         }
990         return dst;
991 }
992
993 static void ip6_link_failure(struct sk_buff *skb)
994 {
995         struct rt6_info *rt;
996
997         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
998
999         rt = (struct rt6_info *) skb_dst(skb);
1000         if (rt) {
1001                 if (rt->rt6i_flags&RTF_CACHE) {
1002                         dst_set_expires(&rt->dst, 0);
1003                         rt->rt6i_flags |= RTF_EXPIRES;
1004                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1005                         rt->rt6i_node->fn_sernum = -1;
1006         }
1007 }
1008
1009 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1010 {
1011         struct rt6_info *rt6 = (struct rt6_info*)dst;
1012
1013         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1014                 rt6->rt6i_flags |= RTF_MODIFIED;
1015                 if (mtu < IPV6_MIN_MTU) {
1016                         u32 features = dst_metric(dst, RTAX_FEATURES);
1017                         mtu = IPV6_MIN_MTU;
1018                         features |= RTAX_FEATURE_ALLFRAG;
1019                         dst_metric_set(dst, RTAX_FEATURES, features);
1020                 }
1021                 dst_metric_set(dst, RTAX_MTU, mtu);
1022         }
1023 }
1024
1025 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1026 {
1027         struct net_device *dev = dst->dev;
1028         unsigned int mtu = dst_mtu(dst);
1029         struct net *net = dev_net(dev);
1030
1031         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1032
1033         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1034                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1035
1036         /*
1037          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1038          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1039          * IPV6_MAXPLEN is also valid and means: "any MSS,
1040          * rely only on pmtu discovery"
1041          */
1042         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1043                 mtu = IPV6_MAXPLEN;
1044         return mtu;
1045 }
1046
1047 static unsigned int ip6_mtu(const struct dst_entry *dst)
1048 {
1049         struct inet6_dev *idev;
1050         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1051
1052         if (mtu)
1053                 return mtu;
1054
1055         mtu = IPV6_MIN_MTU;
1056
1057         rcu_read_lock();
1058         idev = __in6_dev_get(dst->dev);
1059         if (idev)
1060                 mtu = idev->cnf.mtu6;
1061         rcu_read_unlock();
1062
1063         return mtu;
1064 }
1065
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by icmp6_dst_alloc().  icmp6_dst_lock is taken with
 * spin_lock_bh() around every access to the list in this file.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1068
1069 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1070                                   struct neighbour *neigh,
1071                                   const struct in6_addr *addr)
1072 {
1073         struct rt6_info *rt;
1074         struct inet6_dev *idev = in6_dev_get(dev);
1075         struct net *net = dev_net(dev);
1076
1077         if (unlikely(idev == NULL))
1078                 return NULL;
1079
1080         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1081         if (unlikely(rt == NULL)) {
1082                 in6_dev_put(idev);
1083                 goto out;
1084         }
1085
1086         if (neigh)
1087                 neigh_hold(neigh);
1088         else {
1089                 neigh = ndisc_get_neigh(dev, addr);
1090                 if (IS_ERR(neigh))
1091                         neigh = NULL;
1092         }
1093
1094         rt->dst.flags |= DST_HOST;
1095         rt->dst.output  = ip6_output;
1096         dst_set_neighbour(&rt->dst, neigh);
1097         atomic_set(&rt->dst.__refcnt, 1);
1098         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1099         rt->rt6i_dst.plen = 128;
1100         rt->rt6i_idev     = idev;
1101         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1102
1103         spin_lock_bh(&icmp6_dst_lock);
1104         rt->dst.next = icmp6_dst_gc_list;
1105         icmp6_dst_gc_list = &rt->dst;
1106         spin_unlock_bh(&icmp6_dst_lock);
1107
1108         fib6_force_start_gc(net);
1109
1110 out:
1111         return &rt->dst;
1112 }
1113
1114 int icmp6_dst_gc(void)
1115 {
1116         struct dst_entry *dst, **pprev;
1117         int more = 0;
1118
1119         spin_lock_bh(&icmp6_dst_lock);
1120         pprev = &icmp6_dst_gc_list;
1121
1122         while ((dst = *pprev) != NULL) {
1123                 if (!atomic_read(&dst->__refcnt)) {
1124                         *pprev = dst->next;
1125                         dst_free(dst);
1126                 } else {
1127                         pprev = &dst->next;
1128                         ++more;
1129                 }
1130         }
1131
1132         spin_unlock_bh(&icmp6_dst_lock);
1133
1134         return more;
1135 }
1136
1137 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1138                             void *arg)
1139 {
1140         struct dst_entry *dst, **pprev;
1141
1142         spin_lock_bh(&icmp6_dst_lock);
1143         pprev = &icmp6_dst_gc_list;
1144         while ((dst = *pprev) != NULL) {
1145                 struct rt6_info *rt = (struct rt6_info *) dst;
1146                 if (func(rt, arg)) {
1147                         *pprev = dst->next;
1148                         dst_free(dst);
1149                 } else {
1150                         pprev = &dst->next;
1151                 }
1152         }
1153         spin_unlock_bh(&icmp6_dst_lock);
1154 }
1155
1156 static int ip6_dst_gc(struct dst_ops *ops)
1157 {
1158         unsigned long now = jiffies;
1159         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1160         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1161         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1162         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1163         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1164         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1165         int entries;
1166
1167         entries = dst_entries_get_fast(ops);
1168         if (time_after(rt_last_gc + rt_min_interval, now) &&
1169             entries <= rt_max_size)
1170                 goto out;
1171
1172         net->ipv6.ip6_rt_gc_expire++;
1173         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1174         net->ipv6.ip6_rt_last_gc = now;
1175         entries = dst_entries_get_slow(ops);
1176         if (entries < ops->gc_thresh)
1177                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1178 out:
1179         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1180         return entries > rt_max_size;
1181 }
1182
/* Clean the host part of a prefix. Not necessary in a radix tree,
   but it results in cleaner routing tables.

   Remove it only once everything works!
 */
1188
1189 int ip6_dst_hoplimit(struct dst_entry *dst)
1190 {
1191         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1192         if (hoplimit == 0) {
1193                 struct net_device *dev = dst->dev;
1194                 struct inet6_dev *idev;
1195
1196                 rcu_read_lock();
1197                 idev = __in6_dev_get(dev);
1198                 if (idev)
1199                         hoplimit = idev->cnf.hop_limit;
1200                 else
1201                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1202                 rcu_read_unlock();
1203         }
1204         return hoplimit;
1205 }
1206 EXPORT_SYMBOL(ip6_dst_hoplimit);
1207
1208 /*
1209  *
1210  */
1211
/*
 * Build a route from @cfg and insert it into the FIB table named by
 * cfg->fc_table.
 *
 * Side effects on @cfg: a zero fc_metric is replaced by
 * IP6_RT_PRIO_USER, an RTPROT_UNSPEC fc_protocol by RTPROT_BOOT, and
 * fc_nlinfo.nl_net is overwritten with the namespace of the device the
 * route ends up on.
 *
 * Returns the result of __ip6_ins_rt() once the route has been fully
 * set up, or a negative errno if validation or allocation fails
 * earlier.  On those early failures the references taken on the
 * device and inet6_dev are dropped and the partially built route is
 * freed.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;

	/* Prefix lengths cannot exceed the 128 bits of an IPv6 address. */
	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* Source-specific routes need subtree support. */
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	/* An explicit interface: take references on dev and its idev. */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	table = fib6_new_table(net, cfg->fc_table);
	if (table == NULL) {
		err = -ENOBUFS;
		goto out;
	}

	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);

	if (rt == NULL) {
		err = -ENOMEM;
		goto out;
	}

	rt->dst.obsolete = -1;
	/* fc_expires is in clock_t units; store an absolute jiffies value. */
	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
				0;

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Pick the input handler from the destination type and flags. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
	       rt->dst.flags |= DST_HOST;

	/*
	 * Non-host routes that come with metric attributes get their own
	 * zeroed metrics array here; the attributes themselves are applied
	 * at install_route below.
	 */
	if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
		u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!metrics) {
			err = -ENOMEM;
			goto out;
		}
		dst_init_metrics(&rt->dst, metrics, 0);
	}
#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
					      && !(cfg->fc_flags&RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		rt->dst.error = -ENETUNREACH;
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using non-link-local
			   addresses as the nexthop address.
			   Otherwise, the router will not be able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
			if (!(gwa_type&IPV6_ADDR_UNICAST))
				goto out;

			/* The gateway itself must be reachable via an existing route. */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (grt == NULL)
				goto out;
			if (dev) {
				/* Requested device must match the gateway route's device. */
				if (dev != grt->rt6i_dev) {
					dst_release(&grt->dst);
					goto out;
				}
			} else {
				/* No device given: inherit it from the gateway route. */
				dev = grt->rt6i_dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/* Only a gateway route that is not itself gatewayed is accepted. */
			if (!(grt->rt6i_flags&RTF_GATEWAY))
				err = 0;
			dst_release(&grt->dst);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (dev == NULL)
		goto out;

	/* A preferred source address must pass ipv6_chk_addr() on this device. */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	/* Gateway and NONEXTHOP routes are bound to a neighbour entry now. */
	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
		struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(n)) {
			err = PTR_ERR(n);
			goto out;
		}
		dst_set_neighbour(&rt->dst, n);
	}

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* Apply the metric attributes; attribute type 0 is skipped. */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX) {
					err = -EINVAL;
					goto out;
				}

				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
			}
		}
	}

	/* The dev/idev references taken above are stored in the route. */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
	/* Error path: undo whatever references/allocations were made. */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}
1432
1433 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1434 {
1435         int err;
1436         struct fib6_table *table;
1437         struct net *net = dev_net(rt->rt6i_dev);
1438
1439         if (rt == net->ipv6.ip6_null_entry) {
1440                 err = -ENOENT;
1441                 goto out;
1442         }
1443
1444         table = rt->rt6i_table;
1445         write_lock_bh(&table->tb6_lock);
1446         err = fib6_del(rt, info);
1447         write_unlock_bh(&table->tb6_lock);
1448
1449 out:
1450         dst_release(&rt->dst);
1451         return err;
1452 }
1453
1454 int ip6_del_rt(struct rt6_info *rt)
1455 {
1456         struct nl_info info = {
1457                 .nl_net = dev_net(rt->rt6i_dev),
1458         };
1459         return __ip6_del_rt(rt, &info);
1460 }
1461
1462 static int ip6_route_del(struct fib6_config *cfg)
1463 {
1464         struct fib6_table *table;
1465         struct fib6_node *fn;
1466         struct rt6_info *rt;
1467         int err = -ESRCH;
1468
1469         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1470         if (table == NULL)
1471                 return err;
1472
1473         read_lock_bh(&table->tb6_lock);
1474
1475         fn = fib6_locate(&table->tb6_root,
1476                          &cfg->fc_dst, cfg->fc_dst_len,
1477                          &cfg->fc_src, cfg->fc_src_len);
1478
1479         if (fn) {
1480                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1481                         if (cfg->fc_ifindex &&
1482                             (rt->rt6i_dev == NULL ||
1483                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1484                                 continue;
1485                         if (cfg->fc_flags & RTF_GATEWAY &&
1486                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1487                                 continue;
1488                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1489                                 continue;
1490                         dst_hold(&rt->dst);
1491                         read_unlock_bh(&table->tb6_lock);
1492
1493                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1494                 }
1495         }
1496         read_unlock_bh(&table->tb6_lock);
1497
1498         return err;
1499 }
1500
1501 /*
1502  *      Handle redirects
1503  */
/*
 * Flow key used for redirect lookups: a flowi6 extended with the
 * address of the gateway the redirect came from.
 *
 * fl6 must stay the first member: __ip6_route_redirect() receives a
 * pointer to it and casts that pointer back to struct ip6rd_flowi.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
1508
/*
 * Lookup callback used when validating a redirect.
 *
 * @fl6 is really the fl6 member of a struct ip6rd_flowi, which also
 * carries the gateway the redirect was received from.  The routes on
 * the matching fib6 node are scanned for an unexpired gateway route on
 * the flow's output interface whose gateway equals that address.
 *
 * Returns the matching route, or the namespace's null entry when none
 * is found, with a reference held in either case.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/*
	 * Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
	 *
	 * RFC 2461 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		/*
		 * Current route is on-link; redirect is always invalid.
		 *
		 * Seems, previous statement is not true. It could
		 * be node, which looks for us as on-link (f.e. proxy ndisc)
		 * But then router serving it might decide, that we should
		 * know truth 8)8) --ANK (980726).
		 */
		if (rt6_check_expired(rt))
			continue;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	/*
	 * NOTE(review): BACKTRACK is a macro defined outside this excerpt;
	 * it presumably uses the local names rt/fn and jumps to the
	 * restart/out labels -- confirm before renaming any of them.
	 */
	BACKTRACK(net, &fl6->saddr);
out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	return rt;
};
1562
1563 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1564                                            const struct in6_addr *src,
1565                                            const struct in6_addr *gateway,
1566                                            struct net_device *dev)
1567 {
1568         int flags = RT6_LOOKUP_F_HAS_SADDR;
1569         struct net *net = dev_net(dev);
1570         struct ip6rd_flowi rdfl = {
1571                 .fl6 = {
1572                         .flowi6_oif = dev->ifindex,
1573                         .daddr = *dest,
1574                         .saddr = *src,
1575                 },
1576         };
1577
1578         ipv6_addr_copy(&rdfl.gateway, gateway);
1579
1580         if (rt6_need_strict(dest))
1581                 flags |= RT6_LOOKUP_F_IFACE;
1582
1583         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1584                                                    flags, __ip6_route_redirect);
1585 }
1586
1587 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1588                   const struct in6_addr *saddr,
1589                   struct neighbour *neigh, u8 *lladdr, int on_link)
1590 {
1591         struct rt6_info *rt, *nrt = NULL;
1592         struct netevent_redirect netevent;
1593         struct net *net = dev_net(neigh->dev);
1594
1595         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1596
1597         if (rt == net->ipv6.ip6_null_entry) {
1598                 if (net_ratelimit())
1599                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1600                                "for redirect target\n");
1601                 goto out;
1602         }
1603
1604         /*
1605          *      We have finally decided to accept it.
1606          */
1607
1608         neigh_update(neigh, lladdr, NUD_STALE,
1609                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1610                      NEIGH_UPDATE_F_OVERRIDE|
1611                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1612                                      NEIGH_UPDATE_F_ISROUTER))
1613                      );
1614
1615         /*
1616          * Redirect received -> path was valid.
1617          * Look, redirects are sent only in response to data packets,
1618          * so that this nexthop apparently is reachable. --ANK
1619          */
1620         dst_confirm(&rt->dst);
1621
1622         /* Duplicate redirect: silently ignore. */
1623         if (neigh == dst_get_neighbour_raw(&rt->dst))
1624                 goto out;
1625
1626         nrt = ip6_rt_copy(rt, dest);
1627         if (nrt == NULL)
1628                 goto out;
1629
1630         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1631         if (on_link)
1632                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1633
1634         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1635         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1636
1637         if (ip6_ins_rt(nrt))
1638                 goto out;
1639
1640         netevent.old = &rt->dst;
1641         netevent.new = &nrt->dst;
1642         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1643
1644         if (rt->rt6i_flags&RTF_CACHE) {
1645                 ip6_del_rt(rt);
1646                 return;
1647         }
1648
1649 out:
1650         dst_release(&rt->dst);
1651 }
1652
1653 /*
1654  *      Handle ICMP "packet too big" messages
1655  *      i.e. Path MTU discovery
1656  */
1657
1658 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1659                              struct net *net, u32 pmtu, int ifindex)
1660 {
1661         struct rt6_info *rt, *nrt;
1662         int allfrag = 0;
1663 again:
1664         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1665         if (rt == NULL)
1666                 return;
1667
1668         if (rt6_check_expired(rt)) {
1669                 ip6_del_rt(rt);
1670                 goto again;
1671         }
1672
1673         if (pmtu >= dst_mtu(&rt->dst))
1674                 goto out;
1675
1676         if (pmtu < IPV6_MIN_MTU) {
1677                 /*
1678                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1679                  * MTU (1280) and a fragment header should always be included
1680                  * after a node receiving Too Big message reporting PMTU is
1681                  * less than the IPv6 Minimum Link MTU.
1682                  */
1683                 pmtu = IPV6_MIN_MTU;
1684                 allfrag = 1;
1685         }
1686
1687         /* New mtu received -> path was valid.
1688            They are sent only in response to data packets,
1689            so that this nexthop apparently is reachable. --ANK
1690          */
1691         dst_confirm(&rt->dst);
1692
1693         /* Host route. If it is static, it would be better
1694            not to override it, but add new one, so that
1695            when cache entry will expire old pmtu
1696            would return automatically.
1697          */
1698         if (rt->rt6i_flags & RTF_CACHE) {
1699                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1700                 if (allfrag) {
1701                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1702                         features |= RTAX_FEATURE_ALLFRAG;
1703                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1704                 }
1705                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1706                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1707                 goto out;
1708         }
1709
1710         /* Network route.
1711            Two cases are possible:
1712            1. It is connected route. Action: COW
1713            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1714          */
1715         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1716                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1717         else
1718                 nrt = rt6_alloc_clone(rt, daddr);
1719
1720         if (nrt) {
1721                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1722                 if (allfrag) {
1723                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1724                         features |= RTAX_FEATURE_ALLFRAG;
1725                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1726                 }
1727
1728                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1729                  * happened within 5 mins, the recommended timer is 10 mins.
1730                  * Here this route expiration time is set to ip6_rt_mtu_expires
1731                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1732                  * and detecting PMTU increase will be automatically happened.
1733                  */
1734                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1735                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1736
1737                 ip6_ins_rt(nrt);
1738         }
1739 out:
1740         dst_release(&rt->dst);
1741 }
1742
1743 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1744                         struct net_device *dev, u32 pmtu)
1745 {
1746         struct net *net = dev_net(dev);
1747
1748         /*
1749          * RFC 1981 states that a node "MUST reduce the size of the packets it
1750          * is sending along the path" that caused the Packet Too Big message.
1751          * Since it's not possible in the general case to determine which
1752          * interface was used to send the original packet, we update the MTU
1753          * on the interface that will be used to send future packets. We also
1754          * update the MTU on the interface that received the Packet Too Big in
1755          * case the original packet was forced out that interface with
1756          * SO_BINDTODEVICE or similar. This is the next best thing to the
1757          * correct behaviour, which would be to update the MTU on all
1758          * interfaces.
1759          */
1760         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1761         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1762 }
1763
1764 /*
1765  *      Misc support functions
1766  */
1767
1768 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1769                                     const struct in6_addr *dest)
1770 {
1771         struct net *net = dev_net(ort->rt6i_dev);
1772         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1773                                             ort->dst.dev, 0);
1774
1775         if (rt) {
1776                 rt->dst.input = ort->dst.input;
1777                 rt->dst.output = ort->dst.output;
1778                 rt->dst.flags |= DST_HOST;
1779
1780                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1781                 rt->rt6i_dst.plen = 128;
1782                 dst_copy_metrics(&rt->dst, &ort->dst);
1783                 rt->dst.error = ort->dst.error;
1784                 rt->rt6i_idev = ort->rt6i_idev;
1785                 if (rt->rt6i_idev)
1786                         in6_dev_hold(rt->rt6i_idev);
1787                 rt->dst.lastuse = jiffies;
1788                 rt->rt6i_expires = 0;
1789
1790                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1791                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1792                 rt->rt6i_metric = 0;
1793
1794 #ifdef CONFIG_IPV6_SUBTREES
1795                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1796 #endif
1797                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1798                 rt->rt6i_table = ort->rt6i_table;
1799         }
1800         return rt;
1801 }
1802
1803 #ifdef CONFIG_IPV6_ROUTE_INFO
1804 static struct rt6_info *rt6_get_route_info(struct net *net,
1805                                            const struct in6_addr *prefix, int prefixlen,
1806                                            const struct in6_addr *gwaddr, int ifindex)
1807 {
1808         struct fib6_node *fn;
1809         struct rt6_info *rt = NULL;
1810         struct fib6_table *table;
1811
1812         table = fib6_get_table(net, RT6_TABLE_INFO);
1813         if (table == NULL)
1814                 return NULL;
1815
1816         write_lock_bh(&table->tb6_lock);
1817         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1818         if (!fn)
1819                 goto out;
1820
1821         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1822                 if (rt->rt6i_dev->ifindex != ifindex)
1823                         continue;
1824                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1825                         continue;
1826                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1827                         continue;
1828                 dst_hold(&rt->dst);
1829                 break;
1830         }
1831 out:
1832         write_unlock_bh(&table->tb6_lock);
1833         return rt;
1834 }
1835
1836 static struct rt6_info *rt6_add_route_info(struct net *net,
1837                                            const struct in6_addr *prefix, int prefixlen,
1838                                            const struct in6_addr *gwaddr, int ifindex,
1839                                            unsigned pref)
1840 {
1841         struct fib6_config cfg = {
1842                 .fc_table       = RT6_TABLE_INFO,
1843                 .fc_metric      = IP6_RT_PRIO_USER,
1844                 .fc_ifindex     = ifindex,
1845                 .fc_dst_len     = prefixlen,
1846                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1847                                   RTF_UP | RTF_PREF(pref),
1848                 .fc_nlinfo.pid = 0,
1849                 .fc_nlinfo.nlh = NULL,
1850                 .fc_nlinfo.nl_net = net,
1851         };
1852
1853         ipv6_addr_copy(&cfg.fc_dst, prefix);
1854         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1855
1856         /* We should treat it as a default route if prefix length is 0. */
1857         if (!prefixlen)
1858                 cfg.fc_flags |= RTF_DEFAULT;
1859
1860         ip6_route_add(&cfg);
1861
1862         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1863 }
1864 #endif
1865
1866 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1867 {
1868         struct rt6_info *rt;
1869         struct fib6_table *table;
1870
1871         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1872         if (table == NULL)
1873                 return NULL;
1874
1875         write_lock_bh(&table->tb6_lock);
1876         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1877                 if (dev == rt->rt6i_dev &&
1878                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1879                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1880                         break;
1881         }
1882         if (rt)
1883                 dst_hold(&rt->dst);
1884         write_unlock_bh(&table->tb6_lock);
1885         return rt;
1886 }
1887
1888 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1889                                      struct net_device *dev,
1890                                      unsigned int pref)
1891 {
1892         struct fib6_config cfg = {
1893                 .fc_table       = RT6_TABLE_DFLT,
1894                 .fc_metric      = IP6_RT_PRIO_USER,
1895                 .fc_ifindex     = dev->ifindex,
1896                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1897                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1898                 .fc_nlinfo.pid = 0,
1899                 .fc_nlinfo.nlh = NULL,
1900                 .fc_nlinfo.nl_net = dev_net(dev),
1901         };
1902
1903         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1904
1905         ip6_route_add(&cfg);
1906
1907         return rt6_get_dflt_router(gwaddr, dev);
1908 }
1909
1910 void rt6_purge_dflt_routers(struct net *net)
1911 {
1912         struct rt6_info *rt;
1913         struct fib6_table *table;
1914
1915         /* NOTE: Keep consistent with rt6_get_dflt_router */
1916         table = fib6_get_table(net, RT6_TABLE_DFLT);
1917         if (table == NULL)
1918                 return;
1919
1920 restart:
1921         read_lock_bh(&table->tb6_lock);
1922         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1923                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1924                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1925                         dst_hold(&rt->dst);
1926                         read_unlock_bh(&table->tb6_lock);
1927                         ip6_del_rt(rt);
1928                         goto restart;
1929                 }
1930         }
1931         read_unlock_bh(&table->tb6_lock);
1932 }
1933
1934 static void rtmsg_to_fib6_config(struct net *net,
1935                                  struct in6_rtmsg *rtmsg,
1936                                  struct fib6_config *cfg)
1937 {
1938         memset(cfg, 0, sizeof(*cfg));
1939
1940         cfg->fc_table = RT6_TABLE_MAIN;
1941         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1942         cfg->fc_metric = rtmsg->rtmsg_metric;
1943         cfg->fc_expires = rtmsg->rtmsg_info;
1944         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1945         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1946         cfg->fc_flags = rtmsg->rtmsg_flags;
1947
1948         cfg->fc_nlinfo.nl_net = net;
1949
1950         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1951         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1952         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1953 }
1954
1955 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1956 {
1957         struct fib6_config cfg;
1958         struct in6_rtmsg rtmsg;
1959         int err;
1960
1961         switch(cmd) {
1962         case SIOCADDRT:         /* Add a route */
1963         case SIOCDELRT:         /* Delete a route */
1964                 if (!capable(CAP_NET_ADMIN))
1965                         return -EPERM;
1966                 err = copy_from_user(&rtmsg, arg,
1967                                      sizeof(struct in6_rtmsg));
1968                 if (err)
1969                         return -EFAULT;
1970
1971                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1972
1973                 rtnl_lock();
1974                 switch (cmd) {
1975                 case SIOCADDRT:
1976                         err = ip6_route_add(&cfg);
1977                         break;
1978                 case SIOCDELRT:
1979                         err = ip6_route_del(&cfg);
1980                         break;
1981                 default:
1982                         err = -EINVAL;
1983                 }
1984                 rtnl_unlock();
1985
1986                 return err;
1987         }
1988
1989         return -EINVAL;
1990 }
1991
1992 /*
1993  *      Drop the packet on the floor
1994  */
1995
1996 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1997 {
1998         int type;
1999         struct dst_entry *dst = skb_dst(skb);
2000         switch (ipstats_mib_noroutes) {
2001         case IPSTATS_MIB_INNOROUTES:
2002                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2003                 if (type == IPV6_ADDR_ANY) {
2004                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2005                                       IPSTATS_MIB_INADDRERRORS);
2006                         break;
2007                 }
2008                 /* FALLTHROUGH */
2009         case IPSTATS_MIB_OUTNOROUTES:
2010                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2011                               ipstats_mib_noroutes);
2012                 break;
2013         }
2014         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2015         kfree_skb(skb);
2016         return 0;
2017 }
2018
2019 static int ip6_pkt_discard(struct sk_buff *skb)
2020 {
2021         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2022 }
2023
2024 static int ip6_pkt_discard_out(struct sk_buff *skb)
2025 {
2026         skb->dev = skb_dst(skb)->dev;
2027         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2028 }
2029
2030 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2031
2032 static int ip6_pkt_prohibit(struct sk_buff *skb)
2033 {
2034         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2035 }
2036
2037 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2038 {
2039         skb->dev = skb_dst(skb)->dev;
2040         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2041 }
2042
2043 #endif
2044
2045 /*
2046  *      Allocate a dst for local (unicast / anycast) address.
2047  */
2048
2049 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2050                                     const struct in6_addr *addr,
2051                                     int anycast)
2052 {
2053         struct net *net = dev_net(idev->dev);
2054         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2055                                             net->loopback_dev, 0);
2056         struct neighbour *neigh;
2057
2058         if (rt == NULL) {
2059                 if (net_ratelimit())
2060                         pr_warning("IPv6:  Maximum number of routes reached,"
2061                                    " consider increasing route/max_size.\n");
2062                 return ERR_PTR(-ENOMEM);
2063         }
2064
2065         in6_dev_hold(idev);
2066
2067         rt->dst.flags |= DST_HOST;
2068         rt->dst.input = ip6_input;
2069         rt->dst.output = ip6_output;
2070         rt->rt6i_idev = idev;
2071         rt->dst.obsolete = -1;
2072
2073         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2074         if (anycast)
2075                 rt->rt6i_flags |= RTF_ANYCAST;
2076         else
2077                 rt->rt6i_flags |= RTF_LOCAL;
2078         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2079         if (IS_ERR(neigh)) {
2080                 dst_free(&rt->dst);
2081
2082                 return ERR_CAST(neigh);
2083         }
2084         dst_set_neighbour(&rt->dst, neigh);
2085
2086         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2087         rt->rt6i_dst.plen = 128;
2088         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2089
2090         atomic_set(&rt->dst.__refcnt, 1);
2091
2092         return rt;
2093 }
2094
2095 int ip6_route_get_saddr(struct net *net,
2096                         struct rt6_info *rt,
2097                         const struct in6_addr *daddr,
2098                         unsigned int prefs,
2099                         struct in6_addr *saddr)
2100 {
2101         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2102         int err = 0;
2103         if (rt->rt6i_prefsrc.plen)
2104                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2105         else
2106                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2107                                          daddr, prefs, saddr);
2108         return err;
2109 }
2110
2111 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle that rt6_remove_prefsrc() hands to fib6_remove_prefsrc()
 * through fib6_clean_all().
 */
2112 struct arg_dev_net_ip {
2113         struct net_device *dev;        /* only routes on this device; NULL matches any */
2114         struct net *net;               /* namespace whose ip6_null_entry is skipped */
2115         struct in6_addr *addr;         /* address compared against rt6i_prefsrc.addr */
2116 };
2117
2118 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2119 {
2120         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2121         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2122         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2123
2124         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2125             rt != net->ipv6.ip6_null_entry &&
2126             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2127                 /* remove prefsrc entry */
2128                 rt->rt6i_prefsrc.plen = 0;
2129         }
2130         return 0;
2131 }
2132
2133 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2134 {
2135         struct net *net = dev_net(ifp->idev->dev);
2136         struct arg_dev_net_ip adni = {
2137                 .dev = ifp->idev->dev,
2138                 .net = net,
2139                 .addr = &ifp->addr,
2140         };
2141         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2142 }
2143
/* Argument bundle that rt6_ifdown() hands to the fib6_ifdown() callback. */
2144 struct arg_dev_net {
2145         struct net_device *dev;        /* device to match; NULL matches every route */
2146         struct net *net;               /* namespace whose ip6_null_entry is spared */
2147 };
2148
2149 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2150 {
2151         const struct arg_dev_net *adn = arg;
2152         const struct net_device *dev = adn->dev;
2153
2154         if ((rt->rt6i_dev == dev || dev == NULL) &&
2155             rt != adn->net->ipv6.ip6_null_entry) {
2156                 RT6_TRACE("deleted by ifdown %p\n", rt);
2157                 return -1;
2158         }
2159         return 0;
2160 }
2161
2162 void rt6_ifdown(struct net *net, struct net_device *dev)
2163 {
2164         struct arg_dev_net adn = {
2165                 .dev = dev,
2166                 .net = net,
2167         };
2168
2169         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2170         icmp6_clean_all(fib6_ifdown, &adn);
2171 }
2172
/* Argument bundle that rt6_mtu_change() hands to rt6_mtu_change_route(). */
2173 struct rt6_mtu_change_arg
2174 {
2175         struct net_device *dev;        /* device whose routes are examined */
2176         unsigned mtu;                  /* MTU compared against and written to RTAX_MTU */
2177 };
2178
2179 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2180 {
2181         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2182         struct inet6_dev *idev;
2183
2184         /* In IPv6 pmtu discovery is not optional,
2185            so that RTAX_MTU lock cannot disable it.
2186            We still use this lock to block changes
2187            caused by addrconf/ndisc.
2188         */
2189
2190         idev = __in6_dev_get(arg->dev);
2191         if (idev == NULL)
2192                 return 0;
2193
2194         /* For administrative MTU increase, there is no way to discover
2195            IPv6 PMTU increase, so PMTU increase should be updated here.
2196            Since RFC 1981 doesn't include administrative MTU increase
2197            update PMTU increase is a MUST. (i.e. jumbo frame)
2198          */
2199         /*
2200            If new MTU is less than route PMTU, this new MTU will be the
2201            lowest MTU in the path, update the route PMTU to reflect PMTU
2202            decreases; if new MTU is greater than route PMTU, and the
2203            old MTU is the lowest MTU in the path, update the route PMTU
2204            to reflect the increase. In this case if the other nodes' MTU
2205            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2206            PMTU discouvery.
2207          */
2208         if (rt->rt6i_dev == arg->dev &&
2209             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2210             (dst_mtu(&rt->dst) >= arg->mtu ||
2211              (dst_mtu(&rt->dst) < arg->mtu &&
2212               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2213                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2214         }
2215         return 0;
2216 }
2217
2218 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2219 {
2220         struct rt6_mtu_change_arg arg = {
2221                 .dev = dev,
2222                 .mtu = mtu,
2223         };
2224
2225         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2226 }
2227
2228 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2229         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2230         [RTA_OIF]               = { .type = NLA_U32 },
2231         [RTA_IIF]               = { .type = NLA_U32 },
2232         [RTA_PRIORITY]          = { .type = NLA_U32 },
2233         [RTA_METRICS]           = { .type = NLA_NESTED },
2234 };
2235
2236 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2237                               struct fib6_config *cfg)
2238 {
2239         struct rtmsg *rtm;
2240         struct nlattr *tb[RTA_MAX+1];
2241         int err;
2242
2243         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2244         if (err < 0)
2245                 goto errout;
2246
2247         err = -EINVAL;
2248         rtm = nlmsg_data(nlh);
2249         memset(cfg, 0, sizeof(*cfg));
2250
2251         cfg->fc_table = rtm->rtm_table;
2252         cfg->fc_dst_len = rtm->rtm_dst_len;
2253         cfg->fc_src_len = rtm->rtm_src_len;
2254         cfg->fc_flags = RTF_UP;
2255         cfg->fc_protocol = rtm->rtm_protocol;
2256
2257         if (rtm->rtm_type == RTN_UNREACHABLE)
2258                 cfg->fc_flags |= RTF_REJECT;
2259
2260         if (rtm->rtm_type == RTN_LOCAL)
2261                 cfg->fc_flags |= RTF_LOCAL;
2262
2263         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2264         cfg->fc_nlinfo.nlh = nlh;
2265         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2266
2267         if (tb[RTA_GATEWAY]) {
2268                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2269                 cfg->fc_flags |= RTF_GATEWAY;
2270         }
2271
2272         if (tb[RTA_DST]) {
2273                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2274
2275                 if (nla_len(tb[RTA_DST]) < plen)
2276                         goto errout;
2277
2278                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2279         }
2280
2281         if (tb[RTA_SRC]) {
2282                 int plen = (rtm->rtm_src_len + 7) >> 3;
2283
2284                 if (nla_len(tb[RTA_SRC]) < plen)
2285                         goto errout;
2286
2287                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2288         }
2289
2290         if (tb[RTA_PREFSRC])
2291                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2292
2293         if (tb[RTA_OIF])
2294                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2295
2296         if (tb[RTA_PRIORITY])
2297                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2298
2299         if (tb[RTA_METRICS]) {
2300                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2301                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2302         }
2303
2304         if (tb[RTA_TABLE])
2305                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2306
2307         err = 0;
2308 errout:
2309         return err;
2310 }
2311
2312 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2313 {
2314         struct fib6_config cfg;
2315         int err;
2316
2317         err = rtm_to_fib6_config(skb, nlh, &cfg);
2318         if (err < 0)
2319                 return err;
2320
2321         return ip6_route_del(&cfg);
2322 }
2323
2324 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2325 {
2326         struct fib6_config cfg;
2327         int err;
2328
2329         err = rtm_to_fib6_config(skb, nlh, &cfg);
2330         if (err < 0)
2331                 return err;
2332
2333         return ip6_route_add(&cfg);
2334 }
2335
2336 static inline size_t rt6_nlmsg_size(void)
2337 {
2338         return NLMSG_ALIGN(sizeof(struct rtmsg))
2339                + nla_total_size(16) /* RTA_SRC */
2340                + nla_total_size(16) /* RTA_DST */
2341                + nla_total_size(16) /* RTA_GATEWAY */
2342                + nla_total_size(16) /* RTA_PREFSRC */
2343                + nla_total_size(4) /* RTA_TABLE */
2344                + nla_total_size(4) /* RTA_IIF */
2345                + nla_total_size(4) /* RTA_OIF */
2346                + nla_total_size(4) /* RTA_PRIORITY */
2347                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2348                + nla_total_size(sizeof(struct rta_cacheinfo));
2349 }
2350
/*
 * rt6_fill_node - append one rtnetlink route message describing @rt to @skb
 * @net:    network namespace (used for ip6mr_get_route() and saddr selection)
 * @skb:    message buffer being filled
 * @rt:     route to describe
 * @dst:    if non-NULL, reported as RTA_DST with rtm_dst_len forced to 128;
 *          otherwise the route's own prefix is reported when its length is
 *          non-zero
 * @src:    same as @dst for RTA_SRC (only with CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index; non-zero adds RTA_IIF (or, for multicast
 *          destinations with CONFIG_IPV6_MROUTE, calls ip6mr_get_route())
 * @type:   netlink message type passed to nlmsg_put()
 * @pid:    netlink pid passed to nlmsg_put()
 * @seq:    netlink sequence number passed to nlmsg_put()
 * @prefix: non-zero restricts output to RTF_PREFIX_RT routes
 * @nowait: passed to ip6mr_get_route(); also selects how its errors are
 *          handled
 * @flags:  netlink message flags passed to nlmsg_put()
 *
 * Returns 1 without touching @skb when @prefix is set and @rt is not a
 * prefix route; -EMSGSIZE when the header cannot be added or when the
 * nla_put_failure path is taken (the partial message is cancelled first);
 * 0 when ip6mr_get_route() returned 0 and @nowait is zero; otherwise the
 * value of nlmsg_end().
 *
 * The NLA_PUT*() macros are defined elsewhere; presumably they jump to
 * nla_put_failure when the attribute does not fit.
 */
2351 static int rt6_fill_node(struct net *net,
2352                          struct sk_buff *skb, struct rt6_info *rt,
2353                          struct in6_addr *dst, struct in6_addr *src,
2354                          int iif, int type, u32 pid, u32 seq,
2355                          int prefix, int nowait, unsigned int flags)
2356 {
2357         struct rtmsg *rtm;
2358         struct nlmsghdr *nlh;
2359         long expires;
2360         u32 table;
2361         struct neighbour *n;
2362
2363         if (prefix) {   /* user wants prefix routes only */
2364                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2365                         /* success since this is not a prefix route */
2366                         return 1;
2367                 }
2368         }
2369
2370         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2371         if (nlh == NULL)
2372                 return -EMSGSIZE;
2373
             /* Fixed rtmsg header. */
2374         rtm = nlmsg_data(nlh);
2375         rtm->rtm_family = AF_INET6;
2376         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2377         rtm->rtm_src_len = rt->rt6i_src.plen;
2378         rtm->rtm_tos = 0;
2379         if (rt->rt6i_table)
2380                 table = rt->rt6i_table->tb6_id;
2381         else
2382                 table = RT6_TABLE_UNSPEC;
             /* The table id goes both into the header and into RTA_TABLE. */
2383         rtm->rtm_table = table;
2384         NLA_PUT_U32(skb, RTA_TABLE, table);
             /* Route type: reject, then local flag, then loopback device. */
2385         if (rt->rt6i_flags&RTF_REJECT)
2386                 rtm->rtm_type = RTN_UNREACHABLE;
2387         else if (rt->rt6i_flags&RTF_LOCAL)
2388                 rtm->rtm_type = RTN_LOCAL;
2389         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2390                 rtm->rtm_type = RTN_LOCAL;
2391         else
2392                 rtm->rtm_type = RTN_UNICAST;
2393         rtm->rtm_flags = 0;
2394         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
             /* The stored protocol is overridden by the first matching flag. */
2395         rtm->rtm_protocol = rt->rt6i_protocol;
2396         if (rt->rt6i_flags&RTF_DYNAMIC)
2397                 rtm->rtm_protocol = RTPROT_REDIRECT;
2398         else if (rt->rt6i_flags & RTF_ADDRCONF)
2399                 rtm->rtm_protocol = RTPROT_KERNEL;
2400         else if (rt->rt6i_flags&RTF_DEFAULT)
2401                 rtm->rtm_protocol = RTPROT_RA;
2402
2403         if (rt->rt6i_flags&RTF_CACHE)
2404                 rtm->rtm_flags |= RTM_F_CLONED;
2405
             /* A caller-supplied destination is always reported as a /128. */
2406         if (dst) {
2407                 NLA_PUT(skb, RTA_DST, 16, dst);
2408                 rtm->rtm_dst_len = 128;
2409         } else if (rtm->rtm_dst_len)
2410                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2411 #ifdef CONFIG_IPV6_SUBTREES
2412         if (src) {
2413                 NLA_PUT(skb, RTA_SRC, 16, src);
2414                 rtm->rtm_src_len = 128;
2415         } else if (rtm->rtm_src_len)
2416                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2417 #endif
2418         if (iif) {
2419 #ifdef CONFIG_IPV6_MROUTE
2420                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2421                         int err = ip6mr_get_route(net, skb, rtm, nowait);
                             /*
                              * err <= 0: without nowait, 0 returns 0 right
                              * away and anything else fails; with nowait
                              * only -EMSGSIZE is treated as a failure.
                              */
2422                         if (err <= 0) {
2423                                 if (!nowait) {
2424                                         if (err == 0)
2425                                                 return 0;
2426                                         goto nla_put_failure;
2427                                 } else {
2428                                         if (err == -EMSGSIZE)
2429                                                 goto nla_put_failure;
2430                                 }
2431                         }
2432                 } else
2433 #endif
2434                         NLA_PUT_U32(skb, RTA_IIF, iif);
2435         } else if (dst) {
                     /* No input interface: report the source address that would be used. */
2436                 struct in6_addr saddr_buf;
2437                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2438                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2439         }
2440
             /*
              * NOTE(review): when dst is set, iif is 0 and the route has a
              * preferred source, ip6_route_get_saddr() above already yields
              * that address, so RTA_PREFSRC is emitted a second time here.
              */
2441         if (rt->rt6i_prefsrc.plen) {
2442                 struct in6_addr saddr_buf;
2443                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2444                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2445         }
2446
2447         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2448                 goto nla_put_failure;
2449
             /* RTA_GATEWAY comes from the neighbour entry's key, read under RCU. */
2450         rcu_read_lock();
2451         n = dst_get_neighbour(&rt->dst);
2452         if (n) {
2453                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2454                         rcu_read_unlock();
2455                         goto nla_put_failure;
2456                 }
2457         }
2458         rcu_read_unlock();
2459
2460         if (rt->dst.dev)
2461                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2462
2463         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2464
             /* Remaining lifetime in jiffies, 0 if not expiring, capped at INT_MAX. */
2465         if (!(rt->rt6i_flags & RTF_EXPIRES))
2466                 expires = 0;
2467         else if (rt->rt6i_expires - jiffies < INT_MAX)
2468                 expires = rt->rt6i_expires - jiffies;
2469         else
2470                 expires = INT_MAX;
2471
2472         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2473                                expires, rt->dst.error) < 0)
2474                 goto nla_put_failure;
2475
2476         return nlmsg_end(skb, nlh);
2477
2478 nla_put_failure:
             /* Remove the partially built message from the skb. */
2479         nlmsg_cancel(skb, nlh);
2480         return -EMSGSIZE;
2481 }
2482
2483 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2484 {
2485         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2486         int prefix;
2487
2488         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2489                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2490                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2491         } else
2492                 prefix = 0;
2493
2494         return rt6_fill_node(arg->net,
2495                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2496                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2497                      prefix, 0, NLM_F_MULTI);
2498 }
2499
2500 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2501 {
2502         struct net *net = sock_net(in_skb->sk);
2503         struct nlattr *tb[RTA_MAX+1];
2504         struct rt6_info *rt;
2505         struct sk_buff *skb;
2506         struct rtmsg *rtm;
2507         struct flowi6 fl6;
2508         int err, iif = 0;
2509
2510         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2511         if (err < 0)
2512                 goto errout;
2513
2514         err = -EINVAL;
2515         memset(&fl6, 0, sizeof(fl6));
2516
2517         if (tb[RTA_SRC]) {
2518                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2519                         goto errout;
2520
2521                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2522         }
2523
2524         if (tb[RTA_DST]) {
2525                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2526                         goto errout;
2527
2528                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2529         }
2530
2531         if (tb[RTA_IIF])
2532                 iif = nla_get_u32(tb[RTA_IIF]);
2533
2534         if (tb[RTA_OIF])
2535                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2536
2537         if (iif) {
2538                 struct net_device *dev;
2539                 dev = __dev_get_by_index(net, iif);
2540                 if (!dev) {
2541                         err = -ENODEV;
2542                         goto errout;
2543                 }
2544         }
2545
2546         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2547         if (skb == NULL) {
2548                 err = -ENOBUFS;
2549                 goto errout;
2550         }
2551
2552         /* Reserve room for dummy headers, this skb can pass
2553            through good chunk of routing engine.
2554          */
2555         skb_reset_mac_header(skb);
2556         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2557
2558         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2559         skb_dst_set(skb, &rt->dst);
2560
2561         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2562                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2563                             nlh->nlmsg_seq, 0, 0, 0);
2564         if (err < 0) {
2565                 kfree_skb(skb);
2566                 goto errout;
2567         }
2568
2569         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2570 errout:
2571         return err;
2572 }
2573
2574 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2575 {
2576         struct sk_buff *skb;
2577         struct net *net = info->nl_net;
2578         u32 seq;
2579         int err;
2580
2581         err = -ENOBUFS;
2582         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2583
2584         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2585         if (skb == NULL)
2586                 goto errout;
2587
2588         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2589                                 event, info->pid, seq, 0, 0, 0);
2590         if (err < 0) {
2591                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2592                 WARN_ON(err == -EMSGSIZE);
2593                 kfree_skb(skb);
2594                 goto errout;
2595         }
2596         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2597                     info->nlh, gfp_any());
2598         return;
2599 errout:
2600         if (err < 0)
2601                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2602 }
2603
2604 static int ip6_route_dev_notify(struct notifier_block *this,
2605                                 unsigned long event, void *data)
2606 {
2607         struct net_device *dev = (struct net_device *)data;
2608         struct net *net = dev_net(dev);
2609
2610         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2611                 net->ipv6.ip6_null_entry->dst.dev = dev;
2612                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2613 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2614                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2615                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2616                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2617                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2618 #endif
2619         }
2620
2621         return NOTIFY_OK;
2622 }
2623
2624 /*
2625  *      /proc
2626  */
2627
2628 #ifdef CONFIG_PROC_FS
2629
/*
 * Bookkeeping for a buffer-based /proc dump.
 * NOTE(review): no code in the visible /proc section references this
 * structure (the seq_file interface is used instead); confirm whether it
 * is still needed before relying on it.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2638
2639 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2640 {
2641         struct seq_file *m = p_arg;
2642         struct neighbour *n;
2643
2644         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2645
2646 #ifdef CONFIG_IPV6_SUBTREES
2647         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2648 #else
2649         seq_puts(m, "00000000000000000000000000000000 00 ");
2650 #endif
2651         rcu_read_lock();
2652         n = dst_get_neighbour(&rt->dst);
2653         if (n) {
2654                 seq_printf(m, "%pi6", n->primary_key);
2655         } else {
2656                 seq_puts(m, "00000000000000000000000000000000");
2657         }
2658         rcu_read_unlock();
2659         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2660                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2661                    rt->dst.__use, rt->rt6i_flags,
2662                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2663         return 0;
2664 }
2665
2666 static int ipv6_route_show(struct seq_file *m, void *v)
2667 {
2668         struct net *net = (struct net *)m->private;
2669         fib6_clean_all(net, rt6_info_route, 0, m);
2670         return 0;
2671 }
2672
2673 static int ipv6_route_open(struct inode *inode, struct file *file)
2674 {
2675         return single_open_net(inode, file, ipv6_route_show);
2676 }
2677
2678 static const struct file_operations ipv6_route_proc_fops = {
2679         .owner          = THIS_MODULE,
2680         .open           = ipv6_route_open,
2681         .read           = seq_read,
2682         .llseek         = seq_lseek,
2683         .release        = single_release_net,
2684 };
2685
2686 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2687 {
2688         struct net *net = (struct net *)seq->private;
2689         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2690                    net->ipv6.rt6_stats->fib_nodes,
2691                    net->ipv6.rt6_stats->fib_route_nodes,
2692                    net->ipv6.rt6_stats->fib_rt_alloc,
2693                    net->ipv6.rt6_stats->fib_rt_entries,
2694                    net->ipv6.rt6_stats->fib_rt_cache,
2695                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2696                    net->ipv6.rt6_stats->fib_discarded_routes);
2697
2698         return 0;
2699 }
2700
2701 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2702 {
2703         return single_open_net(inode, file, rt6_stats_seq_show);
2704 }
2705
2706 static const struct file_operations rt6_stats_seq_fops = {
2707         .owner   = THIS_MODULE,
2708         .open    = rt6_stats_seq_open,
2709         .read    = seq_read,
2710         .llseek  = seq_lseek,
2711         .release = single_release_net,
2712 };
2713 #endif  /* CONFIG_PROC_FS */
2714
2715 #ifdef CONFIG_SYSCTL
2716
2717 static
2718 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2719                               void __user *buffer, size_t *lenp, loff_t *ppos)
2720 {
2721         struct net *net;
2722         int delay;
2723         if (!write)
2724                 return -EINVAL;
2725
2726         net = (struct net *)ctl->extra1;
2727         delay = net->ipv6.sysctl.flush_delay;
2728         proc_dointvec(ctl, write, buffer, lenp, ppos);
2729         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2730         return 0;
2731 }
2732
2733 ctl_table ipv6_route_table_template[] = {
2734         {
2735                 .procname       =       "flush",
2736                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2737                 .maxlen         =       sizeof(int),
2738                 .mode           =       0200,
2739                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2740         },
2741         {
2742                 .procname       =       "gc_thresh",
2743                 .data           =       &ip6_dst_ops_template.gc_thresh,
2744                 .maxlen         =       sizeof(int),
2745                 .mode           =       0644,
2746                 .proc_handler   =       proc_dointvec,
2747         },
2748         {
2749                 .procname       =       "max_size",
2750                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2751                 .maxlen         =       sizeof(int),
2752                 .mode           =       0644,
2753                 .proc_handler   =       proc_dointvec,
2754         },
2755         {
2756                 .procname       =       "gc_min_interval",
2757                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2758                 .maxlen         =       sizeof(int),
2759                 .mode           =       0644,
2760                 .proc_handler   =       proc_dointvec_jiffies,
2761         },
2762         {
2763                 .procname       =       "gc_timeout",
2764                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2765                 .maxlen         =       sizeof(int),
2766                 .mode           =       0644,
2767                 .proc_handler   =       proc_dointvec_jiffies,
2768         },
2769         {
2770                 .procname       =       "gc_interval",
2771                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2772                 .maxlen         =       sizeof(int),
2773                 .mode           =       0644,
2774                 .proc_handler   =       proc_dointvec_jiffies,
2775         },
2776         {
2777                 .procname       =       "gc_elasticity",
2778                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2779                 .maxlen         =       sizeof(int),
2780                 .mode           =       0644,
2781                 .proc_handler   =       proc_dointvec,
2782         },
2783         {
2784                 .procname       =       "mtu_expires",
2785                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2786                 .maxlen         =       sizeof(int),
2787                 .mode           =       0644,
2788                 .proc_handler   =       proc_dointvec_jiffies,
2789         },
2790         {
2791                 .procname       =       "min_adv_mss",
2792                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2793                 .maxlen         =       sizeof(int),
2794                 .mode           =       0644,
2795                 .proc_handler   =       proc_dointvec,
2796         },
2797         {
2798                 .procname       =       "gc_min_interval_ms",
2799                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2800                 .maxlen         =       sizeof(int),
2801                 .mode           =       0644,
2802                 .proc_handler   =       proc_dointvec_ms_jiffies,
2803         },
2804         { }
2805 };
2806
2807 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2808 {
2809         struct ctl_table *table;
2810
2811         table = kmemdup(ipv6_route_table_template,
2812                         sizeof(ipv6_route_table_template),
2813                         GFP_KERNEL);
2814
2815         if (table) {
2816                 table[0].data = &net->ipv6.sysctl.flush_delay;
2817                 table[0].extra1 = net;
2818                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2819                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2820                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2821                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2822                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2823                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2824                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2825                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2826                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2827         }
2828
2829         return table;
2830 }
2831 #endif
2832
2833 static int __net_init ip6_route_net_init(struct net *net)
2834 {
2835         int ret = -ENOMEM;
2836
2837         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2838                sizeof(net->ipv6.ip6_dst_ops));
2839
2840         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2841                 goto out_ip6_dst_ops;
2842
2843         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2844                                            sizeof(*net->ipv6.ip6_null_entry),
2845                                            GFP_KERNEL);
2846         if (!net->ipv6.ip6_null_entry)
2847                 goto out_ip6_dst_entries;
2848         net->ipv6.ip6_null_entry->dst.path =
2849                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2850         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2851         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2852                          ip6_template_metrics, true);
2853
2854 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2855         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2856                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2857                                                GFP_KERNEL);
2858         if (!net->ipv6.ip6_prohibit_entry)
2859                 goto out_ip6_null_entry;
2860         net->ipv6.ip6_prohibit_entry->dst.path =
2861                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2862         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2863         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2864                          ip6_template_metrics, true);
2865
2866         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2867                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2868                                                GFP_KERNEL);
2869         if (!net->ipv6.ip6_blk_hole_entry)
2870                 goto out_ip6_prohibit_entry;
2871         net->ipv6.ip6_blk_hole_entry->dst.path =
2872                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2873         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2874         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2875                          ip6_template_metrics, true);
2876 #endif
2877
2878         net->ipv6.sysctl.flush_delay = 0;
2879         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2880         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2881         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2882         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2883         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2884         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2885         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2886
2887         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2888
2889         ret = 0;
2890 out:
2891         return ret;
2892
2893 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2894 out_ip6_prohibit_entry:
2895         kfree(net->ipv6.ip6_prohibit_entry);
2896 out_ip6_null_entry:
2897         kfree(net->ipv6.ip6_null_entry);
2898 #endif
2899 out_ip6_dst_entries:
2900         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2901 out_ip6_dst_ops:
2902         goto out;
2903 }
2904
2905 static void __net_exit ip6_route_net_exit(struct net *net)
2906 {
2907         kfree(net->ipv6.ip6_null_entry);
2908 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2909         kfree(net->ipv6.ip6_prohibit_entry);
2910         kfree(net->ipv6.ip6_blk_hole_entry);
2911 #endif
2912         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2913 }
2914
2915 static int __net_init ip6_route_net_init_late(struct net *net)
2916 {
2917 #ifdef CONFIG_PROC_FS
2918         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2919         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2920 #endif
2921         return 0;
2922 }
2923
2924 static void __net_exit ip6_route_net_exit_late(struct net *net)
2925 {
2926 #ifdef CONFIG_PROC_FS
2927         proc_net_remove(net, "ipv6_route");
2928         proc_net_remove(net, "rt6_stats");
2929 #endif
2930 }
2931
2932 static struct pernet_operations ip6_route_net_ops = {
2933         .init = ip6_route_net_init,
2934         .exit = ip6_route_net_exit,
2935 };
2936
2937 static struct pernet_operations ip6_route_net_late_ops = {
2938         .init = ip6_route_net_init_late,
2939         .exit = ip6_route_net_exit_late,
2940 };
2941
2942 static struct notifier_block ip6_route_dev_notifier = {
2943         .notifier_call = ip6_route_dev_notify,
2944         .priority = 0,
2945 };
2946
2947 int __init ip6_route_init(void)
2948 {
2949         int ret;
2950
2951         ret = -ENOMEM;
2952         ip6_dst_ops_template.kmem_cachep =
2953                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2954                                   SLAB_HWCACHE_ALIGN, NULL);
2955         if (!ip6_dst_ops_template.kmem_cachep)
2956                 goto out;
2957
2958         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2959         if (ret)
2960                 goto out_kmem_cache;
2961
2962         ret = register_pernet_subsys(&ip6_route_net_ops);
2963         if (ret)
2964                 goto out_dst_entries;
2965
2966         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2967
2968         /* Registering of the loopback is done before this portion of code,
2969          * the loopback reference in rt6_info will not be taken, do it
2970          * manually for init_net */
2971         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2972         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2973   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2974         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2975         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2977         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2978   #endif
2979         ret = fib6_init();
2980         if (ret)
2981                 goto out_register_subsys;
2982
2983         ret = xfrm6_init();
2984         if (ret)
2985                 goto out_fib6_init;
2986
2987         ret = fib6_rules_init();
2988         if (ret)
2989                 goto xfrm6_init;
2990
2991         ret = register_pernet_subsys(&ip6_route_net_late_ops);
2992         if (ret)
2993                 goto fib6_rules_init;
2994
2995         ret = -ENOBUFS;
2996         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2997             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2998             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2999                 goto out_register_late_subsys;
3000
3001         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3002         if (ret)
3003                 goto out_register_late_subsys;
3004
3005 out:
3006         return ret;
3007
3008 out_register_late_subsys:
3009         unregister_pernet_subsys(&ip6_route_net_late_ops);
3010 fib6_rules_init:
3011         fib6_rules_cleanup();
3012 xfrm6_init:
3013         xfrm6_fini();
3014 out_fib6_init:
3015         fib6_gc_cleanup();
3016 out_register_subsys:
3017         unregister_pernet_subsys(&ip6_route_net_ops);
3018 out_dst_entries:
3019         dst_entries_destroy(&ip6_dst_blackhole_ops);
3020 out_kmem_cache:
3021         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3022         goto out;
3023 }
3024
3025 void ip6_route_cleanup(void)
3026 {
3027         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3028         unregister_pernet_subsys(&ip6_route_net_late_ops);
3029         fib6_rules_cleanup();
3030         xfrm6_fini();
3031         fib6_gc_cleanup();
3032         unregister_pernet_subsys(&ip6_route_net_ops);
3033         dst_entries_destroy(&ip6_dst_blackhole_ops);
3034         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3035 }