ab48b02eb56a7c96c5723a658820ba1dec993d38
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() only produce
 * output when RT6_DEBUG is 3 or higher; below that they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x)			printk x
#define RT6_TRACE(x...)		printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...)		do { } while (0)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 255,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route: a reject route without next hop whose
 * error is -EACCES and whose handlers are the ip6_pkt_prohibit pair.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= RTF_REJECT | RTF_NONEXTHOP,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" route: a reject route without next hop whose
 * error is -EINVAL and whose handlers are both dst_discard.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= RTF_REJECT | RTF_NONEXTHOP,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt)
251                 memset(&rt->rt6i_table, 0,
252                        sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (!sprt->rt6i_idev ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the neighbour behind @rt is not in a
 * NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation to the solicited-node
 * multicast address derived from the neighbour's key.
 *
 * @rt: route to probe; NULL is accepted and ignored.
 *
 * Okay, this does not seem to be appropriate for now, however, we need
 * to check if it is really so.  Router Reachability Probe MUST be
 * rate-limited to no more than one per minute.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/*
	 * neigh->updated is both tested and written below, so the lock
	 * must be held exclusively.  With only the reader side held, two
	 * CPUs could pass the interval check at the same time and each
	 * send a probe, defeating the rate limit.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* The solicitation itself is sent without the lock held. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev.
 *
 * @dev:    receiving device
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * A zero lifetime deletes a matching existing route; a non-zero lifetime
 * adds the route if missing, otherwise refreshes its preference bits.
 * The expiry is then set from the lifetime (or cleared when the lifetime
 * is not finite).
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;
	unsigned int pref;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
622
/*
 * Backtrack through the fib6 tree when the current lookup produced the
 * namespace's null entry.
 *
 * Starting at `fn`, repeatedly: jump to `out` if the node is flagged
 * RTN_TL_ROOT; otherwise move to the parent, or - when the parent has a
 * subtree that is not `fn` itself - to the result of looking `saddr` up
 * in that subtree.  As soon as a node flagged RTN_RTINFO is reached,
 * jump to `restart`.
 *
 * The expansion relies on the variables `rt` and `fn` and on the labels
 * `out` and `restart` existing in the invoking function.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		for (;;) {						\
			struct fib6_node *pn;				\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn;				\
			else						\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
640
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642                                              struct fib6_table *table,
643                                              struct flowi6 *fl6, int flags)
644 {
645         struct fib6_node *fn;
646         struct rt6_info *rt;
647
648         read_lock_bh(&table->tb6_lock);
649         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
650 restart:
651         rt = fn->leaf;
652         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653         BACKTRACK(net, &fl6->saddr);
654 out:
655         dst_use(&rt->dst, jiffies);
656         read_unlock_bh(&table->tb6_lock);
657         return rt;
658
659 }
660
661 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
662                                     int flags)
663 {
664         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
665 }
666 EXPORT_SYMBOL_GPL(ip6_route_lookup);
667
668 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
669                             const struct in6_addr *saddr, int oif, int strict)
670 {
671         struct flowi6 fl6 = {
672                 .flowi6_oif = oif,
673                 .daddr = *daddr,
674         };
675         struct dst_entry *dst;
676         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
677
678         if (saddr) {
679                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
680                 flags |= RT6_LOOKUP_F_HAS_SADDR;
681         }
682
683         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
684         if (dst->error == 0)
685                 return (struct rt6_info *) dst;
686
687         dst_release(dst);
688
689         return NULL;
690 }
691
692 EXPORT_SYMBOL(rt6_lookup);
693
694 /* ip6_ins_rt is called with FREE table->tb6_lock.
695    It takes new route entry, the addition fails by any reason the
696    route is freed. In any case, if caller does not hold it, it may
697    be destroyed.
698  */
699
700 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
701 {
702         int err;
703         struct fib6_table *table;
704
705         table = rt->rt6i_table;
706         write_lock_bh(&table->tb6_lock);
707         err = fib6_add(&table->tb6_root, rt, info);
708         write_unlock_bh(&table->tb6_lock);
709
710         return err;
711 }
712
713 int ip6_ins_rt(struct rt6_info *rt)
714 {
715         struct nl_info info = {
716                 .nl_net = dev_net(rt->rt6i_dev),
717         };
718         return __ip6_ins_rt(rt, &info);
719 }
720
721 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
722                                       const struct in6_addr *daddr,
723                                       const struct in6_addr *saddr)
724 {
725         struct rt6_info *rt;
726
727         /*
728          *      Clone the route.
729          */
730
731         rt = ip6_rt_copy(ort, daddr);
732
733         if (rt) {
734                 struct neighbour *neigh;
735                 int attempts = !in_softirq();
736
737                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
738                         if (rt->rt6i_dst.plen != 128 &&
739                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
740                                 rt->rt6i_flags |= RTF_ANYCAST;
741                         rt->rt6i_gateway = *daddr;
742                 }
743
744                 rt->rt6i_flags |= RTF_CACHE;
745
746 #ifdef CONFIG_IPV6_SUBTREES
747                 if (rt->rt6i_src.plen && saddr) {
748                         rt->rt6i_src.addr = *saddr;
749                         rt->rt6i_src.plen = 128;
750                 }
751 #endif
752
753         retry:
754                 neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway,
755                                              rt->rt6i_dev);
756                 if (IS_ERR(neigh)) {
757                         struct net *net = dev_net(rt->rt6i_dev);
758                         int saved_rt_min_interval =
759                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
760                         int saved_rt_elasticity =
761                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
762
763                         if (attempts-- > 0) {
764                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
765                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
766
767                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
768
769                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
770                                         saved_rt_elasticity;
771                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
772                                         saved_rt_min_interval;
773                                 goto retry;
774                         }
775
776                         if (net_ratelimit())
777                                 printk(KERN_WARNING
778                                        "ipv6: Neighbour table overflow.\n");
779                         dst_free(&rt->dst);
780                         return NULL;
781                 }
782                 dst_set_neighbour(&rt->dst, neigh);
783
784         }
785
786         return rt;
787 }
788
789 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
790                                         const struct in6_addr *daddr)
791 {
792         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
793
794         if (rt) {
795                 rt->rt6i_flags |= RTF_CACHE;
796                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
797         }
798         return rt;
799 }
800
801 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
802                                       struct flowi6 *fl6, int flags)
803 {
804         struct fib6_node *fn;
805         struct rt6_info *rt, *nrt;
806         int strict = 0;
807         int attempts = 3;
808         int err;
809         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
810
811         strict |= flags & RT6_LOOKUP_F_IFACE;
812
813 relookup:
814         read_lock_bh(&table->tb6_lock);
815
816 restart_2:
817         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
818
819 restart:
820         rt = rt6_select(fn, oif, strict | reachable);
821
822         BACKTRACK(net, &fl6->saddr);
823         if (rt == net->ipv6.ip6_null_entry ||
824             rt->rt6i_flags & RTF_CACHE)
825                 goto out;
826
827         dst_hold(&rt->dst);
828         read_unlock_bh(&table->tb6_lock);
829
830         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
831                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
832         else if (!(rt->dst.flags & DST_HOST))
833                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
834         else
835                 goto out2;
836
837         dst_release(&rt->dst);
838         rt = nrt ? : net->ipv6.ip6_null_entry;
839
840         dst_hold(&rt->dst);
841         if (nrt) {
842                 err = ip6_ins_rt(nrt);
843                 if (!err)
844                         goto out2;
845         }
846
847         if (--attempts <= 0)
848                 goto out2;
849
850         /*
851          * Race condition! In the gap, when table->tb6_lock was
852          * released someone could insert this route.  Relookup.
853          */
854         dst_release(&rt->dst);
855         goto relookup;
856
857 out:
858         if (reachable) {
859                 reachable = 0;
860                 goto restart_2;
861         }
862         dst_hold(&rt->dst);
863         read_unlock_bh(&table->tb6_lock);
864 out2:
865         rt->dst.lastuse = jiffies;
866         rt->dst.__use++;
867
868         return rt;
869 }
870
871 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
872                                             struct flowi6 *fl6, int flags)
873 {
874         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
875 }
876
877 void ip6_route_input(struct sk_buff *skb)
878 {
879         const struct ipv6hdr *iph = ipv6_hdr(skb);
880         struct net *net = dev_net(skb->dev);
881         int flags = RT6_LOOKUP_F_HAS_SADDR;
882         struct flowi6 fl6 = {
883                 .flowi6_iif = skb->dev->ifindex,
884                 .daddr = iph->daddr,
885                 .saddr = iph->saddr,
886                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
887                 .flowi6_mark = skb->mark,
888                 .flowi6_proto = iph->nexthdr,
889         };
890
891         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
892                 flags |= RT6_LOOKUP_F_IFACE;
893
894         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
895 }
896
897 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
898                                              struct flowi6 *fl6, int flags)
899 {
900         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
901 }
902
903 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
904                                     struct flowi6 *fl6)
905 {
906         int flags = 0;
907
908         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
909                 flags |= RT6_LOOKUP_F_IFACE;
910
911         if (!ipv6_addr_any(&fl6->saddr))
912                 flags |= RT6_LOOKUP_F_HAS_SADDR;
913         else if (sk)
914                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
915
916         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
917 }
918
919 EXPORT_SYMBOL(ip6_route_output);
920
921 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
922 {
923         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
924         struct dst_entry *new = NULL;
925
926         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
927         if (rt) {
928                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
929
930                 new = &rt->dst;
931
932                 new->__use = 1;
933                 new->input = dst_discard;
934                 new->output = dst_discard;
935
936                 if (dst_metrics_read_only(&ort->dst))
937                         new->_metrics = ort->dst._metrics;
938                 else
939                         dst_copy_metrics(new, &ort->dst);
940                 rt->rt6i_idev = ort->rt6i_idev;
941                 if (rt->rt6i_idev)
942                         in6_dev_hold(rt->rt6i_idev);
943                 rt->rt6i_expires = 0;
944
945                 rt->rt6i_gateway = ort->rt6i_gateway;
946                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
947                 rt->rt6i_metric = 0;
948
949                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
950 #ifdef CONFIG_IPV6_SUBTREES
951                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
952 #endif
953
954                 dst_free(new);
955         }
956
957         dst_release(dst_orig);
958         return new ? new : ERR_PTR(-ENOMEM);
959 }
960
961 /*
962  *      Destination cache support functions
963  */
964
965 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
966 {
967         struct rt6_info *rt;
968
969         rt = (struct rt6_info *) dst;
970
971         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
972                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
973                         if (!rt->rt6i_peer)
974                                 rt6_bind_peer(rt, 0);
975                         rt->rt6i_peer_genid = rt6_peer_genid();
976                 }
977                 return dst;
978         }
979         return NULL;
980 }
981
982 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
983 {
984         struct rt6_info *rt = (struct rt6_info *) dst;
985
986         if (rt) {
987                 if (rt->rt6i_flags & RTF_CACHE) {
988                         if (rt6_check_expired(rt)) {
989                                 ip6_del_rt(rt);
990                                 dst = NULL;
991                         }
992                 } else {
993                         dst_release(dst);
994                         dst = NULL;
995                 }
996         }
997         return dst;
998 }
999
1000 static void ip6_link_failure(struct sk_buff *skb)
1001 {
1002         struct rt6_info *rt;
1003
1004         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1005
1006         rt = (struct rt6_info *) skb_dst(skb);
1007         if (rt) {
1008                 if (rt->rt6i_flags & RTF_CACHE) {
1009                         dst_set_expires(&rt->dst, 0);
1010                         rt->rt6i_flags |= RTF_EXPIRES;
1011                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1012                         rt->rt6i_node->fn_sernum = -1;
1013         }
1014 }
1015
1016 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1017 {
1018         struct rt6_info *rt6 = (struct rt6_info*)dst;
1019
1020         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1021                 rt6->rt6i_flags |= RTF_MODIFIED;
1022                 if (mtu < IPV6_MIN_MTU) {
1023                         u32 features = dst_metric(dst, RTAX_FEATURES);
1024                         mtu = IPV6_MIN_MTU;
1025                         features |= RTAX_FEATURE_ALLFRAG;
1026                         dst_metric_set(dst, RTAX_FEATURES, features);
1027                 }
1028                 dst_metric_set(dst, RTAX_MTU, mtu);
1029         }
1030 }
1031
1032 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1033 {
1034         struct net_device *dev = dst->dev;
1035         unsigned int mtu = dst_mtu(dst);
1036         struct net *net = dev_net(dev);
1037
1038         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1039
1040         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1041                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1042
1043         /*
1044          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1045          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1046          * IPV6_MAXPLEN is also valid and means: "any MSS,
1047          * rely only on pmtu discovery"
1048          */
1049         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1050                 mtu = IPV6_MAXPLEN;
1051         return mtu;
1052 }
1053
1054 static unsigned int ip6_mtu(const struct dst_entry *dst)
1055 {
1056         struct inet6_dev *idev;
1057         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1058
1059         if (mtu)
1060                 return mtu;
1061
1062         mtu = IPV6_MIN_MTU;
1063
1064         rcu_read_lock();
1065         idev = __in6_dev_get(dst->dev);
1066         if (idev)
1067                 mtu = idev->cnf.mtu6;
1068         rcu_read_unlock();
1069
1070         return mtu;
1071 }
1072
/*
 * Head of a singly linked list (chained through dst.next) of the dst
 * entries created by icmp6_dst_alloc().  Entries are pushed there and
 * removed by icmp6_dst_gc() / icmp6_clean_all(); every access in this
 * file is made with icmp6_dst_lock held.
 */
1073 static struct dst_entry *icmp6_dst_gc_list;
1074 static DEFINE_SPINLOCK(icmp6_dst_lock);
1075
1076 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1077                                   struct neighbour *neigh,
1078                                   const struct in6_addr *addr)
1079 {
1080         struct rt6_info *rt;
1081         struct inet6_dev *idev = in6_dev_get(dev);
1082         struct net *net = dev_net(dev);
1083
1084         if (unlikely(!idev))
1085                 return NULL;
1086
1087         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1088         if (unlikely(!rt)) {
1089                 in6_dev_put(idev);
1090                 goto out;
1091         }
1092
1093         if (neigh)
1094                 neigh_hold(neigh);
1095         else {
1096                 neigh = __neigh_lookup_errno(&nd_tbl, addr, dev);
1097                 if (IS_ERR(neigh))
1098                         neigh = NULL;
1099         }
1100
1101         rt->dst.flags |= DST_HOST;
1102         rt->dst.output  = ip6_output;
1103         dst_set_neighbour(&rt->dst, neigh);
1104         atomic_set(&rt->dst.__refcnt, 1);
1105         rt->rt6i_dst.addr = *addr;
1106         rt->rt6i_dst.plen = 128;
1107         rt->rt6i_idev     = idev;
1108         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1109
1110         spin_lock_bh(&icmp6_dst_lock);
1111         rt->dst.next = icmp6_dst_gc_list;
1112         icmp6_dst_gc_list = &rt->dst;
1113         spin_unlock_bh(&icmp6_dst_lock);
1114
1115         fib6_force_start_gc(net);
1116
1117 out:
1118         return &rt->dst;
1119 }
1120
1121 int icmp6_dst_gc(void)
1122 {
1123         struct dst_entry *dst, **pprev;
1124         int more = 0;
1125
1126         spin_lock_bh(&icmp6_dst_lock);
1127         pprev = &icmp6_dst_gc_list;
1128
1129         while ((dst = *pprev) != NULL) {
1130                 if (!atomic_read(&dst->__refcnt)) {
1131                         *pprev = dst->next;
1132                         dst_free(dst);
1133                 } else {
1134                         pprev = &dst->next;
1135                         ++more;
1136                 }
1137         }
1138
1139         spin_unlock_bh(&icmp6_dst_lock);
1140
1141         return more;
1142 }
1143
1144 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1145                             void *arg)
1146 {
1147         struct dst_entry *dst, **pprev;
1148
1149         spin_lock_bh(&icmp6_dst_lock);
1150         pprev = &icmp6_dst_gc_list;
1151         while ((dst = *pprev) != NULL) {
1152                 struct rt6_info *rt = (struct rt6_info *) dst;
1153                 if (func(rt, arg)) {
1154                         *pprev = dst->next;
1155                         dst_free(dst);
1156                 } else {
1157                         pprev = &dst->next;
1158                 }
1159         }
1160         spin_unlock_bh(&icmp6_dst_lock);
1161 }
1162
1163 static int ip6_dst_gc(struct dst_ops *ops)
1164 {
1165         unsigned long now = jiffies;
1166         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1167         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1168         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1169         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1170         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1171         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1172         int entries;
1173
1174         entries = dst_entries_get_fast(ops);
1175         if (time_after(rt_last_gc + rt_min_interval, now) &&
1176             entries <= rt_max_size)
1177                 goto out;
1178
1179         net->ipv6.ip6_rt_gc_expire++;
1180         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1181         net->ipv6.ip6_rt_last_gc = now;
1182         entries = dst_entries_get_slow(ops);
1183         if (entries < ops->gc_thresh)
1184                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1185 out:
1186         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1187         return entries > rt_max_size;
1188 }
1189
1190 /* Clean host part of a prefix. Not necessary in radix tree,
1191    but results in cleaner routing tables.
1192
1193    Remove it only when all the things will work!
1194  */
1195
1196 int ip6_dst_hoplimit(struct dst_entry *dst)
1197 {
1198         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1199         if (hoplimit == 0) {
1200                 struct net_device *dev = dst->dev;
1201                 struct inet6_dev *idev;
1202
1203                 rcu_read_lock();
1204                 idev = __in6_dev_get(dev);
1205                 if (idev)
1206                         hoplimit = idev->cnf.hop_limit;
1207                 else
1208                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1209                 rcu_read_unlock();
1210         }
1211         return hoplimit;
1212 }
1213 EXPORT_SYMBOL(ip6_dst_hoplimit);
1214
1215 /*
1216  *
1217  */
1218
/*
 * ip6_route_add - build a route from @cfg and insert it into its table.
 *
 * Validates the prefix lengths, resolves the output device (from
 * fc_ifindex, from a lookup of the gateway, or the loopback device for
 * reject routes), fills in a freshly allocated rt6_info and passes it
 * to __ip6_ins_rt().
 *
 * @cfg may be modified: a zero fc_metric becomes IP6_RT_PRIO_USER, an
 * RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT, and fc_nlinfo.nl_net
 * is overwritten with the namespace of the chosen device.
 *
 * Returns the result of __ip6_ins_rt() on the insertion path.  On any
 * earlier failure the device, inet6_dev and route acquired here are
 * released and a negative errno is returned (-EINVAL, -ENODEV,
 * -ENOBUFS, -ENOMEM, -EHOSTUNREACH or the __neigh_lookup_errno() error).
 */
1219 int ip6_route_add(struct fib6_config *cfg)
1220 {
1221         int err;
1222         struct net *net = cfg->fc_nlinfo.nl_net;
1223         struct rt6_info *rt = NULL;
1224         struct net_device *dev = NULL;
1225         struct inet6_dev *idev = NULL;
1226         struct fib6_table *table;
1227         int addr_type;
1228
1229         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1230                 return -EINVAL;
             /* Source-prefix routes are refused when subtrees are not built in. */
1231 #ifndef CONFIG_IPV6_SUBTREES
1232         if (cfg->fc_src_len)
1233                 return -EINVAL;
1234 #endif
             /*
              * An explicit interface: take references on the device and its
              * inet6_dev.  Both are dropped at "out" if anything fails later.
              */
1235         if (cfg->fc_ifindex) {
1236                 err = -ENODEV;
1237                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1238                 if (!dev)
1239                         goto out;
1240                 idev = in6_dev_get(dev);
1241                 if (!idev)
1242                         goto out;
1243         }
1244
1245         if (cfg->fc_metric == 0)
1246                 cfg->fc_metric = IP6_RT_PRIO_USER;
1247
             /*
              * Pick the table.  A netlink request without NLM_F_CREATE uses an
              * existing table when there is one; if there is none a warning is
              * logged and the table is created anyway.  -ENOBUFS is reported
              * when no table can be obtained.
              */
1248         err = -ENOBUFS;
1249         if (cfg->fc_nlinfo.nlh &&
1250             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1251                 table = fib6_get_table(net, cfg->fc_table);
1252                 if (!table) {
1253                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1254                         table = fib6_new_table(net, cfg->fc_table);
1255                 }
1256         } else {
1257                 table = fib6_new_table(net, cfg->fc_table);
1258         }
1259
1260         if (!table)
1261                 goto out;
1262
1263         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1264
1265         if (!rt) {
1266                 err = -ENOMEM;
1267                 goto out;
1268         }
1269
1270         rt->dst.obsolete = -1;
             /* fc_expires is in clock_t units and is relative to now. */
1271         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1272                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1273                                 0;
1274
1275         if (cfg->fc_protocol == RTPROT_UNSPEC)
1276                 cfg->fc_protocol = RTPROT_BOOT;
1277         rt->rt6i_protocol = cfg->fc_protocol;
1278
1279         addr_type = ipv6_addr_type(&cfg->fc_dst);
1280
             /* Input handler: multicast, local delivery, or forwarding. */
1281         if (addr_type & IPV6_ADDR_MULTICAST)
1282                 rt->dst.input = ip6_mc_input;
1283         else if (cfg->fc_flags & RTF_LOCAL)
1284                 rt->dst.input = ip6_input;
1285         else
1286                 rt->dst.input = ip6_forward;
1287
1288         rt->dst.output = ip6_output;
1289
1290         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1291         rt->rt6i_dst.plen = cfg->fc_dst_len;
1292         if (rt->rt6i_dst.plen == 128)
1293                rt->dst.flags |= DST_HOST;
1294
             /* Non-host routes that carry metrics get their own zeroed array. */
1295         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1296                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1297                 if (!metrics) {
1298                         err = -ENOMEM;
1299                         goto out;
1300                 }
1301                 dst_init_metrics(&rt->dst, metrics, 0);
1302         }
1303 #ifdef CONFIG_IPV6_SUBTREES
1304         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1305         rt->rt6i_src.plen = cfg->fc_src_len;
1306 #endif
1307
1308         rt->rt6i_metric = cfg->fc_metric;
1309
1310         /* We cannot add true routes via loopback here,
1311            they would result in kernel looping; promote them to reject routes
1312          */
1313         if ((cfg->fc_flags & RTF_REJECT) ||
1314             (dev && (dev->flags & IFF_LOOPBACK) &&
1315              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1316              !(cfg->fc_flags & RTF_LOCAL))) {
1317                 /* hold loopback dev/idev if we haven't done so. */
1318                 if (dev != net->loopback_dev) {
1319                         if (dev) {
1320                                 dev_put(dev);
1321                                 in6_dev_put(idev);
1322                         }
1323                         dev = net->loopback_dev;
1324                         dev_hold(dev);
1325                         idev = in6_dev_get(dev);
1326                         if (!idev) {
1327                                 err = -ENODEV;
1328                                 goto out;
1329                         }
1330                 }
1331                 rt->dst.output = ip6_pkt_discard_out;
1332                 rt->dst.input = ip6_pkt_discard;
1333                 rt->dst.error = -ENETUNREACH;
1334                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1335                 goto install_route;
1336         }
1337
1338         if (cfg->fc_flags & RTF_GATEWAY) {
1339                 const struct in6_addr *gw_addr;
1340                 int gwa_type;
1341
1342                 gw_addr = &cfg->fc_gateway;
1343                 rt->rt6i_gateway = *gw_addr;
1344                 gwa_type = ipv6_addr_type(gw_addr);
1345
1346                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1347                         struct rt6_info *grt;
1348
1349                         /* IPv6 strictly inhibits using not link-local
1350                            addresses as nexthop address.
1351                            Otherwise, router will not able to send redirects.
1352                            It is very good, but in some (rare!) circumstances
1353                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1354                            some exceptions. --ANK
1355                          */
1356                         err = -EINVAL;
1357                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1358                                 goto out;
1359
1360                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1361
1362                         err = -EHOSTUNREACH;
1363                         if (!grt)
1364                                 goto out;
                             /*
                              * The route to the gateway must use the requested
                              * device; with no device requested, adopt (and take
                              * references on) the one that route uses.
                              */
1365                         if (dev) {
1366                                 if (dev != grt->rt6i_dev) {
1367                                         dst_release(&grt->dst);
1368                                         goto out;
1369                                 }
1370                         } else {
1371                                 dev = grt->rt6i_dev;
1372                                 idev = grt->rt6i_idev;
1373                                 dev_hold(dev);
1374                                 in6_dev_hold(grt->rt6i_idev);
1375                         }
                             /*
                              * err stays -EHOSTUNREACH unless the route to the
                              * gateway is itself not a gateway route.
                              */
1376                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1377                                 err = 0;
1378                         dst_release(&grt->dst);
1379
1380                         if (err)
1381                                 goto out;
1382                 }
1383                 err = -EINVAL;
1384                 if (!dev || (dev->flags & IFF_LOOPBACK))
1385                         goto out;
1386         }
1387
1388         err = -ENODEV;
1389         if (!dev)
1390                 goto out;
1391
             /* A preferred source address must pass ipv6_chk_addr() on this device. */
1392         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1393                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1394                         err = -EINVAL;
1395                         goto out;
1396                 }
1397                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1398                 rt->rt6i_prefsrc.plen = 128;
1399         } else
1400                 rt->rt6i_prefsrc.plen = 0;
1401
             /* Gateway and NONEXTHOP routes get a neighbour entry bound now. */
1402         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1403                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1404                 if (IS_ERR(n)) {
1405                         err = PTR_ERR(n);
1406                         goto out;
1407                 }
1408                 dst_set_neighbour(&rt->dst, n);
1409         }
1410
1411         rt->rt6i_flags = cfg->fc_flags;
1412
1413 install_route:
             /*
              * Apply user-supplied metrics: attribute type 0 is skipped and a
              * type above RTAX_MAX fails the whole request with -EINVAL.
              */
1414         if (cfg->fc_mx) {
1415                 struct nlattr *nla;
1416                 int remaining;
1417
1418                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1419                         int type = nla_type(nla);
1420
1421                         if (type) {
1422                                 if (type > RTAX_MAX) {
1423                                         err = -EINVAL;
1424                                         goto out;
1425                                 }
1426
1427                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1428                         }
1429                 }
1430         }
1431
             /*
              * The dev/idev references taken above are stored in the route and
              * are not dropped on this path.
              */
1432         rt->dst.dev = dev;
1433         rt->rt6i_idev = idev;
1434         rt->rt6i_table = table;
1435
1436         cfg->fc_nlinfo.nl_net = dev_net(dev);
1437
1438         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1439
             /* Error exit: release whatever has been acquired so far. */
1440 out:
1441         if (dev)
1442                 dev_put(dev);
1443         if (idev)
1444                 in6_dev_put(idev);
1445         if (rt)
1446                 dst_free(&rt->dst);
1447         return err;
1448 }
1449
1450 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1451 {
1452         int err;
1453         struct fib6_table *table;
1454         struct net *net = dev_net(rt->rt6i_dev);
1455
1456         if (rt == net->ipv6.ip6_null_entry)
1457                 return -ENOENT;
1458
1459         table = rt->rt6i_table;
1460         write_lock_bh(&table->tb6_lock);
1461
1462         err = fib6_del(rt, info);
1463         dst_release(&rt->dst);
1464
1465         write_unlock_bh(&table->tb6_lock);
1466
1467         return err;
1468 }
1469
1470 int ip6_del_rt(struct rt6_info *rt)
1471 {
1472         struct nl_info info = {
1473                 .nl_net = dev_net(rt->rt6i_dev),
1474         };
1475         return __ip6_del_rt(rt, &info);
1476 }
1477
1478 static int ip6_route_del(struct fib6_config *cfg)
1479 {
1480         struct fib6_table *table;
1481         struct fib6_node *fn;
1482         struct rt6_info *rt;
1483         int err = -ESRCH;
1484
1485         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1486         if (!table)
1487                 return err;
1488
1489         read_lock_bh(&table->tb6_lock);
1490
1491         fn = fib6_locate(&table->tb6_root,
1492                          &cfg->fc_dst, cfg->fc_dst_len,
1493                          &cfg->fc_src, cfg->fc_src_len);
1494
1495         if (fn) {
1496                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1497                         if (cfg->fc_ifindex &&
1498                             (!rt->rt6i_dev ||
1499                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1500                                 continue;
1501                         if (cfg->fc_flags & RTF_GATEWAY &&
1502                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1503                                 continue;
1504                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1505                                 continue;
1506                         dst_hold(&rt->dst);
1507                         read_unlock_bh(&table->tb6_lock);
1508
1509                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1510                 }
1511         }
1512         read_unlock_bh(&table->tb6_lock);
1513
1514         return err;
1515 }
1516
1517 /*
1518  *      Handle redirects
1519  */
/*
 * Flow key used for redirect validation: an ordinary flowi6 followed
 * by a gateway address to compare routes against.  fl6 has to stay the
 * first member because __ip6_route_redirect() receives a pointer to it
 * and casts that pointer back to struct ip6rd_flowi.
 */
1520 struct ip6rd_flowi {
1521         struct flowi6 fl6;
1522         struct in6_addr gateway;
1523 };
1524
1525 static struct rt6_info *__ip6_route_redirect(struct net *net,
1526                                              struct fib6_table *table,
1527                                              struct flowi6 *fl6,
1528                                              int flags)
1529 {
1530         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1531         struct rt6_info *rt;
1532         struct fib6_node *fn;
1533
1534         /*
1535          * Get the "current" route for this destination and
1536          * check if the redirect has come from approriate router.
1537          *
1538          * RFC 2461 specifies that redirects should only be
1539          * accepted if they come from the nexthop to the target.
1540          * Due to the way the routes are chosen, this notion
1541          * is a bit fuzzy and one might need to check all possible
1542          * routes.
1543          */
1544
1545         read_lock_bh(&table->tb6_lock);
1546         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1547 restart:
1548         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1549                 /*
1550                  * Current route is on-link; redirect is always invalid.
1551                  *
1552                  * Seems, previous statement is not true. It could
1553                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1554                  * But then router serving it might decide, that we should
1555                  * know truth 8)8) --ANK (980726).
1556                  */
1557                 if (rt6_check_expired(rt))
1558                         continue;
1559                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1560                         continue;
1561                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1562                         continue;
1563                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1564                         continue;
1565                 break;
1566         }
1567
1568         if (!rt)
1569                 rt = net->ipv6.ip6_null_entry;
1570         BACKTRACK(net, &fl6->saddr);
1571 out:
1572         dst_hold(&rt->dst);
1573
1574         read_unlock_bh(&table->tb6_lock);
1575
1576         return rt;
1577 };
1578
1579 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1580                                            const struct in6_addr *src,
1581                                            const struct in6_addr *gateway,
1582                                            struct net_device *dev)
1583 {
1584         int flags = RT6_LOOKUP_F_HAS_SADDR;
1585         struct net *net = dev_net(dev);
1586         struct ip6rd_flowi rdfl = {
1587                 .fl6 = {
1588                         .flowi6_oif = dev->ifindex,
1589                         .daddr = *dest,
1590                         .saddr = *src,
1591                 },
1592         };
1593
1594         rdfl.gateway = *gateway;
1595
1596         if (rt6_need_strict(dest))
1597                 flags |= RT6_LOOKUP_F_IFACE;
1598
1599         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1600                                                    flags, __ip6_route_redirect);
1601 }
1602
1603 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1604                   const struct in6_addr *saddr,
1605                   struct neighbour *neigh, u8 *lladdr, int on_link)
1606 {
1607         struct rt6_info *rt, *nrt = NULL;
1608         struct netevent_redirect netevent;
1609         struct net *net = dev_net(neigh->dev);
1610
1611         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1612
1613         if (rt == net->ipv6.ip6_null_entry) {
1614                 if (net_ratelimit())
1615                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1616                                "for redirect target\n");
1617                 goto out;
1618         }
1619
1620         /*
1621          *      We have finally decided to accept it.
1622          */
1623
1624         neigh_update(neigh, lladdr, NUD_STALE,
1625                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1626                      NEIGH_UPDATE_F_OVERRIDE|
1627                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1628                                      NEIGH_UPDATE_F_ISROUTER))
1629                      );
1630
1631         /*
1632          * Redirect received -> path was valid.
1633          * Look, redirects are sent only in response to data packets,
1634          * so that this nexthop apparently is reachable. --ANK
1635          */
1636         dst_confirm(&rt->dst);
1637
1638         /* Duplicate redirect: silently ignore. */
1639         if (neigh == dst_get_neighbour_raw(&rt->dst))
1640                 goto out;
1641
1642         nrt = ip6_rt_copy(rt, dest);
1643         if (!nrt)
1644                 goto out;
1645
1646         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1647         if (on_link)
1648                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1649
1650         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1651         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1652
1653         if (ip6_ins_rt(nrt))
1654                 goto out;
1655
1656         netevent.old = &rt->dst;
1657         netevent.new = &nrt->dst;
1658         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1659
1660         if (rt->rt6i_flags & RTF_CACHE) {
1661                 ip6_del_rt(rt);
1662                 return;
1663         }
1664
1665 out:
1666         dst_release(&rt->dst);
1667 }
1668
1669 /*
1670  *      Handle ICMP "packet too big" messages
1671  *      i.e. Path MTU discovery
1672  */
1673
1674 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1675                              struct net *net, u32 pmtu, int ifindex)
1676 {
1677         struct rt6_info *rt, *nrt;
1678         int allfrag = 0;
1679 again:
1680         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1681         if (!rt)
1682                 return;
1683
1684         if (rt6_check_expired(rt)) {
1685                 ip6_del_rt(rt);
1686                 goto again;
1687         }
1688
1689         if (pmtu >= dst_mtu(&rt->dst))
1690                 goto out;
1691
1692         if (pmtu < IPV6_MIN_MTU) {
1693                 /*
1694                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1695                  * MTU (1280) and a fragment header should always be included
1696                  * after a node receiving Too Big message reporting PMTU is
1697                  * less than the IPv6 Minimum Link MTU.
1698                  */
1699                 pmtu = IPV6_MIN_MTU;
1700                 allfrag = 1;
1701         }
1702
1703         /* New mtu received -> path was valid.
1704            They are sent only in response to data packets,
1705            so that this nexthop apparently is reachable. --ANK
1706          */
1707         dst_confirm(&rt->dst);
1708
1709         /* Host route. If it is static, it would be better
1710            not to override it, but add new one, so that
1711            when cache entry will expire old pmtu
1712            would return automatically.
1713          */
1714         if (rt->rt6i_flags & RTF_CACHE) {
1715                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1716                 if (allfrag) {
1717                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1718                         features |= RTAX_FEATURE_ALLFRAG;
1719                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1720                 }
1721                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1722                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1723                 goto out;
1724         }
1725
1726         /* Network route.
1727            Two cases are possible:
1728            1. It is connected route. Action: COW
1729            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1730          */
1731         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1732                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1733         else
1734                 nrt = rt6_alloc_clone(rt, daddr);
1735
1736         if (nrt) {
1737                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1738                 if (allfrag) {
1739                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1740                         features |= RTAX_FEATURE_ALLFRAG;
1741                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1742                 }
1743
1744                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1745                  * happened within 5 mins, the recommended timer is 10 mins.
1746                  * Here this route expiration time is set to ip6_rt_mtu_expires
1747                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1748                  * and detecting PMTU increase will be automatically happened.
1749                  */
1750                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1751                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1752
1753                 ip6_ins_rt(nrt);
1754         }
1755 out:
1756         dst_release(&rt->dst);
1757 }
1758
1759 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1760                         struct net_device *dev, u32 pmtu)
1761 {
1762         struct net *net = dev_net(dev);
1763
1764         /*
1765          * RFC 1981 states that a node "MUST reduce the size of the packets it
1766          * is sending along the path" that caused the Packet Too Big message.
1767          * Since it's not possible in the general case to determine which
1768          * interface was used to send the original packet, we update the MTU
1769          * on the interface that will be used to send future packets. We also
1770          * update the MTU on the interface that received the Packet Too Big in
1771          * case the original packet was forced out that interface with
1772          * SO_BINDTODEVICE or similar. This is the next best thing to the
1773          * correct behaviour, which would be to update the MTU on all
1774          * interfaces.
1775          */
1776         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1777         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1778 }
1779
1780 /*
1781  *      Misc support functions
1782  */
1783
1784 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1785                                     const struct in6_addr *dest)
1786 {
1787         struct net *net = dev_net(ort->rt6i_dev);
1788         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1789                                             ort->dst.dev, 0);
1790
1791         if (rt) {
1792                 rt->dst.input = ort->dst.input;
1793                 rt->dst.output = ort->dst.output;
1794                 rt->dst.flags |= DST_HOST;
1795
1796                 rt->rt6i_dst.addr = *dest;
1797                 rt->rt6i_dst.plen = 128;
1798                 dst_copy_metrics(&rt->dst, &ort->dst);
1799                 rt->dst.error = ort->dst.error;
1800                 rt->rt6i_idev = ort->rt6i_idev;
1801                 if (rt->rt6i_idev)
1802                         in6_dev_hold(rt->rt6i_idev);
1803                 rt->dst.lastuse = jiffies;
1804                 rt->rt6i_expires = 0;
1805
1806                 rt->rt6i_gateway = ort->rt6i_gateway;
1807                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1808                 rt->rt6i_metric = 0;
1809
1810 #ifdef CONFIG_IPV6_SUBTREES
1811                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1812 #endif
1813                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1814                 rt->rt6i_table = ort->rt6i_table;
1815         }
1816         return rt;
1817 }
1818
1819 #ifdef CONFIG_IPV6_ROUTE_INFO
1820 static struct rt6_info *rt6_get_route_info(struct net *net,
1821                                            const struct in6_addr *prefix, int prefixlen,
1822                                            const struct in6_addr *gwaddr, int ifindex)
1823 {
1824         struct fib6_node *fn;
1825         struct rt6_info *rt = NULL;
1826         struct fib6_table *table;
1827
1828         table = fib6_get_table(net, RT6_TABLE_INFO);
1829         if (!table)
1830                 return NULL;
1831
1832         write_lock_bh(&table->tb6_lock);
1833         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1834         if (!fn)
1835                 goto out;
1836
1837         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1838                 if (rt->rt6i_dev->ifindex != ifindex)
1839                         continue;
1840                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1841                         continue;
1842                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1843                         continue;
1844                 dst_hold(&rt->dst);
1845                 break;
1846         }
1847 out:
1848         write_unlock_bh(&table->tb6_lock);
1849         return rt;
1850 }
1851
1852 static struct rt6_info *rt6_add_route_info(struct net *net,
1853                                            const struct in6_addr *prefix, int prefixlen,
1854                                            const struct in6_addr *gwaddr, int ifindex,
1855                                            unsigned pref)
1856 {
1857         struct fib6_config cfg = {
1858                 .fc_table       = RT6_TABLE_INFO,
1859                 .fc_metric      = IP6_RT_PRIO_USER,
1860                 .fc_ifindex     = ifindex,
1861                 .fc_dst_len     = prefixlen,
1862                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1863                                   RTF_UP | RTF_PREF(pref),
1864                 .fc_nlinfo.pid = 0,
1865                 .fc_nlinfo.nlh = NULL,
1866                 .fc_nlinfo.nl_net = net,
1867         };
1868
1869         cfg.fc_dst = *prefix;
1870         cfg.fc_gateway = *gwaddr;
1871
1872         /* We should treat it as a default route if prefix length is 0. */
1873         if (!prefixlen)
1874                 cfg.fc_flags |= RTF_DEFAULT;
1875
1876         ip6_route_add(&cfg);
1877
1878         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1879 }
1880 #endif
1881
1882 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1883 {
1884         struct rt6_info *rt;
1885         struct fib6_table *table;
1886
1887         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1888         if (!table)
1889                 return NULL;
1890
1891         write_lock_bh(&table->tb6_lock);
1892         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1893                 if (dev == rt->rt6i_dev &&
1894                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1895                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1896                         break;
1897         }
1898         if (rt)
1899                 dst_hold(&rt->dst);
1900         write_unlock_bh(&table->tb6_lock);
1901         return rt;
1902 }
1903
1904 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1905                                      struct net_device *dev,
1906                                      unsigned int pref)
1907 {
1908         struct fib6_config cfg = {
1909                 .fc_table       = RT6_TABLE_DFLT,
1910                 .fc_metric      = IP6_RT_PRIO_USER,
1911                 .fc_ifindex     = dev->ifindex,
1912                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1913                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1914                 .fc_nlinfo.pid = 0,
1915                 .fc_nlinfo.nlh = NULL,
1916                 .fc_nlinfo.nl_net = dev_net(dev),
1917         };
1918
1919         cfg.fc_gateway = *gwaddr;
1920
1921         ip6_route_add(&cfg);
1922
1923         return rt6_get_dflt_router(gwaddr, dev);
1924 }
1925
1926 void rt6_purge_dflt_routers(struct net *net)
1927 {
1928         struct rt6_info *rt;
1929         struct fib6_table *table;
1930
1931         /* NOTE: Keep consistent with rt6_get_dflt_router */
1932         table = fib6_get_table(net, RT6_TABLE_DFLT);
1933         if (!table)
1934                 return;
1935
1936 restart:
1937         read_lock_bh(&table->tb6_lock);
1938         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1939                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1940                         dst_hold(&rt->dst);
1941                         read_unlock_bh(&table->tb6_lock);
1942                         ip6_del_rt(rt);
1943                         goto restart;
1944                 }
1945         }
1946         read_unlock_bh(&table->tb6_lock);
1947 }
1948
1949 static void rtmsg_to_fib6_config(struct net *net,
1950                                  struct in6_rtmsg *rtmsg,
1951                                  struct fib6_config *cfg)
1952 {
1953         memset(cfg, 0, sizeof(*cfg));
1954
1955         cfg->fc_table = RT6_TABLE_MAIN;
1956         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1957         cfg->fc_metric = rtmsg->rtmsg_metric;
1958         cfg->fc_expires = rtmsg->rtmsg_info;
1959         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1960         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1961         cfg->fc_flags = rtmsg->rtmsg_flags;
1962
1963         cfg->fc_nlinfo.nl_net = net;
1964
1965         cfg->fc_dst = rtmsg->rtmsg_dst;
1966         cfg->fc_src = rtmsg->rtmsg_src;
1967         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1968 }
1969
1970 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1971 {
1972         struct fib6_config cfg;
1973         struct in6_rtmsg rtmsg;
1974         int err;
1975
1976         switch(cmd) {
1977         case SIOCADDRT:         /* Add a route */
1978         case SIOCDELRT:         /* Delete a route */
1979                 if (!capable(CAP_NET_ADMIN))
1980                         return -EPERM;
1981                 err = copy_from_user(&rtmsg, arg,
1982                                      sizeof(struct in6_rtmsg));
1983                 if (err)
1984                         return -EFAULT;
1985
1986                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1987
1988                 rtnl_lock();
1989                 switch (cmd) {
1990                 case SIOCADDRT:
1991                         err = ip6_route_add(&cfg);
1992                         break;
1993                 case SIOCDELRT:
1994                         err = ip6_route_del(&cfg);
1995                         break;
1996                 default:
1997                         err = -EINVAL;
1998                 }
1999                 rtnl_unlock();
2000
2001                 return err;
2002         }
2003
2004         return -EINVAL;
2005 }
2006
2007 /*
2008  *      Drop the packet on the floor
2009  */
2010
2011 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2012 {
2013         int type;
2014         struct dst_entry *dst = skb_dst(skb);
2015         switch (ipstats_mib_noroutes) {
2016         case IPSTATS_MIB_INNOROUTES:
2017                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2018                 if (type == IPV6_ADDR_ANY) {
2019                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2020                                       IPSTATS_MIB_INADDRERRORS);
2021                         break;
2022                 }
2023                 /* FALLTHROUGH */
2024         case IPSTATS_MIB_OUTNOROUTES:
2025                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2026                               ipstats_mib_noroutes);
2027                 break;
2028         }
2029         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2030         kfree_skb(skb);
2031         return 0;
2032 }
2033
2034 static int ip6_pkt_discard(struct sk_buff *skb)
2035 {
2036         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2037 }
2038
2039 static int ip6_pkt_discard_out(struct sk_buff *skb)
2040 {
2041         skb->dev = skb_dst(skb)->dev;
2042         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2043 }
2044
2045 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2046
2047 static int ip6_pkt_prohibit(struct sk_buff *skb)
2048 {
2049         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2050 }
2051
2052 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2053 {
2054         skb->dev = skb_dst(skb)->dev;
2055         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2056 }
2057
2058 #endif
2059
2060 /*
2061  *      Allocate a dst for local (unicast / anycast) address.
2062  */
2063
2064 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2065                                     const struct in6_addr *addr,
2066                                     int anycast)
2067 {
2068         struct net *net = dev_net(idev->dev);
2069         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2070                                             net->loopback_dev, 0);
2071         struct neighbour *neigh;
2072
2073         if (!rt) {
2074                 if (net_ratelimit())
2075                         pr_warning("IPv6:  Maximum number of routes reached,"
2076                                    " consider increasing route/max_size.\n");
2077                 return ERR_PTR(-ENOMEM);
2078         }
2079
2080         in6_dev_hold(idev);
2081
2082         rt->dst.flags |= DST_HOST;
2083         rt->dst.input = ip6_input;
2084         rt->dst.output = ip6_output;
2085         rt->rt6i_idev = idev;
2086         rt->dst.obsolete = -1;
2087
2088         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2089         if (anycast)
2090                 rt->rt6i_flags |= RTF_ANYCAST;
2091         else
2092                 rt->rt6i_flags |= RTF_LOCAL;
2093         neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev);
2094         if (IS_ERR(neigh)) {
2095                 dst_free(&rt->dst);
2096
2097                 return ERR_CAST(neigh);
2098         }
2099         dst_set_neighbour(&rt->dst, neigh);
2100
2101         rt->rt6i_dst.addr = *addr;
2102         rt->rt6i_dst.plen = 128;
2103         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2104
2105         atomic_set(&rt->dst.__refcnt, 1);
2106
2107         return rt;
2108 }
2109
2110 int ip6_route_get_saddr(struct net *net,
2111                         struct rt6_info *rt,
2112                         const struct in6_addr *daddr,
2113                         unsigned int prefs,
2114                         struct in6_addr *saddr)
2115 {
2116         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2117         int err = 0;
2118         if (rt->rt6i_prefsrc.plen)
2119                 *saddr = rt->rt6i_prefsrc.addr;
2120         else
2121                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2122                                          daddr, prefs, saddr);
2123         return err;
2124 }
2125
2126 /* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;	/* address compared against rt6i_prefsrc.addr */
};
2132
2133 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2134 {
2135         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2136         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2137         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2138
2139         if (((void *)rt->rt6i_dev == dev || !dev) &&
2140             rt != net->ipv6.ip6_null_entry &&
2141             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2142                 /* remove prefsrc entry */
2143                 rt->rt6i_prefsrc.plen = 0;
2144         }
2145         return 0;
2146 }
2147
2148 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2149 {
2150         struct net *net = dev_net(ifp->idev->dev);
2151         struct arg_dev_net_ip adni = {
2152                 .dev = ifp->idev->dev,
2153                 .net = net,
2154                 .addr = &ifp->addr,
2155         };
2156         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2157 }
2158
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is kept */
};
2163
2164 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2165 {
2166         const struct arg_dev_net *adn = arg;
2167         const struct net_device *dev = adn->dev;
2168
2169         if ((rt->rt6i_dev == dev || !dev) &&
2170             rt != adn->net->ipv6.ip6_null_entry) {
2171                 RT6_TRACE("deleted by ifdown %p\n", rt);
2172                 return -1;
2173         }
2174         return 0;
2175 }
2176
2177 void rt6_ifdown(struct net *net, struct net_device *dev)
2178 {
2179         struct arg_dev_net adn = {
2180                 .dev = dev,
2181                 .net = net,
2182         };
2183
2184         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2185         icmp6_clean_all(fib6_ifdown, &adn);
2186 }
2187
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU written into matching routes */
};
2193
2194 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2195 {
2196         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2197         struct inet6_dev *idev;
2198
2199         /* In IPv6 pmtu discovery is not optional,
2200            so that RTAX_MTU lock cannot disable it.
2201            We still use this lock to block changes
2202            caused by addrconf/ndisc.
2203         */
2204
2205         idev = __in6_dev_get(arg->dev);
2206         if (!idev)
2207                 return 0;
2208
2209         /* For administrative MTU increase, there is no way to discover
2210            IPv6 PMTU increase, so PMTU increase should be updated here.
2211            Since RFC 1981 doesn't include administrative MTU increase
2212            update PMTU increase is a MUST. (i.e. jumbo frame)
2213          */
2214         /*
2215            If new MTU is less than route PMTU, this new MTU will be the
2216            lowest MTU in the path, update the route PMTU to reflect PMTU
2217            decreases; if new MTU is greater than route PMTU, and the
2218            old MTU is the lowest MTU in the path, update the route PMTU
2219            to reflect the increase. In this case if the other nodes' MTU
2220            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2221            PMTU discouvery.
2222          */
2223         if (rt->rt6i_dev == arg->dev &&
2224             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2225             (dst_mtu(&rt->dst) >= arg->mtu ||
2226              (dst_mtu(&rt->dst) < arg->mtu &&
2227               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2228                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2229         }
2230         return 0;
2231 }
2232
2233 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2234 {
2235         struct rt6_mtu_change_arg arg = {
2236                 .dev = dev,
2237                 .mtu = mtu,
2238         };
2239
2240         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2241 }
2242
2243 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2244         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2245         [RTA_OIF]               = { .type = NLA_U32 },
2246         [RTA_IIF]               = { .type = NLA_U32 },
2247         [RTA_PRIORITY]          = { .type = NLA_U32 },
2248         [RTA_METRICS]           = { .type = NLA_NESTED },
2249 };
2250
2251 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2252                               struct fib6_config *cfg)
2253 {
2254         struct rtmsg *rtm;
2255         struct nlattr *tb[RTA_MAX+1];
2256         int err;
2257
2258         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2259         if (err < 0)
2260                 goto errout;
2261
2262         err = -EINVAL;
2263         rtm = nlmsg_data(nlh);
2264         memset(cfg, 0, sizeof(*cfg));
2265
2266         cfg->fc_table = rtm->rtm_table;
2267         cfg->fc_dst_len = rtm->rtm_dst_len;
2268         cfg->fc_src_len = rtm->rtm_src_len;
2269         cfg->fc_flags = RTF_UP;
2270         cfg->fc_protocol = rtm->rtm_protocol;
2271
2272         if (rtm->rtm_type == RTN_UNREACHABLE)
2273                 cfg->fc_flags |= RTF_REJECT;
2274
2275         if (rtm->rtm_type == RTN_LOCAL)
2276                 cfg->fc_flags |= RTF_LOCAL;
2277
2278         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2279         cfg->fc_nlinfo.nlh = nlh;
2280         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2281
2282         if (tb[RTA_GATEWAY]) {
2283                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2284                 cfg->fc_flags |= RTF_GATEWAY;
2285         }
2286
2287         if (tb[RTA_DST]) {
2288                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2289
2290                 if (nla_len(tb[RTA_DST]) < plen)
2291                         goto errout;
2292
2293                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2294         }
2295
2296         if (tb[RTA_SRC]) {
2297                 int plen = (rtm->rtm_src_len + 7) >> 3;
2298
2299                 if (nla_len(tb[RTA_SRC]) < plen)
2300                         goto errout;
2301
2302                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2303         }
2304
2305         if (tb[RTA_PREFSRC])
2306                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2307
2308         if (tb[RTA_OIF])
2309                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2310
2311         if (tb[RTA_PRIORITY])
2312                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2313
2314         if (tb[RTA_METRICS]) {
2315                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2316                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2317         }
2318
2319         if (tb[RTA_TABLE])
2320                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2321
2322         err = 0;
2323 errout:
2324         return err;
2325 }
2326
2327 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2328 {
2329         struct fib6_config cfg;
2330         int err;
2331
2332         err = rtm_to_fib6_config(skb, nlh, &cfg);
2333         if (err < 0)
2334                 return err;
2335
2336         return ip6_route_del(&cfg);
2337 }
2338
2339 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2340 {
2341         struct fib6_config cfg;
2342         int err;
2343
2344         err = rtm_to_fib6_config(skb, nlh, &cfg);
2345         if (err < 0)
2346                 return err;
2347
2348         return ip6_route_add(&cfg);
2349 }
2350
2351 static inline size_t rt6_nlmsg_size(void)
2352 {
2353         return NLMSG_ALIGN(sizeof(struct rtmsg))
2354                + nla_total_size(16) /* RTA_SRC */
2355                + nla_total_size(16) /* RTA_DST */
2356                + nla_total_size(16) /* RTA_GATEWAY */
2357                + nla_total_size(16) /* RTA_PREFSRC */
2358                + nla_total_size(4) /* RTA_TABLE */
2359                + nla_total_size(4) /* RTA_IIF */
2360                + nla_total_size(4) /* RTA_OIF */
2361                + nla_total_size(4) /* RTA_PRIORITY */
2362                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2363                + nla_total_size(sizeof(struct rta_cacheinfo));
2364 }
2365
2366 static int rt6_fill_node(struct net *net,
2367                          struct sk_buff *skb, struct rt6_info *rt,
2368                          struct in6_addr *dst, struct in6_addr *src,
2369                          int iif, int type, u32 pid, u32 seq,
2370                          int prefix, int nowait, unsigned int flags)
2371 {
2372         struct rtmsg *rtm;
2373         struct nlmsghdr *nlh;
2374         long expires;
2375         u32 table;
2376         struct neighbour *n;
2377
2378         if (prefix) {   /* user wants prefix routes only */
2379                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2380                         /* success since this is not a prefix route */
2381                         return 1;
2382                 }
2383         }
2384
2385         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2386         if (!nlh)
2387                 return -EMSGSIZE;
2388
2389         rtm = nlmsg_data(nlh);
2390         rtm->rtm_family = AF_INET6;
2391         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2392         rtm->rtm_src_len = rt->rt6i_src.plen;
2393         rtm->rtm_tos = 0;
2394         if (rt->rt6i_table)
2395                 table = rt->rt6i_table->tb6_id;
2396         else
2397                 table = RT6_TABLE_UNSPEC;
2398         rtm->rtm_table = table;
2399         NLA_PUT_U32(skb, RTA_TABLE, table);
2400         if (rt->rt6i_flags & RTF_REJECT)
2401                 rtm->rtm_type = RTN_UNREACHABLE;
2402         else if (rt->rt6i_flags & RTF_LOCAL)
2403                 rtm->rtm_type = RTN_LOCAL;
2404         else if (rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
2405                 rtm->rtm_type = RTN_LOCAL;
2406         else
2407                 rtm->rtm_type = RTN_UNICAST;
2408         rtm->rtm_flags = 0;
2409         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2410         rtm->rtm_protocol = rt->rt6i_protocol;
2411         if (rt->rt6i_flags & RTF_DYNAMIC)
2412                 rtm->rtm_protocol = RTPROT_REDIRECT;
2413         else if (rt->rt6i_flags & RTF_ADDRCONF)
2414                 rtm->rtm_protocol = RTPROT_KERNEL;
2415         else if (rt->rt6i_flags & RTF_DEFAULT)
2416                 rtm->rtm_protocol = RTPROT_RA;
2417
2418         if (rt->rt6i_flags & RTF_CACHE)
2419                 rtm->rtm_flags |= RTM_F_CLONED;
2420
2421         if (dst) {
2422                 NLA_PUT(skb, RTA_DST, 16, dst);
2423                 rtm->rtm_dst_len = 128;
2424         } else if (rtm->rtm_dst_len)
2425                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2426 #ifdef CONFIG_IPV6_SUBTREES
2427         if (src) {
2428                 NLA_PUT(skb, RTA_SRC, 16, src);
2429                 rtm->rtm_src_len = 128;
2430         } else if (rtm->rtm_src_len)
2431                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2432 #endif
2433         if (iif) {
2434 #ifdef CONFIG_IPV6_MROUTE
2435                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2436                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2437                         if (err <= 0) {
2438                                 if (!nowait) {
2439                                         if (err == 0)
2440                                                 return 0;
2441                                         goto nla_put_failure;
2442                                 } else {
2443                                         if (err == -EMSGSIZE)
2444                                                 goto nla_put_failure;
2445                                 }
2446                         }
2447                 } else
2448 #endif
2449                         NLA_PUT_U32(skb, RTA_IIF, iif);
2450         } else if (dst) {
2451                 struct in6_addr saddr_buf;
2452                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2453                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2454         }
2455
2456         if (rt->rt6i_prefsrc.plen) {
2457                 struct in6_addr saddr_buf;
2458                 saddr_buf = rt->rt6i_prefsrc.addr;
2459                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2460         }
2461
2462         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2463                 goto nla_put_failure;
2464
2465         rcu_read_lock();
2466         n = dst_get_neighbour(&rt->dst);
2467         if (n)
2468                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2469         rcu_read_unlock();
2470
2471         if (rt->dst.dev)
2472                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2473
2474         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2475
2476         if (!(rt->rt6i_flags & RTF_EXPIRES))
2477                 expires = 0;
2478         else if (rt->rt6i_expires - jiffies < INT_MAX)
2479                 expires = rt->rt6i_expires - jiffies;
2480         else
2481                 expires = INT_MAX;
2482
2483         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2484                                expires, rt->dst.error) < 0)
2485                 goto nla_put_failure;
2486
2487         return nlmsg_end(skb, nlh);
2488
2489 nla_put_failure:
2490         nlmsg_cancel(skb, nlh);
2491         return -EMSGSIZE;
2492 }
2493
2494 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2495 {
2496         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2497         int prefix;
2498
2499         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2500                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2501                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2502         } else
2503                 prefix = 0;
2504
2505         return rt6_fill_node(arg->net,
2506                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2507                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2508                      prefix, 0, NLM_F_MULTI);
2509 }
2510
2511 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2512 {
2513         struct net *net = sock_net(in_skb->sk);
2514         struct nlattr *tb[RTA_MAX+1];
2515         struct rt6_info *rt;
2516         struct sk_buff *skb;
2517         struct rtmsg *rtm;
2518         struct flowi6 fl6;
2519         int err, iif = 0;
2520
2521         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2522         if (err < 0)
2523                 goto errout;
2524
2525         err = -EINVAL;
2526         memset(&fl6, 0, sizeof(fl6));
2527
2528         if (tb[RTA_SRC]) {
2529                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2530                         goto errout;
2531
2532                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2533         }
2534
2535         if (tb[RTA_DST]) {
2536                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2537                         goto errout;
2538
2539                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2540         }
2541
2542         if (tb[RTA_IIF])
2543                 iif = nla_get_u32(tb[RTA_IIF]);
2544
2545         if (tb[RTA_OIF])
2546                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2547
2548         if (iif) {
2549                 struct net_device *dev;
2550                 dev = __dev_get_by_index(net, iif);
2551                 if (!dev) {
2552                         err = -ENODEV;
2553                         goto errout;
2554                 }
2555         }
2556
2557         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2558         if (!skb) {
2559                 err = -ENOBUFS;
2560                 goto errout;
2561         }
2562
2563         /* Reserve room for dummy headers, this skb can pass
2564            through good chunk of routing engine.
2565          */
2566         skb_reset_mac_header(skb);
2567         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2568
2569         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2570         skb_dst_set(skb, &rt->dst);
2571
2572         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2573                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2574                             nlh->nlmsg_seq, 0, 0, 0);
2575         if (err < 0) {
2576                 kfree_skb(skb);
2577                 goto errout;
2578         }
2579
2580         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2581 errout:
2582         return err;
2583 }
2584
2585 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2586 {
2587         struct sk_buff *skb;
2588         struct net *net = info->nl_net;
2589         u32 seq;
2590         int err;
2591
2592         err = -ENOBUFS;
2593         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2594
2595         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2596         if (!skb)
2597                 goto errout;
2598
2599         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2600                                 event, info->pid, seq, 0, 0, 0);
2601         if (err < 0) {
2602                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2603                 WARN_ON(err == -EMSGSIZE);
2604                 kfree_skb(skb);
2605                 goto errout;
2606         }
2607         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2608                     info->nlh, gfp_any());
2609         return;
2610 errout:
2611         if (err < 0)
2612                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2613 }
2614
2615 static int ip6_route_dev_notify(struct notifier_block *this,
2616                                 unsigned long event, void *data)
2617 {
2618         struct net_device *dev = (struct net_device *)data;
2619         struct net *net = dev_net(dev);
2620
2621         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2622                 net->ipv6.ip6_null_entry->dst.dev = dev;
2623                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2624 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2625                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2626                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2627                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2628                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2629 #endif
2630         }
2631
2632         return NOTIFY_OK;
2633 }
2634
2635 /*
2636  *      /proc
2637  */
2638
2639 #ifdef CONFIG_PROC_FS
2640
/*
 * Buffer-cursor bookkeeping for /proc output.
 * NOTE(review): nothing in the seq_file based code below references this
 * structure; it may be a leftover - confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2649
2650 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2651 {
2652         struct seq_file *m = p_arg;
2653         struct neighbour *n;
2654
2655         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2656
2657 #ifdef CONFIG_IPV6_SUBTREES
2658         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2659 #else
2660         seq_puts(m, "00000000000000000000000000000000 00 ");
2661 #endif
2662         rcu_read_lock();
2663         n = dst_get_neighbour(&rt->dst);
2664         if (n) {
2665                 seq_printf(m, "%pi6", n->primary_key);
2666         } else {
2667                 seq_puts(m, "00000000000000000000000000000000");
2668         }
2669         rcu_read_unlock();
2670         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2671                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2672                    rt->dst.__use, rt->rt6i_flags,
2673                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2674         return 0;
2675 }
2676
2677 static int ipv6_route_show(struct seq_file *m, void *v)
2678 {
2679         struct net *net = (struct net *)m->private;
2680         fib6_clean_all(net, rt6_info_route, 0, m);
2681         return 0;
2682 }
2683
2684 static int ipv6_route_open(struct inode *inode, struct file *file)
2685 {
2686         return single_open_net(inode, file, ipv6_route_show);
2687 }
2688
2689 static const struct file_operations ipv6_route_proc_fops = {
2690         .owner          = THIS_MODULE,
2691         .open           = ipv6_route_open,
2692         .read           = seq_read,
2693         .llseek         = seq_lseek,
2694         .release        = single_release_net,
2695 };
2696
2697 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2698 {
2699         struct net *net = (struct net *)seq->private;
2700         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2701                    net->ipv6.rt6_stats->fib_nodes,
2702                    net->ipv6.rt6_stats->fib_route_nodes,
2703                    net->ipv6.rt6_stats->fib_rt_alloc,
2704                    net->ipv6.rt6_stats->fib_rt_entries,
2705                    net->ipv6.rt6_stats->fib_rt_cache,
2706                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2707                    net->ipv6.rt6_stats->fib_discarded_routes);
2708
2709         return 0;
2710 }
2711
2712 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2713 {
2714         return single_open_net(inode, file, rt6_stats_seq_show);
2715 }
2716
2717 static const struct file_operations rt6_stats_seq_fops = {
2718         .owner   = THIS_MODULE,
2719         .open    = rt6_stats_seq_open,
2720         .read    = seq_read,
2721         .llseek  = seq_lseek,
2722         .release = single_release_net,
2723 };
2724 #endif  /* CONFIG_PROC_FS */
2725
2726 #ifdef CONFIG_SYSCTL
2727
2728 static
2729 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2730                               void __user *buffer, size_t *lenp, loff_t *ppos)
2731 {
2732         struct net *net;
2733         int delay;
2734         if (!write)
2735                 return -EINVAL;
2736
2737         net = (struct net *)ctl->extra1;
2738         delay = net->ipv6.sysctl.flush_delay;
2739         proc_dointvec(ctl, write, buffer, lenp, ppos);
2740         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2741         return 0;
2742 }
2743
2744 ctl_table ipv6_route_table_template[] = {
2745         {
2746                 .procname       =       "flush",
2747                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2748                 .maxlen         =       sizeof(int),
2749                 .mode           =       0200,
2750                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2751         },
2752         {
2753                 .procname       =       "gc_thresh",
2754                 .data           =       &ip6_dst_ops_template.gc_thresh,
2755                 .maxlen         =       sizeof(int),
2756                 .mode           =       0644,
2757                 .proc_handler   =       proc_dointvec,
2758         },
2759         {
2760                 .procname       =       "max_size",
2761                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2762                 .maxlen         =       sizeof(int),
2763                 .mode           =       0644,
2764                 .proc_handler   =       proc_dointvec,
2765         },
2766         {
2767                 .procname       =       "gc_min_interval",
2768                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2769                 .maxlen         =       sizeof(int),
2770                 .mode           =       0644,
2771                 .proc_handler   =       proc_dointvec_jiffies,
2772         },
2773         {
2774                 .procname       =       "gc_timeout",
2775                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2776                 .maxlen         =       sizeof(int),
2777                 .mode           =       0644,
2778                 .proc_handler   =       proc_dointvec_jiffies,
2779         },
2780         {
2781                 .procname       =       "gc_interval",
2782                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2783                 .maxlen         =       sizeof(int),
2784                 .mode           =       0644,
2785                 .proc_handler   =       proc_dointvec_jiffies,
2786         },
2787         {
2788                 .procname       =       "gc_elasticity",
2789                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2790                 .maxlen         =       sizeof(int),
2791                 .mode           =       0644,
2792                 .proc_handler   =       proc_dointvec,
2793         },
2794         {
2795                 .procname       =       "mtu_expires",
2796                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2797                 .maxlen         =       sizeof(int),
2798                 .mode           =       0644,
2799                 .proc_handler   =       proc_dointvec_jiffies,
2800         },
2801         {
2802                 .procname       =       "min_adv_mss",
2803                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2804                 .maxlen         =       sizeof(int),
2805                 .mode           =       0644,
2806                 .proc_handler   =       proc_dointvec,
2807         },
2808         {
2809                 .procname       =       "gc_min_interval_ms",
2810                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2811                 .maxlen         =       sizeof(int),
2812                 .mode           =       0644,
2813                 .proc_handler   =       proc_dointvec_ms_jiffies,
2814         },
2815         { }
2816 };
2817
2818 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2819 {
2820         struct ctl_table *table;
2821
2822         table = kmemdup(ipv6_route_table_template,
2823                         sizeof(ipv6_route_table_template),
2824                         GFP_KERNEL);
2825
2826         if (table) {
2827                 table[0].data = &net->ipv6.sysctl.flush_delay;
2828                 table[0].extra1 = net;
2829                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2830                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2831                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2832                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2833                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2834                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2835                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2836                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2837                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2838         }
2839
2840         return table;
2841 }
2842 #endif
2843
2844 static int __net_init ip6_route_net_init(struct net *net)
2845 {
2846         int ret = -ENOMEM;
2847
2848         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2849                sizeof(net->ipv6.ip6_dst_ops));
2850
2851         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2852                 goto out_ip6_dst_ops;
2853
2854         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2855                                            sizeof(*net->ipv6.ip6_null_entry),
2856                                            GFP_KERNEL);
2857         if (!net->ipv6.ip6_null_entry)
2858                 goto out_ip6_dst_entries;
2859         net->ipv6.ip6_null_entry->dst.path =
2860                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2861         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2862         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2863                          ip6_template_metrics, true);
2864
2865 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2866         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2867                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2868                                                GFP_KERNEL);
2869         if (!net->ipv6.ip6_prohibit_entry)
2870                 goto out_ip6_null_entry;
2871         net->ipv6.ip6_prohibit_entry->dst.path =
2872                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2873         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2874         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2875                          ip6_template_metrics, true);
2876
2877         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2878                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2879                                                GFP_KERNEL);
2880         if (!net->ipv6.ip6_blk_hole_entry)
2881                 goto out_ip6_prohibit_entry;
2882         net->ipv6.ip6_blk_hole_entry->dst.path =
2883                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2884         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2885         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2886                          ip6_template_metrics, true);
2887 #endif
2888
2889         net->ipv6.sysctl.flush_delay = 0;
2890         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2891         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2892         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2893         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2894         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2895         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2896         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2897
2898 #ifdef CONFIG_PROC_FS
2899         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2900         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2901 #endif
2902         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2903
2904         ret = 0;
2905 out:
2906         return ret;
2907
2908 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2909 out_ip6_prohibit_entry:
2910         kfree(net->ipv6.ip6_prohibit_entry);
2911 out_ip6_null_entry:
2912         kfree(net->ipv6.ip6_null_entry);
2913 #endif
2914 out_ip6_dst_entries:
2915         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2916 out_ip6_dst_ops:
2917         goto out;
2918 }
2919
2920 static void __net_exit ip6_route_net_exit(struct net *net)
2921 {
2922 #ifdef CONFIG_PROC_FS
2923         proc_net_remove(net, "ipv6_route");
2924         proc_net_remove(net, "rt6_stats");
2925 #endif
2926         kfree(net->ipv6.ip6_null_entry);
2927 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2928         kfree(net->ipv6.ip6_prohibit_entry);
2929         kfree(net->ipv6.ip6_blk_hole_entry);
2930 #endif
2931         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2932 }
2933
2934 static struct pernet_operations ip6_route_net_ops = {
2935         .init = ip6_route_net_init,
2936         .exit = ip6_route_net_exit,
2937 };
2938
2939 static struct notifier_block ip6_route_dev_notifier = {
2940         .notifier_call = ip6_route_dev_notify,
2941         .priority = 0,
2942 };
2943
2944 int __init ip6_route_init(void)
2945 {
2946         int ret;
2947
2948         ret = -ENOMEM;
2949         ip6_dst_ops_template.kmem_cachep =
2950                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2951                                   SLAB_HWCACHE_ALIGN, NULL);
2952         if (!ip6_dst_ops_template.kmem_cachep)
2953                 goto out;
2954
2955         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2956         if (ret)
2957                 goto out_kmem_cache;
2958
2959         ret = register_pernet_subsys(&ip6_route_net_ops);
2960         if (ret)
2961                 goto out_dst_entries;
2962
2963         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2964
2965         /* Registering of the loopback is done before this portion of code,
2966          * the loopback reference in rt6_info will not be taken, do it
2967          * manually for init_net */
2968         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2969         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2970   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2971         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2972         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2973         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2974         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2975   #endif
2976         ret = fib6_init();
2977         if (ret)
2978                 goto out_register_subsys;
2979
2980         ret = xfrm6_init();
2981         if (ret)
2982                 goto out_fib6_init;
2983
2984         ret = fib6_rules_init();
2985         if (ret)
2986                 goto xfrm6_init;
2987
2988         ret = -ENOBUFS;
2989         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2990             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2991             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2992                 goto fib6_rules_init;
2993
2994         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2995         if (ret)
2996                 goto fib6_rules_init;
2997
2998 out:
2999         return ret;
3000
3001 fib6_rules_init:
3002         fib6_rules_cleanup();
3003 xfrm6_init:
3004         xfrm6_fini();
3005 out_fib6_init:
3006         fib6_gc_cleanup();
3007 out_register_subsys:
3008         unregister_pernet_subsys(&ip6_route_net_ops);
3009 out_dst_entries:
3010         dst_entries_destroy(&ip6_dst_blackhole_ops);
3011 out_kmem_cache:
3012         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3013         goto out;
3014 }
3015
3016 void ip6_route_cleanup(void)
3017 {
3018         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3019         fib6_rules_cleanup();
3020         xfrm6_fini();
3021         fib6_gc_cleanup();
3022         unregister_pernet_subsys(&ip6_route_net_ops);
3023         dst_entries_destroy(&ip6_dst_blackhole_ops);
3024         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3025 }