Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph...
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Compile-time debug switch for this file.  With RT6_DEBUG >= 3, RDBG(x)
 * expands to "printk x" (so x must be a parenthesised argument list) and
 * RT6_TRACE() to a KERN_DEBUG printk; otherwise RDBG() expands to nothing
 * and RT6_TRACE() to an empty statement.
 */
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
/*
 * Forward declarations for file-local functions that are referenced before
 * their definitions (by the dst_ops tables, the entry templates and the
 * lookup code below).
 */
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
/* Packet handlers and dst callbacks used by the templates below. */
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102         struct rt6_info *rt = (struct rt6_info *) dst;
103         struct inet_peer *peer;
104         u32 *p = NULL;
105
106         if (!rt->rt6i_peer)
107                 rt6_bind_peer(rt, 1);
108
109         peer = rt->rt6i_peer;
110         if (peer) {
111                 u32 *old_p = __DST_METRICS_PTR(old);
112                 unsigned long prev, new;
113
114                 p = peer->metrics;
115                 if (inet_metrics_new(peer))
116                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117
118                 new = (unsigned long) p;
119                 prev = cmpxchg(&dst->_metrics, old, new);
120
121                 if (prev != old) {
122                         p = __DST_METRICS_PTR(prev);
123                         if (prev & DST_METRICS_READ_ONLY)
124                                 p = NULL;
125                 }
126         }
127         return p;
128 }
129
/*
 * dst_ops for regular IPv6 routes: address family and protocol, a gc
 * threshold of 1024 entries, and the callbacks implemented in this file
 * (plus __ip6_local_out).
 * NOTE(review): the "_template" suffix and the use of
 * net->ipv6.ip6_dst_ops elsewhere in this file suggest this is copied per
 * network namespace; the copy is not visible in this part of the file.
 */
130 static struct dst_ops ip6_dst_ops_template = {
131         .family                 =       AF_INET6,
132         .protocol               =       cpu_to_be16(ETH_P_IPV6),
133         .gc                     =       ip6_dst_gc,
134         .gc_thresh              =       1024,
135         .check                  =       ip6_dst_check,
136         .default_advmss         =       ip6_default_advmss,
137         .default_mtu            =       ip6_default_mtu,
138         .cow_metrics            =       ipv6_cow_metrics,
139         .destroy                =       ip6_dst_destroy,
140         .ifdown                 =       ip6_dst_ifdown,
141         .negative_advice        =       ip6_negative_advice,
142         .link_failure           =       ip6_link_failure,
143         .update_pmtu            =       ip6_rt_update_pmtu,
144         .local_out              =       __ip6_local_out,
145 };
146
/*
 * .default_mtu hook of ip6_dst_blackhole_ops: always reports 0,
 * whatever @dst is.
 */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
        const unsigned int blackhole_mtu = 0;

        (void)dst;
        return blackhole_mtu;
}
151
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
153 {
154 }
155
156 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
157                                          unsigned long old)
158 {
159         return NULL;
160 }
161
/*
 * dst_ops used for entries allocated by ip6_blackhole_route().  It shares
 * destroy/check/default_advmss with the regular ops, but reports an MTU
 * of 0, ignores PMTU updates and never hands out metrics.
 */
162 static struct dst_ops ip6_dst_blackhole_ops = {
163         .family                 =       AF_INET6,
164         .protocol               =       cpu_to_be16(ETH_P_IPV6),
165         .destroy                =       ip6_dst_destroy,
166         .check                  =       ip6_dst_check,
167         .default_mtu            =       ip6_blackhole_default_mtu,
168         .default_advmss         =       ip6_default_advmss,
169         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
170         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
171 };
172
/*
 * Read-only metrics array with only the hop-limit slot set (to 255); all
 * other entries are zero.  Metric slots are indexed by RTAX_* - 1.
 * NOTE(review): its users are outside the visible part of the file.
 */
173 static const u32 ip6_template_metrics[RTAX_MAX] = {
174         [RTAX_HOPLIMIT - 1] = 255,
175 };
176
/*
 * Template for the "null" route entry: a reject route whose dst carries
 * -ENETUNREACH and whose input/output handlers are the ip6_pkt_discard
 * functions.  The lookup code in this file returns the per-net instance
 * (net->ipv6.ip6_null_entry) when no route matches.
 */
177 static struct rt6_info ip6_null_entry_template = {
178         .dst = {
179                 .__refcnt       = ATOMIC_INIT(1),
180                 .__use          = 1,
181                 .obsolete       = -1,
182                 .error          = -ENETUNREACH,
183                 .input          = ip6_pkt_discard,
184                 .output         = ip6_pkt_discard_out,
185         },
186         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
187         .rt6i_protocol  = RTPROT_KERNEL,
188         .rt6i_metric    = ~(u32) 0,
189         .rt6i_ref       = ATOMIC_INIT(1),
190 };
191
192 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
193
/* Handlers for the prohibit entry; defined later in the file. */
194 static int ip6_pkt_prohibit(struct sk_buff *skb);
195 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
196
/*
 * Template for the "prohibit" route entry (CONFIG_IPV6_MULTIPLE_TABLES
 * only): same shape as the null entry, but the dst error is -EACCES and
 * packets go to the ip6_pkt_prohibit handlers.
 */
197 static struct rt6_info ip6_prohibit_entry_template = {
198         .dst = {
199                 .__refcnt       = ATOMIC_INIT(1),
200                 .__use          = 1,
201                 .obsolete       = -1,
202                 .error          = -EACCES,
203                 .input          = ip6_pkt_prohibit,
204                 .output         = ip6_pkt_prohibit_out,
205         },
206         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
207         .rt6i_protocol  = RTPROT_KERNEL,
208         .rt6i_metric    = ~(u32) 0,
209         .rt6i_ref       = ATOMIC_INIT(1),
210 };
211
/*
 * Template for the "blackhole" route entry (CONFIG_IPV6_MULTIPLE_TABLES
 * only): the dst error is -EINVAL and both input and output are
 * dst_discard.
 */
212 static struct rt6_info ip6_blk_hole_entry_template = {
213         .dst = {
214                 .__refcnt       = ATOMIC_INIT(1),
215                 .__use          = 1,
216                 .obsolete       = -1,
217                 .error          = -EINVAL,
218                 .input          = dst_discard,
219                 .output         = dst_discard,
220         },
221         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
222         .rt6i_protocol  = RTPROT_KERNEL,
223         .rt6i_metric    = ~(u32) 0,
224         .rt6i_ref       = ATOMIC_INIT(1),
225 };
226
227 #endif
228
229 /* allocate dst with ip6_dst_ops */
/*
 * Allocate a dst entry from @ops (second dst_alloc() argument 0) and
 * return it viewed as an rt6_info.  The result of dst_alloc() is passed
 * through unchecked.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
        struct rt6_info *rt;

        rt = (struct rt6_info *)dst_alloc(ops, 0);
        return rt;
}
234
235 static void ip6_dst_destroy(struct dst_entry *dst)
236 {
237         struct rt6_info *rt = (struct rt6_info *)dst;
238         struct inet6_dev *idev = rt->rt6i_idev;
239         struct inet_peer *peer = rt->rt6i_peer;
240
241         if (idev != NULL) {
242                 rt->rt6i_idev = NULL;
243                 in6_dev_put(idev);
244         }
245         if (peer) {
246                 rt->rt6i_peer = NULL;
247                 inet_putpeer(peer);
248         }
249 }
250
251 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
252
253 static u32 rt6_peer_genid(void)
254 {
255         return atomic_read(&__rt6_peer_genid);
256 }
257
258 void rt6_bind_peer(struct rt6_info *rt, int create)
259 {
260         struct inet_peer *peer;
261
262         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
263         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
264                 inet_putpeer(peer);
265         else
266                 rt->rt6i_peer_genid = rt6_peer_genid();
267 }
268
269 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
270                            int how)
271 {
272         struct rt6_info *rt = (struct rt6_info *)dst;
273         struct inet6_dev *idev = rt->rt6i_idev;
274         struct net_device *loopback_dev =
275                 dev_net(dev)->loopback_dev;
276
277         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
278                 struct inet6_dev *loopback_idev =
279                         in6_dev_get(loopback_dev);
280                 if (loopback_idev != NULL) {
281                         rt->rt6i_idev = loopback_idev;
282                         in6_dev_put(idev);
283                 }
284         }
285 }
286
287 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
288 {
289         return (rt->rt6i_flags & RTF_EXPIRES) &&
290                 time_after(jiffies, rt->rt6i_expires);
291 }
292
293 static inline int rt6_need_strict(struct in6_addr *daddr)
294 {
295         return ipv6_addr_type(daddr) &
296                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
297 }
298
299 /*
300  *      Route lookup. Any table->tb6_lock is implied.
301  */
302
303 static inline struct rt6_info *rt6_device_match(struct net *net,
304                                                     struct rt6_info *rt,
305                                                     struct in6_addr *saddr,
306                                                     int oif,
307                                                     int flags)
308 {
309         struct rt6_info *local = NULL;
310         struct rt6_info *sprt;
311
312         if (!oif && ipv6_addr_any(saddr))
313                 goto out;
314
315         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
316                 struct net_device *dev = sprt->rt6i_dev;
317
318                 if (oif) {
319                         if (dev->ifindex == oif)
320                                 return sprt;
321                         if (dev->flags & IFF_LOOPBACK) {
322                                 if (sprt->rt6i_idev == NULL ||
323                                     sprt->rt6i_idev->dev->ifindex != oif) {
324                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
325                                                 continue;
326                                         if (local && (!oif ||
327                                                       local->rt6i_idev->dev->ifindex == oif))
328                                                 continue;
329                                 }
330                                 local = sprt;
331                         }
332                 } else {
333                         if (ipv6_chk_addr(net, saddr, dev,
334                                           flags & RT6_LOOKUP_F_IFACE))
335                                 return sprt;
336                 }
337         }
338
339         if (oif) {
340                 if (local)
341                         return local;
342
343                 if (flags & RT6_LOOKUP_F_IFACE)
344                         return net->ipv6.ip6_null_entry;
345         }
346 out:
347         return rt;
348 }
349
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the next-hop neighbour of @rt is not in
 * a NUD_VALID state and has not been updated within the interface's
 * rtr_probe_interval, send a Neighbour Solicitation to the solicited-node
 * multicast address of the neighbour.
 *
 * @rt may be NULL, and a route without a next-hop neighbour is ignored.
 *
 * neigh->updated is written here, so the neighbour lock is taken for
 * writing; a read lock does not serialise that store against other
 * lock holders.  The lock is dropped before the solicitation is sent.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;
        write_lock_bh(&neigh->lock);
        /* Re-check the state under the lock before rate-limiting. */
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                neigh->updated = jiffies;
                write_unlock_bh(&neigh->lock);

                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        } else
                write_unlock_bh(&neigh->lock);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
384
385 /*
386  * Default Router Selection (RFC 2461 6.3.6)
387  */
388 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
389 {
390         struct net_device *dev = rt->rt6i_dev;
391         if (!oif || dev->ifindex == oif)
392                 return 2;
393         if ((dev->flags & IFF_LOOPBACK) &&
394             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
395                 return 1;
396         return 0;
397 }
398
399 static inline int rt6_check_neigh(struct rt6_info *rt)
400 {
401         struct neighbour *neigh = rt->rt6i_nexthop;
402         int m;
403         if (rt->rt6i_flags & RTF_NONEXTHOP ||
404             !(rt->rt6i_flags & RTF_GATEWAY))
405                 m = 1;
406         else if (neigh) {
407                 read_lock_bh(&neigh->lock);
408                 if (neigh->nud_state & NUD_VALID)
409                         m = 2;
410 #ifdef CONFIG_IPV6_ROUTER_PREF
411                 else if (neigh->nud_state & NUD_FAILED)
412                         m = 0;
413 #endif
414                 else
415                         m = 1;
416                 read_unlock_bh(&neigh->lock);
417         } else
418                 m = 0;
419         return m;
420 }
421
422 static int rt6_score_route(struct rt6_info *rt, int oif,
423                            int strict)
424 {
425         int m, n;
426
427         m = rt6_check_dev(rt, oif);
428         if (!m && (strict & RT6_LOOKUP_F_IFACE))
429                 return -1;
430 #ifdef CONFIG_IPV6_ROUTER_PREF
431         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
432 #endif
433         n = rt6_check_neigh(rt);
434         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
435                 return -1;
436         return m;
437 }
438
439 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
440                                    int *mpri, struct rt6_info *match)
441 {
442         int m;
443
444         if (rt6_check_expired(rt))
445                 goto out;
446
447         m = rt6_score_route(rt, oif, strict);
448         if (m < 0)
449                 goto out;
450
451         if (m > *mpri) {
452                 if (strict & RT6_LOOKUP_F_REACHABLE)
453                         rt6_probe(match);
454                 *mpri = m;
455                 match = rt;
456         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
457                 rt6_probe(rt);
458         }
459
460 out:
461         return match;
462 }
463
464 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
465                                      struct rt6_info *rr_head,
466                                      u32 metric, int oif, int strict)
467 {
468         struct rt6_info *rt, *match;
469         int mpri = -1;
470
471         match = NULL;
472         for (rt = rr_head; rt && rt->rt6i_metric == metric;
473              rt = rt->dst.rt6_next)
474                 match = find_match(rt, oif, strict, &mpri, match);
475         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
476              rt = rt->dst.rt6_next)
477                 match = find_match(rt, oif, strict, &mpri, match);
478
479         return match;
480 }
481
482 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
483 {
484         struct rt6_info *match, *rt0;
485         struct net *net;
486
487         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
488                   __func__, fn->leaf, oif);
489
490         rt0 = fn->rr_ptr;
491         if (!rt0)
492                 fn->rr_ptr = rt0 = fn->leaf;
493
494         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
495
496         if (!match &&
497             (strict & RT6_LOOKUP_F_REACHABLE)) {
498                 struct rt6_info *next = rt0->dst.rt6_next;
499
500                 /* no entries matched; do round-robin */
501                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
502                         next = fn->leaf;
503
504                 if (next != rt0)
505                         fn->rr_ptr = next;
506         }
507
508         RT6_TRACE("%s() => %p\n",
509                   __func__, match);
510
511         net = dev_net(rt0->rt6i_dev);
512         return match ? match : net->ipv6.ip6_null_entry;
513 }
514
515 #ifdef CONFIG_IPV6_ROUTE_INFO
516 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
517                   struct in6_addr *gwaddr)
518 {
519         struct net *net = dev_net(dev);
520         struct route_info *rinfo = (struct route_info *) opt;
521         struct in6_addr prefix_buf, *prefix;
522         unsigned int pref;
523         unsigned long lifetime;
524         struct rt6_info *rt;
525
526         if (len < sizeof(struct route_info)) {
527                 return -EINVAL;
528         }
529
530         /* Sanity check for prefix_len and length */
531         if (rinfo->length > 3) {
532                 return -EINVAL;
533         } else if (rinfo->prefix_len > 128) {
534                 return -EINVAL;
535         } else if (rinfo->prefix_len > 64) {
536                 if (rinfo->length < 2) {
537                         return -EINVAL;
538                 }
539         } else if (rinfo->prefix_len > 0) {
540                 if (rinfo->length < 1) {
541                         return -EINVAL;
542                 }
543         }
544
545         pref = rinfo->route_pref;
546         if (pref == ICMPV6_ROUTER_PREF_INVALID)
547                 return -EINVAL;
548
549         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
550
551         if (rinfo->length == 3)
552                 prefix = (struct in6_addr *)rinfo->prefix;
553         else {
554                 /* this function is safe */
555                 ipv6_addr_prefix(&prefix_buf,
556                                  (struct in6_addr *)rinfo->prefix,
557                                  rinfo->prefix_len);
558                 prefix = &prefix_buf;
559         }
560
561         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
562                                 dev->ifindex);
563
564         if (rt && !lifetime) {
565                 ip6_del_rt(rt);
566                 rt = NULL;
567         }
568
569         if (!rt && lifetime)
570                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
571                                         pref);
572         else if (rt)
573                 rt->rt6i_flags = RTF_ROUTEINFO |
574                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
575
576         if (rt) {
577                 if (!addrconf_finite_timeout(lifetime)) {
578                         rt->rt6i_flags &= ~RTF_EXPIRES;
579                 } else {
580                         rt->rt6i_expires = jiffies + HZ * lifetime;
581                         rt->rt6i_flags |= RTF_EXPIRES;
582                 }
583                 dst_release(&rt->dst);
584         }
585         return 0;
586 }
587 #endif
588
/*
 * BACKTRACK(__net, saddr): continue a lookup higher up the FIB tree when
 * the current node yielded only the null entry.
 *
 * This macro is not hygienic.  It reads and writes the caller's local
 * variables "rt" and "fn", and jumps to the caller's labels "out" and
 * "restart".
 *
 * If rt is the null entry, walk towards the root:
 *   - at a node flagged RTN_TL_ROOT, give up (goto out);
 *   - otherwise step to the parent; if the parent has a subtree other
 *     than the node we came from, look @saddr up in that subtree instead;
 *   - as soon as the new node is flagged RTN_RTINFO, goto restart.
 * If rt is anything else the macro does nothing.
 */
589 #define BACKTRACK(__net, saddr)                 \
590 do { \
591         if (rt == __net->ipv6.ip6_null_entry) { \
592                 struct fib6_node *pn; \
593                 while (1) { \
594                         if (fn->fn_flags & RTN_TL_ROOT) \
595                                 goto out; \
596                         pn = fn->parent; \
597                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
598                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
599                         else \
600                                 fn = pn; \
601                         if (fn->fn_flags & RTN_RTINFO) \
602                                 goto restart; \
603                 } \
604         } \
605 } while(0)
606
/*
 * Plain route lookup in @table for flow @fl6, used as the callback of
 * fib6_rule_lookup() in rt6_lookup().
 *
 * Runs entirely under the table's read lock.  The deepest matching node
 * is looked up, rt6_device_match() picks a route from its leaf list, and
 * BACKTRACK retries at an ancestor node if that produced the null entry.
 * The returned route has been passed to dst_use() with the current
 * jiffies before the lock is dropped.
 */
607 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
608                                              struct fib6_table *table,
609                                              struct flowi6 *fl6, int flags)
610 {
611         struct fib6_node *fn;
612         struct rt6_info *rt;
613
614         read_lock_bh(&table->tb6_lock);
615         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
/* BACKTRACK jumps back here after moving fn to another node. */
616 restart:
617         rt = fn->leaf;
618         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
619         BACKTRACK(net, &fl6->saddr);
/* Also reached from BACKTRACK when the top-level root is hit. */
620 out:
621         dst_use(&rt->dst, jiffies);
622         read_unlock_bh(&table->tb6_lock);
623         return rt;
624
625 }
626
627 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
628                             const struct in6_addr *saddr, int oif, int strict)
629 {
630         struct flowi6 fl6 = {
631                 .flowi6_oif = oif,
632                 .daddr = *daddr,
633         };
634         struct dst_entry *dst;
635         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
636
637         if (saddr) {
638                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
639                 flags |= RT6_LOOKUP_F_HAS_SADDR;
640         }
641
642         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
643         if (dst->error == 0)
644                 return (struct rt6_info *) dst;
645
646         dst_release(dst);
647
648         return NULL;
649 }
650
651 EXPORT_SYMBOL(rt6_lookup);
652
653 /* ip6_ins_rt is called with table->tb6_lock NOT held.
654    It takes a new route entry; if the addition fails for any reason,
655    the route is freed. In any case, if the caller does not hold it,
656    it may be destroyed.
657  */
658
659 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
660 {
661         int err;
662         struct fib6_table *table;
663
664         table = rt->rt6i_table;
665         write_lock_bh(&table->tb6_lock);
666         err = fib6_add(&table->tb6_root, rt, info);
667         write_unlock_bh(&table->tb6_lock);
668
669         return err;
670 }
671
672 int ip6_ins_rt(struct rt6_info *rt)
673 {
674         struct nl_info info = {
675                 .nl_net = dev_net(rt->rt6i_dev),
676         };
677         return __ip6_ins_rt(rt, &info);
678 }
679
680 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
681                                       struct in6_addr *saddr)
682 {
683         struct rt6_info *rt;
684
685         /*
686          *      Clone the route.
687          */
688
689         rt = ip6_rt_copy(ort);
690
691         if (rt) {
692                 struct neighbour *neigh;
693                 int attempts = !in_softirq();
694
695                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
696                         if (rt->rt6i_dst.plen != 128 &&
697                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
698                                 rt->rt6i_flags |= RTF_ANYCAST;
699                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
700                 }
701
702                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
703                 rt->rt6i_dst.plen = 128;
704                 rt->rt6i_flags |= RTF_CACHE;
705                 rt->dst.flags |= DST_HOST;
706
707 #ifdef CONFIG_IPV6_SUBTREES
708                 if (rt->rt6i_src.plen && saddr) {
709                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
710                         rt->rt6i_src.plen = 128;
711                 }
712 #endif
713
714         retry:
715                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
716                 if (IS_ERR(neigh)) {
717                         struct net *net = dev_net(rt->rt6i_dev);
718                         int saved_rt_min_interval =
719                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
720                         int saved_rt_elasticity =
721                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
722
723                         if (attempts-- > 0) {
724                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
725                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
726
727                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
728
729                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
730                                         saved_rt_elasticity;
731                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
732                                         saved_rt_min_interval;
733                                 goto retry;
734                         }
735
736                         if (net_ratelimit())
737                                 printk(KERN_WARNING
738                                        "ipv6: Neighbour table overflow.\n");
739                         dst_free(&rt->dst);
740                         return NULL;
741                 }
742                 rt->rt6i_nexthop = neigh;
743
744         }
745
746         return rt;
747 }
748
749 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
750 {
751         struct rt6_info *rt = ip6_rt_copy(ort);
752         if (rt) {
753                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
754                 rt->rt6i_dst.plen = 128;
755                 rt->rt6i_flags |= RTF_CACHE;
756                 rt->dst.flags |= DST_HOST;
757                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
758         }
759         return rt;
760 }
761
/*
 * Core policy-routing lookup in @table for flow @fl6, shared by the input
 * and output paths (they pass the flow's iif or oif as @oif).
 *
 * The first pass asks rt6_select() for a reachable router unless
 * forwarding is enabled in devconf_all.  If the lookup ends at "out"
 * (null entry or an RTF_CACHE route) while that requirement is still
 * active, it is dropped and the lookup repeated.
 *
 * A route that is neither the null entry nor already cached is copied
 * into a host cache entry (rt6_alloc_cow() or rt6_alloc_clone()) and
 * inserted with ip6_ins_rt().  The table lock is dropped for that, so on
 * insertion failure the whole lookup is retried, up to three attempts.
 *
 * The route returned has had dst_hold() applied and its lastuse/__use
 * updated.  It is the null entry when nothing usable was found or the
 * copy could not be allocated.
 */
762 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
763                                       struct flowi6 *fl6, int flags)
764 {
765         struct fib6_node *fn;
766         struct rt6_info *rt, *nrt;
767         int strict = 0;
768         int attempts = 3;
769         int err;
770         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
771
772         strict |= flags & RT6_LOOKUP_F_IFACE;
773
/* Retry point after a failed insert; the table lock is not held here. */
774 relookup:
775         read_lock_bh(&table->tb6_lock);
776
/* Retry point with the lock held, after dropping the reachability rule. */
777 restart_2:
778         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
779
/* BACKTRACK jumps here after moving fn to another node. */
780 restart:
781         rt = rt6_select(fn, oif, strict | reachable);
782
783         BACKTRACK(net, &fl6->saddr);
784         if (rt == net->ipv6.ip6_null_entry ||
785             rt->rt6i_flags & RTF_CACHE)
786                 goto out;
787
/* Pin the selected route so it survives dropping the table lock. */
788         dst_hold(&rt->dst);
789         read_unlock_bh(&table->tb6_lock);
790
791         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
792                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
793         else if (!(rt->dst.flags & DST_HOST))
794                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
795         else
796                 goto out2;
797
/* Switch from the original route to the copy (or the null entry). */
798         dst_release(&rt->dst);
799         rt = nrt ? : net->ipv6.ip6_null_entry;
800
801         dst_hold(&rt->dst);
802         if (nrt) {
803                 err = ip6_ins_rt(nrt);
804                 if (!err)
805                         goto out2;
806         }
807
808         if (--attempts <= 0)
809                 goto out2;
810
811         /*
812          * Race condition! In the gap, when table->tb6_lock was
813          * released someone could insert this route.  Relookup.
814          */
815         dst_release(&rt->dst);
816         goto relookup;
817
/* Reached with the table lock held and no reference taken on rt yet. */
818 out:
819         if (reachable) {
820                 reachable = 0;
821                 goto restart_2;
822         }
823         dst_hold(&rt->dst);
824         read_unlock_bh(&table->tb6_lock);
/* Reached with the lock released and a reference held on rt. */
825 out2:
826         rt->dst.lastuse = jiffies;
827         rt->dst.__use++;
828
829         return rt;
830 }
831
832 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
833                                             struct flowi6 *fl6, int flags)
834 {
835         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
836 }
837
838 void ip6_route_input(struct sk_buff *skb)
839 {
840         struct ipv6hdr *iph = ipv6_hdr(skb);
841         struct net *net = dev_net(skb->dev);
842         int flags = RT6_LOOKUP_F_HAS_SADDR;
843         struct flowi6 fl6 = {
844                 .flowi6_iif = skb->dev->ifindex,
845                 .daddr = iph->daddr,
846                 .saddr = iph->saddr,
847                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
848                 .flowi6_mark = skb->mark,
849                 .flowi6_proto = iph->nexthdr,
850         };
851
852         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
853                 flags |= RT6_LOOKUP_F_IFACE;
854
855         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
856 }
857
858 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
859                                              struct flowi6 *fl6, int flags)
860 {
861         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
862 }
863
864 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
865                                     struct flowi6 *fl6)
866 {
867         int flags = 0;
868
869         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
870                 flags |= RT6_LOOKUP_F_IFACE;
871
872         if (!ipv6_addr_any(&fl6->saddr))
873                 flags |= RT6_LOOKUP_F_HAS_SADDR;
874         else if (sk)
875                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
876
877         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
878 }
879
880 EXPORT_SYMBOL(ip6_route_output);
881
882 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
883 {
884         struct rt6_info *rt = dst_alloc(&ip6_dst_blackhole_ops, 1);
885         struct rt6_info *ort = (struct rt6_info *) dst_orig;
886         struct dst_entry *new = NULL;
887
888         if (rt) {
889                 new = &rt->dst;
890
891                 new->__use = 1;
892                 new->input = dst_discard;
893                 new->output = dst_discard;
894
895                 dst_copy_metrics(new, &ort->dst);
896                 new->dev = ort->dst.dev;
897                 if (new->dev)
898                         dev_hold(new->dev);
899                 rt->rt6i_idev = ort->rt6i_idev;
900                 if (rt->rt6i_idev)
901                         in6_dev_hold(rt->rt6i_idev);
902                 rt->rt6i_expires = 0;
903
904                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
905                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
906                 rt->rt6i_metric = 0;
907
908                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
909 #ifdef CONFIG_IPV6_SUBTREES
910                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
911 #endif
912
913                 dst_free(new);
914         }
915
916         dst_release(dst_orig);
917         return new ? new : ERR_PTR(-ENOMEM);
918 }
919
920 /*
921  *      Destination cache support functions
922  */
923
924 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
925 {
926         struct rt6_info *rt;
927
928         rt = (struct rt6_info *) dst;
929
930         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
931                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
932                         if (!rt->rt6i_peer)
933                                 rt6_bind_peer(rt, 0);
934                         rt->rt6i_peer_genid = rt6_peer_genid();
935                 }
936                 return dst;
937         }
938         return NULL;
939 }
940
941 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
942 {
943         struct rt6_info *rt = (struct rt6_info *) dst;
944
945         if (rt) {
946                 if (rt->rt6i_flags & RTF_CACHE) {
947                         if (rt6_check_expired(rt)) {
948                                 ip6_del_rt(rt);
949                                 dst = NULL;
950                         }
951                 } else {
952                         dst_release(dst);
953                         dst = NULL;
954                 }
955         }
956         return dst;
957 }
958
959 static void ip6_link_failure(struct sk_buff *skb)
960 {
961         struct rt6_info *rt;
962
963         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
964
965         rt = (struct rt6_info *) skb_dst(skb);
966         if (rt) {
967                 if (rt->rt6i_flags&RTF_CACHE) {
968                         dst_set_expires(&rt->dst, 0);
969                         rt->rt6i_flags |= RTF_EXPIRES;
970                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
971                         rt->rt6i_node->fn_sernum = -1;
972         }
973 }
974
975 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
976 {
977         struct rt6_info *rt6 = (struct rt6_info*)dst;
978
979         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
980                 rt6->rt6i_flags |= RTF_MODIFIED;
981                 if (mtu < IPV6_MIN_MTU) {
982                         u32 features = dst_metric(dst, RTAX_FEATURES);
983                         mtu = IPV6_MIN_MTU;
984                         features |= RTAX_FEATURE_ALLFRAG;
985                         dst_metric_set(dst, RTAX_FEATURES, features);
986                 }
987                 dst_metric_set(dst, RTAX_MTU, mtu);
988         }
989 }
990
991 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
992 {
993         struct net_device *dev = dst->dev;
994         unsigned int mtu = dst_mtu(dst);
995         struct net *net = dev_net(dev);
996
997         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
998
999         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1000                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1001
1002         /*
1003          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1004          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1005          * IPV6_MAXPLEN is also valid and means: "any MSS,
1006          * rely only on pmtu discovery"
1007          */
1008         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1009                 mtu = IPV6_MAXPLEN;
1010         return mtu;
1011 }
1012
1013 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1014 {
1015         unsigned int mtu = IPV6_MIN_MTU;
1016         struct inet6_dev *idev;
1017
1018         rcu_read_lock();
1019         idev = __in6_dev_get(dst->dev);
1020         if (idev)
1021                 mtu = idev->cnf.mtu6;
1022         rcu_read_unlock();
1023
1024         return mtu;
1025 }
1026
/*
 * Head of a singly linked list (chained through dst->next) holding the
 * entries created by icmp6_dst_alloc().  icmp6_dst_gc() and
 * icmp6_clean_all() walk it and unlink entries.
 */
static struct dst_entry *icmp6_dst_gc_list;
/*
 * Held (with bottom halves disabled) around the list accesses in
 * icmp6_dst_alloc(), icmp6_dst_gc() and icmp6_clean_all().
 */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1029
1030 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1031                                   struct neighbour *neigh,
1032                                   const struct in6_addr *addr)
1033 {
1034         struct rt6_info *rt;
1035         struct inet6_dev *idev = in6_dev_get(dev);
1036         struct net *net = dev_net(dev);
1037
1038         if (unlikely(idev == NULL))
1039                 return NULL;
1040
1041         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1042         if (unlikely(rt == NULL)) {
1043                 in6_dev_put(idev);
1044                 goto out;
1045         }
1046
1047         dev_hold(dev);
1048         if (neigh)
1049                 neigh_hold(neigh);
1050         else {
1051                 neigh = ndisc_get_neigh(dev, addr);
1052                 if (IS_ERR(neigh))
1053                         neigh = NULL;
1054         }
1055
1056         rt->rt6i_dev      = dev;
1057         rt->rt6i_idev     = idev;
1058         rt->rt6i_nexthop  = neigh;
1059         atomic_set(&rt->dst.__refcnt, 1);
1060         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1061         rt->dst.output  = ip6_output;
1062
1063 #if 0   /* there's no chance to use these for ndisc */
1064         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1065                                 ? DST_HOST
1066                                 : 0;
1067         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1068         rt->rt6i_dst.plen = 128;
1069 #endif
1070
1071         spin_lock_bh(&icmp6_dst_lock);
1072         rt->dst.next = icmp6_dst_gc_list;
1073         icmp6_dst_gc_list = &rt->dst;
1074         spin_unlock_bh(&icmp6_dst_lock);
1075
1076         fib6_force_start_gc(net);
1077
1078 out:
1079         return &rt->dst;
1080 }
1081
1082 int icmp6_dst_gc(void)
1083 {
1084         struct dst_entry *dst, **pprev;
1085         int more = 0;
1086
1087         spin_lock_bh(&icmp6_dst_lock);
1088         pprev = &icmp6_dst_gc_list;
1089
1090         while ((dst = *pprev) != NULL) {
1091                 if (!atomic_read(&dst->__refcnt)) {
1092                         *pprev = dst->next;
1093                         dst_free(dst);
1094                 } else {
1095                         pprev = &dst->next;
1096                         ++more;
1097                 }
1098         }
1099
1100         spin_unlock_bh(&icmp6_dst_lock);
1101
1102         return more;
1103 }
1104
1105 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1106                             void *arg)
1107 {
1108         struct dst_entry *dst, **pprev;
1109
1110         spin_lock_bh(&icmp6_dst_lock);
1111         pprev = &icmp6_dst_gc_list;
1112         while ((dst = *pprev) != NULL) {
1113                 struct rt6_info *rt = (struct rt6_info *) dst;
1114                 if (func(rt, arg)) {
1115                         *pprev = dst->next;
1116                         dst_free(dst);
1117                 } else {
1118                         pprev = &dst->next;
1119                 }
1120         }
1121         spin_unlock_bh(&icmp6_dst_lock);
1122 }
1123
1124 static int ip6_dst_gc(struct dst_ops *ops)
1125 {
1126         unsigned long now = jiffies;
1127         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1128         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1129         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1130         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1131         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1132         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1133         int entries;
1134
1135         entries = dst_entries_get_fast(ops);
1136         if (time_after(rt_last_gc + rt_min_interval, now) &&
1137             entries <= rt_max_size)
1138                 goto out;
1139
1140         net->ipv6.ip6_rt_gc_expire++;
1141         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1142         net->ipv6.ip6_rt_last_gc = now;
1143         entries = dst_entries_get_slow(ops);
1144         if (entries < ops->gc_thresh)
1145                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1146 out:
1147         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1148         return entries > rt_max_size;
1149 }
1150
1151 /* Clean host part of a prefix. Not necessary in radix tree,
1152    but results in cleaner routing tables.
1153
1154    Remove it only when all the things will work!
1155  */
1156
1157 int ip6_dst_hoplimit(struct dst_entry *dst)
1158 {
1159         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1160         if (hoplimit == 0) {
1161                 struct net_device *dev = dst->dev;
1162                 struct inet6_dev *idev;
1163
1164                 rcu_read_lock();
1165                 idev = __in6_dev_get(dev);
1166                 if (idev)
1167                         hoplimit = idev->cnf.hop_limit;
1168                 else
1169                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1170                 rcu_read_unlock();
1171         }
1172         return hoplimit;
1173 }
1174 EXPORT_SYMBOL(ip6_dst_hoplimit);
1175
1176 /*
1177  *
1178  */
1179
/*
 * ip6_route_add - build a route from a fib6_config and insert it
 * @cfg: route description; fc_metric, fc_protocol and fc_nlinfo.nl_net may
 *       be rewritten by this function
 *
 * Validates the configuration, allocates an rt6_info, fills it in and hands
 * it to __ip6_ins_rt().  Device and inet6_dev references taken along the way
 * are stored in the route on the success path and dropped again at "out" on
 * every error path.
 *
 * Returns the result of __ip6_ins_rt(), or a negative errno: -EINVAL for
 * bad prefix lengths, gateways or metric types, -ENODEV when no usable
 * device/inet6_dev exists, -ENOBUFS when the table cannot be obtained,
 * -ENOMEM when the route cannot be allocated, -EHOSTUNREACH when a
 * non-link-local gateway cannot be resolved appropriately, or the error
 * from the neighbour lookup.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* Source prefixes are only accepted when subtrees are compiled in. */
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	if (cfg->fc_ifindex) {
		/* Takes a reference on both the device and its inet6_dev. */
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	/* A zero metric means "not specified": use the user default. */
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	table = fib6_new_table(net, cfg->fc_table);
	if (table == NULL) {
		err = -ENOBUFS;
		goto out;
	}

	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);

	if (rt == NULL) {
		err = -ENOMEM;
		goto out;
	}

	/* NOTE(review): -1 presumably forces validation via dst check -- confirm. */
	rt->dst.obsolete = -1;
	/* fc_expires is in clock_t units and relative to now. */
	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
				0;

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* Pick the input handler from the destination type and RTF_LOCAL. */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags = DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
					      && !(cfg->fc_flags&RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			/* Swap the references we hold for ones on loopback. */
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		rt->dst.error = -ENETUNREACH;
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
			if (!(gwa_type&IPV6_ADDR_UNICAST))
				goto out;

			/* Find the route currently used to reach the gateway. */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (grt == NULL)
				goto out;
			if (dev) {
				/* The gateway must be reachable via the requested device. */
				if (dev != grt->rt6i_dev) {
					dst_release(&grt->dst);
					goto out;
				}
			} else {
				/* No device given: adopt the one the gateway route uses. */
				dev = grt->rt6i_dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/* Accept only if the gateway route is not itself gatewayed. */
			if (!(grt->rt6i_flags&RTF_GATEWAY))
				err = 0;
			dst_release(&grt->dst);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (dev == NULL)
		goto out;

	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
		rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(rt->rt6i_nexthop)) {
			err = PTR_ERR(rt->rt6i_nexthop);
			/* Do not leave an error pointer behind for dst_free(). */
			rt->rt6i_nexthop = NULL;
			goto out;
		}
	}

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		/* Copy user supplied metrics; attribute type 0 is skipped. */
		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX) {
					err = -EINVAL;
					goto out;
				}

				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
			}
		}
	}

	/* From here on the route owns the dev/idev references. */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
	/* Error path: undo whatever was acquired so far. */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}
1382
1383 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1384 {
1385         int err;
1386         struct fib6_table *table;
1387         struct net *net = dev_net(rt->rt6i_dev);
1388
1389         if (rt == net->ipv6.ip6_null_entry)
1390                 return -ENOENT;
1391
1392         table = rt->rt6i_table;
1393         write_lock_bh(&table->tb6_lock);
1394
1395         err = fib6_del(rt, info);
1396         dst_release(&rt->dst);
1397
1398         write_unlock_bh(&table->tb6_lock);
1399
1400         return err;
1401 }
1402
1403 int ip6_del_rt(struct rt6_info *rt)
1404 {
1405         struct nl_info info = {
1406                 .nl_net = dev_net(rt->rt6i_dev),
1407         };
1408         return __ip6_del_rt(rt, &info);
1409 }
1410
1411 static int ip6_route_del(struct fib6_config *cfg)
1412 {
1413         struct fib6_table *table;
1414         struct fib6_node *fn;
1415         struct rt6_info *rt;
1416         int err = -ESRCH;
1417
1418         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1419         if (table == NULL)
1420                 return err;
1421
1422         read_lock_bh(&table->tb6_lock);
1423
1424         fn = fib6_locate(&table->tb6_root,
1425                          &cfg->fc_dst, cfg->fc_dst_len,
1426                          &cfg->fc_src, cfg->fc_src_len);
1427
1428         if (fn) {
1429                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1430                         if (cfg->fc_ifindex &&
1431                             (rt->rt6i_dev == NULL ||
1432                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1433                                 continue;
1434                         if (cfg->fc_flags & RTF_GATEWAY &&
1435                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1436                                 continue;
1437                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1438                                 continue;
1439                         dst_hold(&rt->dst);
1440                         read_unlock_bh(&table->tb6_lock);
1441
1442                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1443                 }
1444         }
1445         read_unlock_bh(&table->tb6_lock);
1446
1447         return err;
1448 }
1449
1450 /*
1451  *      Handle redirects
1452  */
/*
 * Lookup key for redirect validation: a flowi6 extended with the gateway
 * address a route must use to match.  fl6 has to remain the first member,
 * because __ip6_route_redirect() receives a pointer to it and casts that
 * pointer back to struct ip6rd_flowi.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* flow key handed to fib6_rule_lookup() */
	struct in6_addr gateway;	/* gateway compared against rt6i_gateway */
};
1457
1458 static struct rt6_info *__ip6_route_redirect(struct net *net,
1459                                              struct fib6_table *table,
1460                                              struct flowi6 *fl6,
1461                                              int flags)
1462 {
1463         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1464         struct rt6_info *rt;
1465         struct fib6_node *fn;
1466
1467         /*
1468          * Get the "current" route for this destination and
1469          * check if the redirect has come from approriate router.
1470          *
1471          * RFC 2461 specifies that redirects should only be
1472          * accepted if they come from the nexthop to the target.
1473          * Due to the way the routes are chosen, this notion
1474          * is a bit fuzzy and one might need to check all possible
1475          * routes.
1476          */
1477
1478         read_lock_bh(&table->tb6_lock);
1479         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1480 restart:
1481         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1482                 /*
1483                  * Current route is on-link; redirect is always invalid.
1484                  *
1485                  * Seems, previous statement is not true. It could
1486                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1487                  * But then router serving it might decide, that we should
1488                  * know truth 8)8) --ANK (980726).
1489                  */
1490                 if (rt6_check_expired(rt))
1491                         continue;
1492                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1493                         continue;
1494                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1495                         continue;
1496                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1497                         continue;
1498                 break;
1499         }
1500
1501         if (!rt)
1502                 rt = net->ipv6.ip6_null_entry;
1503         BACKTRACK(net, &fl6->saddr);
1504 out:
1505         dst_hold(&rt->dst);
1506
1507         read_unlock_bh(&table->tb6_lock);
1508
1509         return rt;
1510 };
1511
1512 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1513                                            struct in6_addr *src,
1514                                            struct in6_addr *gateway,
1515                                            struct net_device *dev)
1516 {
1517         int flags = RT6_LOOKUP_F_HAS_SADDR;
1518         struct net *net = dev_net(dev);
1519         struct ip6rd_flowi rdfl = {
1520                 .fl6 = {
1521                         .flowi6_oif = dev->ifindex,
1522                         .daddr = *dest,
1523                         .saddr = *src,
1524                 },
1525         };
1526
1527         ipv6_addr_copy(&rdfl.gateway, gateway);
1528
1529         if (rt6_need_strict(dest))
1530                 flags |= RT6_LOOKUP_F_IFACE;
1531
1532         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1533                                                    flags, __ip6_route_redirect);
1534 }
1535
1536 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1537                   struct in6_addr *saddr,
1538                   struct neighbour *neigh, u8 *lladdr, int on_link)
1539 {
1540         struct rt6_info *rt, *nrt = NULL;
1541         struct netevent_redirect netevent;
1542         struct net *net = dev_net(neigh->dev);
1543
1544         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1545
1546         if (rt == net->ipv6.ip6_null_entry) {
1547                 if (net_ratelimit())
1548                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1549                                "for redirect target\n");
1550                 goto out;
1551         }
1552
1553         /*
1554          *      We have finally decided to accept it.
1555          */
1556
1557         neigh_update(neigh, lladdr, NUD_STALE,
1558                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1559                      NEIGH_UPDATE_F_OVERRIDE|
1560                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1561                                      NEIGH_UPDATE_F_ISROUTER))
1562                      );
1563
1564         /*
1565          * Redirect received -> path was valid.
1566          * Look, redirects are sent only in response to data packets,
1567          * so that this nexthop apparently is reachable. --ANK
1568          */
1569         dst_confirm(&rt->dst);
1570
1571         /* Duplicate redirect: silently ignore. */
1572         if (neigh == rt->dst.neighbour)
1573                 goto out;
1574
1575         nrt = ip6_rt_copy(rt);
1576         if (nrt == NULL)
1577                 goto out;
1578
1579         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1580         if (on_link)
1581                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1582
1583         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1584         nrt->rt6i_dst.plen = 128;
1585         nrt->dst.flags |= DST_HOST;
1586
1587         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1588         nrt->rt6i_nexthop = neigh_clone(neigh);
1589
1590         if (ip6_ins_rt(nrt))
1591                 goto out;
1592
1593         netevent.old = &rt->dst;
1594         netevent.new = &nrt->dst;
1595         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1596
1597         if (rt->rt6i_flags&RTF_CACHE) {
1598                 ip6_del_rt(rt);
1599                 return;
1600         }
1601
1602 out:
1603         dst_release(&rt->dst);
1604 }
1605
1606 /*
1607  *      Handle ICMP "packet too big" messages
1608  *      i.e. Path MTU discovery
1609  */
1610
1611 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1612                              struct net *net, u32 pmtu, int ifindex)
1613 {
1614         struct rt6_info *rt, *nrt;
1615         int allfrag = 0;
1616 again:
1617         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1618         if (rt == NULL)
1619                 return;
1620
1621         if (rt6_check_expired(rt)) {
1622                 ip6_del_rt(rt);
1623                 goto again;
1624         }
1625
1626         if (pmtu >= dst_mtu(&rt->dst))
1627                 goto out;
1628
1629         if (pmtu < IPV6_MIN_MTU) {
1630                 /*
1631                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1632                  * MTU (1280) and a fragment header should always be included
1633                  * after a node receiving Too Big message reporting PMTU is
1634                  * less than the IPv6 Minimum Link MTU.
1635                  */
1636                 pmtu = IPV6_MIN_MTU;
1637                 allfrag = 1;
1638         }
1639
1640         /* New mtu received -> path was valid.
1641            They are sent only in response to data packets,
1642            so that this nexthop apparently is reachable. --ANK
1643          */
1644         dst_confirm(&rt->dst);
1645
1646         /* Host route. If it is static, it would be better
1647            not to override it, but add new one, so that
1648            when cache entry will expire old pmtu
1649            would return automatically.
1650          */
1651         if (rt->rt6i_flags & RTF_CACHE) {
1652                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1653                 if (allfrag) {
1654                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1655                         features |= RTAX_FEATURE_ALLFRAG;
1656                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1657                 }
1658                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1659                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1660                 goto out;
1661         }
1662
1663         /* Network route.
1664            Two cases are possible:
1665            1. It is connected route. Action: COW
1666            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1667          */
1668         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1669                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1670         else
1671                 nrt = rt6_alloc_clone(rt, daddr);
1672
1673         if (nrt) {
1674                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1675                 if (allfrag) {
1676                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1677                         features |= RTAX_FEATURE_ALLFRAG;
1678                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1679                 }
1680
1681                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1682                  * happened within 5 mins, the recommended timer is 10 mins.
1683                  * Here this route expiration time is set to ip6_rt_mtu_expires
1684                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1685                  * and detecting PMTU increase will be automatically happened.
1686                  */
1687                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1688                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1689
1690                 ip6_ins_rt(nrt);
1691         }
1692 out:
1693         dst_release(&rt->dst);
1694 }
1695
1696 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1697                         struct net_device *dev, u32 pmtu)
1698 {
1699         struct net *net = dev_net(dev);
1700
1701         /*
1702          * RFC 1981 states that a node "MUST reduce the size of the packets it
1703          * is sending along the path" that caused the Packet Too Big message.
1704          * Since it's not possible in the general case to determine which
1705          * interface was used to send the original packet, we update the MTU
1706          * on the interface that will be used to send future packets. We also
1707          * update the MTU on the interface that received the Packet Too Big in
1708          * case the original packet was forced out that interface with
1709          * SO_BINDTODEVICE or similar. This is the next best thing to the
1710          * correct behaviour, which would be to update the MTU on all
1711          * interfaces.
1712          */
1713         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1714         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1715 }
1716
1717 /*
1718  *      Misc support functions
1719  */
1720
1721 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1722 {
1723         struct net *net = dev_net(ort->rt6i_dev);
1724         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1725
1726         if (rt) {
1727                 rt->dst.input = ort->dst.input;
1728                 rt->dst.output = ort->dst.output;
1729
1730                 dst_copy_metrics(&rt->dst, &ort->dst);
1731                 rt->dst.error = ort->dst.error;
1732                 rt->dst.dev = ort->dst.dev;
1733                 if (rt->dst.dev)
1734                         dev_hold(rt->dst.dev);
1735                 rt->rt6i_idev = ort->rt6i_idev;
1736                 if (rt->rt6i_idev)
1737                         in6_dev_hold(rt->rt6i_idev);
1738                 rt->dst.lastuse = jiffies;
1739                 rt->rt6i_expires = 0;
1740
1741                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1742                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1743                 rt->rt6i_metric = 0;
1744
1745                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1746 #ifdef CONFIG_IPV6_SUBTREES
1747                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1748 #endif
1749                 rt->rt6i_table = ort->rt6i_table;
1750         }
1751         return rt;
1752 }
1753
1754 #ifdef CONFIG_IPV6_ROUTE_INFO
1755 static struct rt6_info *rt6_get_route_info(struct net *net,
1756                                            struct in6_addr *prefix, int prefixlen,
1757                                            struct in6_addr *gwaddr, int ifindex)
1758 {
1759         struct fib6_node *fn;
1760         struct rt6_info *rt = NULL;
1761         struct fib6_table *table;
1762
1763         table = fib6_get_table(net, RT6_TABLE_INFO);
1764         if (table == NULL)
1765                 return NULL;
1766
1767         write_lock_bh(&table->tb6_lock);
1768         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1769         if (!fn)
1770                 goto out;
1771
1772         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1773                 if (rt->rt6i_dev->ifindex != ifindex)
1774                         continue;
1775                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1776                         continue;
1777                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1778                         continue;
1779                 dst_hold(&rt->dst);
1780                 break;
1781         }
1782 out:
1783         write_unlock_bh(&table->tb6_lock);
1784         return rt;
1785 }
1786
1787 static struct rt6_info *rt6_add_route_info(struct net *net,
1788                                            struct in6_addr *prefix, int prefixlen,
1789                                            struct in6_addr *gwaddr, int ifindex,
1790                                            unsigned pref)
1791 {
1792         struct fib6_config cfg = {
1793                 .fc_table       = RT6_TABLE_INFO,
1794                 .fc_metric      = IP6_RT_PRIO_USER,
1795                 .fc_ifindex     = ifindex,
1796                 .fc_dst_len     = prefixlen,
1797                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1798                                   RTF_UP | RTF_PREF(pref),
1799                 .fc_nlinfo.pid = 0,
1800                 .fc_nlinfo.nlh = NULL,
1801                 .fc_nlinfo.nl_net = net,
1802         };
1803
1804         ipv6_addr_copy(&cfg.fc_dst, prefix);
1805         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1806
1807         /* We should treat it as a default route if prefix length is 0. */
1808         if (!prefixlen)
1809                 cfg.fc_flags |= RTF_DEFAULT;
1810
1811         ip6_route_add(&cfg);
1812
1813         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1814 }
1815 #endif
1816
1817 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1818 {
1819         struct rt6_info *rt;
1820         struct fib6_table *table;
1821
1822         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1823         if (table == NULL)
1824                 return NULL;
1825
1826         write_lock_bh(&table->tb6_lock);
1827         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1828                 if (dev == rt->rt6i_dev &&
1829                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1830                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1831                         break;
1832         }
1833         if (rt)
1834                 dst_hold(&rt->dst);
1835         write_unlock_bh(&table->tb6_lock);
1836         return rt;
1837 }
1838
1839 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1840                                      struct net_device *dev,
1841                                      unsigned int pref)
1842 {
1843         struct fib6_config cfg = {
1844                 .fc_table       = RT6_TABLE_DFLT,
1845                 .fc_metric      = IP6_RT_PRIO_USER,
1846                 .fc_ifindex     = dev->ifindex,
1847                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1848                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1849                 .fc_nlinfo.pid = 0,
1850                 .fc_nlinfo.nlh = NULL,
1851                 .fc_nlinfo.nl_net = dev_net(dev),
1852         };
1853
1854         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1855
1856         ip6_route_add(&cfg);
1857
1858         return rt6_get_dflt_router(gwaddr, dev);
1859 }
1860
1861 void rt6_purge_dflt_routers(struct net *net)
1862 {
1863         struct rt6_info *rt;
1864         struct fib6_table *table;
1865
1866         /* NOTE: Keep consistent with rt6_get_dflt_router */
1867         table = fib6_get_table(net, RT6_TABLE_DFLT);
1868         if (table == NULL)
1869                 return;
1870
1871 restart:
1872         read_lock_bh(&table->tb6_lock);
1873         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1874                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1875                         dst_hold(&rt->dst);
1876                         read_unlock_bh(&table->tb6_lock);
1877                         ip6_del_rt(rt);
1878                         goto restart;
1879                 }
1880         }
1881         read_unlock_bh(&table->tb6_lock);
1882 }
1883
1884 static void rtmsg_to_fib6_config(struct net *net,
1885                                  struct in6_rtmsg *rtmsg,
1886                                  struct fib6_config *cfg)
1887 {
1888         memset(cfg, 0, sizeof(*cfg));
1889
1890         cfg->fc_table = RT6_TABLE_MAIN;
1891         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1892         cfg->fc_metric = rtmsg->rtmsg_metric;
1893         cfg->fc_expires = rtmsg->rtmsg_info;
1894         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1895         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1896         cfg->fc_flags = rtmsg->rtmsg_flags;
1897
1898         cfg->fc_nlinfo.nl_net = net;
1899
1900         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1901         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1902         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1903 }
1904
1905 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1906 {
1907         struct fib6_config cfg;
1908         struct in6_rtmsg rtmsg;
1909         int err;
1910
1911         switch(cmd) {
1912         case SIOCADDRT:         /* Add a route */
1913         case SIOCDELRT:         /* Delete a route */
1914                 if (!capable(CAP_NET_ADMIN))
1915                         return -EPERM;
1916                 err = copy_from_user(&rtmsg, arg,
1917                                      sizeof(struct in6_rtmsg));
1918                 if (err)
1919                         return -EFAULT;
1920
1921                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1922
1923                 rtnl_lock();
1924                 switch (cmd) {
1925                 case SIOCADDRT:
1926                         err = ip6_route_add(&cfg);
1927                         break;
1928                 case SIOCDELRT:
1929                         err = ip6_route_del(&cfg);
1930                         break;
1931                 default:
1932                         err = -EINVAL;
1933                 }
1934                 rtnl_unlock();
1935
1936                 return err;
1937         }
1938
1939         return -EINVAL;
1940 }
1941
1942 /*
1943  *      Drop the packet on the floor
1944  */
1945
1946 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1947 {
1948         int type;
1949         struct dst_entry *dst = skb_dst(skb);
1950         switch (ipstats_mib_noroutes) {
1951         case IPSTATS_MIB_INNOROUTES:
1952                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1953                 if (type == IPV6_ADDR_ANY) {
1954                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1955                                       IPSTATS_MIB_INADDRERRORS);
1956                         break;
1957                 }
1958                 /* FALLTHROUGH */
1959         case IPSTATS_MIB_OUTNOROUTES:
1960                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1961                               ipstats_mib_noroutes);
1962                 break;
1963         }
1964         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1965         kfree_skb(skb);
1966         return 0;
1967 }
1968
1969 static int ip6_pkt_discard(struct sk_buff *skb)
1970 {
1971         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1972 }
1973
1974 static int ip6_pkt_discard_out(struct sk_buff *skb)
1975 {
1976         skb->dev = skb_dst(skb)->dev;
1977         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1978 }
1979
1980 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1981
1982 static int ip6_pkt_prohibit(struct sk_buff *skb)
1983 {
1984         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1985 }
1986
1987 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1988 {
1989         skb->dev = skb_dst(skb)->dev;
1990         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1991 }
1992
1993 #endif
1994
1995 /*
1996  *      Allocate a dst for local (unicast / anycast) address.
1997  */
1998
1999 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2000                                     const struct in6_addr *addr,
2001                                     int anycast)
2002 {
2003         struct net *net = dev_net(idev->dev);
2004         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
2005         struct neighbour *neigh;
2006
2007         if (rt == NULL) {
2008                 if (net_ratelimit())
2009                         pr_warning("IPv6:  Maximum number of routes reached,"
2010                                    " consider increasing route/max_size.\n");
2011                 return ERR_PTR(-ENOMEM);
2012         }
2013
2014         dev_hold(net->loopback_dev);
2015         in6_dev_hold(idev);
2016
2017         rt->dst.flags = DST_HOST;
2018         rt->dst.input = ip6_input;
2019         rt->dst.output = ip6_output;
2020         rt->rt6i_dev = net->loopback_dev;
2021         rt->rt6i_idev = idev;
2022         rt->dst.obsolete = -1;
2023
2024         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2025         if (anycast)
2026                 rt->rt6i_flags |= RTF_ANYCAST;
2027         else
2028                 rt->rt6i_flags |= RTF_LOCAL;
2029         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2030         if (IS_ERR(neigh)) {
2031                 dst_free(&rt->dst);
2032
2033                 return ERR_CAST(neigh);
2034         }
2035         rt->rt6i_nexthop = neigh;
2036
2037         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2038         rt->rt6i_dst.plen = 128;
2039         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2040
2041         atomic_set(&rt->dst.__refcnt, 1);
2042
2043         return rt;
2044 }
2045
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
        struct net_device       *dev;   /* device to match; NULL matches all */
        struct net              *net;   /* namespace owning ip6_null_entry */
};
2050
2051 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2052 {
2053         const struct arg_dev_net *adn = arg;
2054         const struct net_device *dev = adn->dev;
2055
2056         if ((rt->rt6i_dev == dev || dev == NULL) &&
2057             rt != adn->net->ipv6.ip6_null_entry) {
2058                 RT6_TRACE("deleted by ifdown %p\n", rt);
2059                 return -1;
2060         }
2061         return 0;
2062 }
2063
2064 void rt6_ifdown(struct net *net, struct net_device *dev)
2065 {
2066         struct arg_dev_net adn = {
2067                 .dev = dev,
2068                 .net = net,
2069         };
2070
2071         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2072         icmp6_clean_all(fib6_ifdown, &adn);
2073 }
2074
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
        struct net_device       *dev;   /* device whose MTU changed */
        unsigned                mtu;    /* the new MTU */
};
2080
2081 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2082 {
2083         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2084         struct inet6_dev *idev;
2085
2086         /* In IPv6 pmtu discovery is not optional,
2087            so that RTAX_MTU lock cannot disable it.
2088            We still use this lock to block changes
2089            caused by addrconf/ndisc.
2090         */
2091
2092         idev = __in6_dev_get(arg->dev);
2093         if (idev == NULL)
2094                 return 0;
2095
2096         /* For administrative MTU increase, there is no way to discover
2097            IPv6 PMTU increase, so PMTU increase should be updated here.
2098            Since RFC 1981 doesn't include administrative MTU increase
2099            update PMTU increase is a MUST. (i.e. jumbo frame)
2100          */
2101         /*
2102            If new MTU is less than route PMTU, this new MTU will be the
2103            lowest MTU in the path, update the route PMTU to reflect PMTU
2104            decreases; if new MTU is greater than route PMTU, and the
2105            old MTU is the lowest MTU in the path, update the route PMTU
2106            to reflect the increase. In this case if the other nodes' MTU
2107            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2108            PMTU discouvery.
2109          */
2110         if (rt->rt6i_dev == arg->dev &&
2111             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2112             (dst_mtu(&rt->dst) >= arg->mtu ||
2113              (dst_mtu(&rt->dst) < arg->mtu &&
2114               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2115                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2116         }
2117         return 0;
2118 }
2119
2120 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2121 {
2122         struct rt6_mtu_change_arg arg = {
2123                 .dev = dev,
2124                 .mtu = mtu,
2125         };
2126
2127         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2128 }
2129
2130 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2131         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2132         [RTA_OIF]               = { .type = NLA_U32 },
2133         [RTA_IIF]               = { .type = NLA_U32 },
2134         [RTA_PRIORITY]          = { .type = NLA_U32 },
2135         [RTA_METRICS]           = { .type = NLA_NESTED },
2136 };
2137
2138 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2139                               struct fib6_config *cfg)
2140 {
2141         struct rtmsg *rtm;
2142         struct nlattr *tb[RTA_MAX+1];
2143         int err;
2144
2145         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2146         if (err < 0)
2147                 goto errout;
2148
2149         err = -EINVAL;
2150         rtm = nlmsg_data(nlh);
2151         memset(cfg, 0, sizeof(*cfg));
2152
2153         cfg->fc_table = rtm->rtm_table;
2154         cfg->fc_dst_len = rtm->rtm_dst_len;
2155         cfg->fc_src_len = rtm->rtm_src_len;
2156         cfg->fc_flags = RTF_UP;
2157         cfg->fc_protocol = rtm->rtm_protocol;
2158
2159         if (rtm->rtm_type == RTN_UNREACHABLE)
2160                 cfg->fc_flags |= RTF_REJECT;
2161
2162         if (rtm->rtm_type == RTN_LOCAL)
2163                 cfg->fc_flags |= RTF_LOCAL;
2164
2165         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2166         cfg->fc_nlinfo.nlh = nlh;
2167         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2168
2169         if (tb[RTA_GATEWAY]) {
2170                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2171                 cfg->fc_flags |= RTF_GATEWAY;
2172         }
2173
2174         if (tb[RTA_DST]) {
2175                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2176
2177                 if (nla_len(tb[RTA_DST]) < plen)
2178                         goto errout;
2179
2180                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2181         }
2182
2183         if (tb[RTA_SRC]) {
2184                 int plen = (rtm->rtm_src_len + 7) >> 3;
2185
2186                 if (nla_len(tb[RTA_SRC]) < plen)
2187                         goto errout;
2188
2189                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2190         }
2191
2192         if (tb[RTA_OIF])
2193                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2194
2195         if (tb[RTA_PRIORITY])
2196                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2197
2198         if (tb[RTA_METRICS]) {
2199                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2200                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2201         }
2202
2203         if (tb[RTA_TABLE])
2204                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2205
2206         err = 0;
2207 errout:
2208         return err;
2209 }
2210
2211 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2212 {
2213         struct fib6_config cfg;
2214         int err;
2215
2216         err = rtm_to_fib6_config(skb, nlh, &cfg);
2217         if (err < 0)
2218                 return err;
2219
2220         return ip6_route_del(&cfg);
2221 }
2222
2223 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2224 {
2225         struct fib6_config cfg;
2226         int err;
2227
2228         err = rtm_to_fib6_config(skb, nlh, &cfg);
2229         if (err < 0)
2230                 return err;
2231
2232         return ip6_route_add(&cfg);
2233 }
2234
2235 static inline size_t rt6_nlmsg_size(void)
2236 {
2237         return NLMSG_ALIGN(sizeof(struct rtmsg))
2238                + nla_total_size(16) /* RTA_SRC */
2239                + nla_total_size(16) /* RTA_DST */
2240                + nla_total_size(16) /* RTA_GATEWAY */
2241                + nla_total_size(16) /* RTA_PREFSRC */
2242                + nla_total_size(4) /* RTA_TABLE */
2243                + nla_total_size(4) /* RTA_IIF */
2244                + nla_total_size(4) /* RTA_OIF */
2245                + nla_total_size(4) /* RTA_PRIORITY */
2246                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2247                + nla_total_size(sizeof(struct rta_cacheinfo));
2248 }
2249
2250 static int rt6_fill_node(struct net *net,
2251                          struct sk_buff *skb, struct rt6_info *rt,
2252                          struct in6_addr *dst, struct in6_addr *src,
2253                          int iif, int type, u32 pid, u32 seq,
2254                          int prefix, int nowait, unsigned int flags)
2255 {
2256         struct rtmsg *rtm;
2257         struct nlmsghdr *nlh;
2258         long expires;
2259         u32 table;
2260
2261         if (prefix) {   /* user wants prefix routes only */
2262                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2263                         /* success since this is not a prefix route */
2264                         return 1;
2265                 }
2266         }
2267
2268         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2269         if (nlh == NULL)
2270                 return -EMSGSIZE;
2271
2272         rtm = nlmsg_data(nlh);
2273         rtm->rtm_family = AF_INET6;
2274         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2275         rtm->rtm_src_len = rt->rt6i_src.plen;
2276         rtm->rtm_tos = 0;
2277         if (rt->rt6i_table)
2278                 table = rt->rt6i_table->tb6_id;
2279         else
2280                 table = RT6_TABLE_UNSPEC;
2281         rtm->rtm_table = table;
2282         NLA_PUT_U32(skb, RTA_TABLE, table);
2283         if (rt->rt6i_flags&RTF_REJECT)
2284                 rtm->rtm_type = RTN_UNREACHABLE;
2285         else if (rt->rt6i_flags&RTF_LOCAL)
2286                 rtm->rtm_type = RTN_LOCAL;
2287         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2288                 rtm->rtm_type = RTN_LOCAL;
2289         else
2290                 rtm->rtm_type = RTN_UNICAST;
2291         rtm->rtm_flags = 0;
2292         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2293         rtm->rtm_protocol = rt->rt6i_protocol;
2294         if (rt->rt6i_flags&RTF_DYNAMIC)
2295                 rtm->rtm_protocol = RTPROT_REDIRECT;
2296         else if (rt->rt6i_flags & RTF_ADDRCONF)
2297                 rtm->rtm_protocol = RTPROT_KERNEL;
2298         else if (rt->rt6i_flags&RTF_DEFAULT)
2299                 rtm->rtm_protocol = RTPROT_RA;
2300
2301         if (rt->rt6i_flags&RTF_CACHE)
2302                 rtm->rtm_flags |= RTM_F_CLONED;
2303
2304         if (dst) {
2305                 NLA_PUT(skb, RTA_DST, 16, dst);
2306                 rtm->rtm_dst_len = 128;
2307         } else if (rtm->rtm_dst_len)
2308                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2309 #ifdef CONFIG_IPV6_SUBTREES
2310         if (src) {
2311                 NLA_PUT(skb, RTA_SRC, 16, src);
2312                 rtm->rtm_src_len = 128;
2313         } else if (rtm->rtm_src_len)
2314                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2315 #endif
2316         if (iif) {
2317 #ifdef CONFIG_IPV6_MROUTE
2318                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2319                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2320                         if (err <= 0) {
2321                                 if (!nowait) {
2322                                         if (err == 0)
2323                                                 return 0;
2324                                         goto nla_put_failure;
2325                                 } else {
2326                                         if (err == -EMSGSIZE)
2327                                                 goto nla_put_failure;
2328                                 }
2329                         }
2330                 } else
2331 #endif
2332                         NLA_PUT_U32(skb, RTA_IIF, iif);
2333         } else if (dst) {
2334                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2335                 struct in6_addr saddr_buf;
2336                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2337                                        dst, 0, &saddr_buf) == 0)
2338                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2339         }
2340
2341         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2342                 goto nla_put_failure;
2343
2344         if (rt->dst.neighbour)
2345                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2346
2347         if (rt->dst.dev)
2348                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2349
2350         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2351
2352         if (!(rt->rt6i_flags & RTF_EXPIRES))
2353                 expires = 0;
2354         else if (rt->rt6i_expires - jiffies < INT_MAX)
2355                 expires = rt->rt6i_expires - jiffies;
2356         else
2357                 expires = INT_MAX;
2358
2359         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2360                                expires, rt->dst.error) < 0)
2361                 goto nla_put_failure;
2362
2363         return nlmsg_end(skb, nlh);
2364
2365 nla_put_failure:
2366         nlmsg_cancel(skb, nlh);
2367         return -EMSGSIZE;
2368 }
2369
2370 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2371 {
2372         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2373         int prefix;
2374
2375         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2376                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2377                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2378         } else
2379                 prefix = 0;
2380
2381         return rt6_fill_node(arg->net,
2382                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2383                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2384                      prefix, 0, NLM_F_MULTI);
2385 }
2386
2387 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2388 {
2389         struct net *net = sock_net(in_skb->sk);
2390         struct nlattr *tb[RTA_MAX+1];
2391         struct rt6_info *rt;
2392         struct sk_buff *skb;
2393         struct rtmsg *rtm;
2394         struct flowi6 fl6;
2395         int err, iif = 0;
2396
2397         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2398         if (err < 0)
2399                 goto errout;
2400
2401         err = -EINVAL;
2402         memset(&fl6, 0, sizeof(fl6));
2403
2404         if (tb[RTA_SRC]) {
2405                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2406                         goto errout;
2407
2408                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2409         }
2410
2411         if (tb[RTA_DST]) {
2412                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2413                         goto errout;
2414
2415                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2416         }
2417
2418         if (tb[RTA_IIF])
2419                 iif = nla_get_u32(tb[RTA_IIF]);
2420
2421         if (tb[RTA_OIF])
2422                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2423
2424         if (iif) {
2425                 struct net_device *dev;
2426                 dev = __dev_get_by_index(net, iif);
2427                 if (!dev) {
2428                         err = -ENODEV;
2429                         goto errout;
2430                 }
2431         }
2432
2433         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2434         if (skb == NULL) {
2435                 err = -ENOBUFS;
2436                 goto errout;
2437         }
2438
2439         /* Reserve room for dummy headers, this skb can pass
2440            through good chunk of routing engine.
2441          */
2442         skb_reset_mac_header(skb);
2443         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2444
2445         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2446         skb_dst_set(skb, &rt->dst);
2447
2448         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2449                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2450                             nlh->nlmsg_seq, 0, 0, 0);
2451         if (err < 0) {
2452                 kfree_skb(skb);
2453                 goto errout;
2454         }
2455
2456         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2457 errout:
2458         return err;
2459 }
2460
2461 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2462 {
2463         struct sk_buff *skb;
2464         struct net *net = info->nl_net;
2465         u32 seq;
2466         int err;
2467
2468         err = -ENOBUFS;
2469         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2470
2471         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2472         if (skb == NULL)
2473                 goto errout;
2474
2475         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2476                                 event, info->pid, seq, 0, 0, 0);
2477         if (err < 0) {
2478                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2479                 WARN_ON(err == -EMSGSIZE);
2480                 kfree_skb(skb);
2481                 goto errout;
2482         }
2483         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2484                     info->nlh, gfp_any());
2485         return;
2486 errout:
2487         if (err < 0)
2488                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2489 }
2490
2491 static int ip6_route_dev_notify(struct notifier_block *this,
2492                                 unsigned long event, void *data)
2493 {
2494         struct net_device *dev = (struct net_device *)data;
2495         struct net *net = dev_net(dev);
2496
2497         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2498                 net->ipv6.ip6_null_entry->dst.dev = dev;
2499                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2500 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2501                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2502                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2503                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2504                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2505 #endif
2506         }
2507
2508         return NOTIFY_OK;
2509 }
2510
2511 /*
2512  *      /proc
2513  */
2514
2515 #ifdef CONFIG_PROC_FS
2516
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file uses this
 * struct -- confirm whether it is still needed.
 */
struct rt6_proc_arg {
        char    *buffer;
        int     offset;
        int     length;
        int     skip;
        int     len;
};
2525
2526 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2527 {
2528         struct seq_file *m = p_arg;
2529
2530         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2531
2532 #ifdef CONFIG_IPV6_SUBTREES
2533         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2534 #else
2535         seq_puts(m, "00000000000000000000000000000000 00 ");
2536 #endif
2537
2538         if (rt->rt6i_nexthop) {
2539                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2540         } else {
2541                 seq_puts(m, "00000000000000000000000000000000");
2542         }
2543         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2544                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2545                    rt->dst.__use, rt->rt6i_flags,
2546                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2547         return 0;
2548 }
2549
2550 static int ipv6_route_show(struct seq_file *m, void *v)
2551 {
2552         struct net *net = (struct net *)m->private;
2553         fib6_clean_all(net, rt6_info_route, 0, m);
2554         return 0;
2555 }
2556
2557 static int ipv6_route_open(struct inode *inode, struct file *file)
2558 {
2559         return single_open_net(inode, file, ipv6_route_show);
2560 }
2561
2562 static const struct file_operations ipv6_route_proc_fops = {
2563         .owner          = THIS_MODULE,
2564         .open           = ipv6_route_open,
2565         .read           = seq_read,
2566         .llseek         = seq_lseek,
2567         .release        = single_release_net,
2568 };
2569
2570 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2571 {
2572         struct net *net = (struct net *)seq->private;
2573         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2574                    net->ipv6.rt6_stats->fib_nodes,
2575                    net->ipv6.rt6_stats->fib_route_nodes,
2576                    net->ipv6.rt6_stats->fib_rt_alloc,
2577                    net->ipv6.rt6_stats->fib_rt_entries,
2578                    net->ipv6.rt6_stats->fib_rt_cache,
2579                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2580                    net->ipv6.rt6_stats->fib_discarded_routes);
2581
2582         return 0;
2583 }
2584
2585 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2586 {
2587         return single_open_net(inode, file, rt6_stats_seq_show);
2588 }
2589
2590 static const struct file_operations rt6_stats_seq_fops = {
2591         .owner   = THIS_MODULE,
2592         .open    = rt6_stats_seq_open,
2593         .read    = seq_read,
2594         .llseek  = seq_lseek,
2595         .release = single_release_net,
2596 };
2597 #endif  /* CONFIG_PROC_FS */
2598
2599 #ifdef CONFIG_SYSCTL
2600
2601 static
2602 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2603                               void __user *buffer, size_t *lenp, loff_t *ppos)
2604 {
2605         struct net *net;
2606         int delay;
2607         if (!write)
2608                 return -EINVAL;
2609
2610         net = (struct net *)ctl->extra1;
2611         delay = net->ipv6.sysctl.flush_delay;
2612         proc_dointvec(ctl, write, buffer, lenp, ppos);
2613         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2614         return 0;
2615 }
2616
2617 ctl_table ipv6_route_table_template[] = {
2618         {
2619                 .procname       =       "flush",
2620                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2621                 .maxlen         =       sizeof(int),
2622                 .mode           =       0200,
2623                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2624         },
2625         {
2626                 .procname       =       "gc_thresh",
2627                 .data           =       &ip6_dst_ops_template.gc_thresh,
2628                 .maxlen         =       sizeof(int),
2629                 .mode           =       0644,
2630                 .proc_handler   =       proc_dointvec,
2631         },
2632         {
2633                 .procname       =       "max_size",
2634                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2635                 .maxlen         =       sizeof(int),
2636                 .mode           =       0644,
2637                 .proc_handler   =       proc_dointvec,
2638         },
2639         {
2640                 .procname       =       "gc_min_interval",
2641                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2642                 .maxlen         =       sizeof(int),
2643                 .mode           =       0644,
2644                 .proc_handler   =       proc_dointvec_jiffies,
2645         },
2646         {
2647                 .procname       =       "gc_timeout",
2648                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2649                 .maxlen         =       sizeof(int),
2650                 .mode           =       0644,
2651                 .proc_handler   =       proc_dointvec_jiffies,
2652         },
2653         {
2654                 .procname       =       "gc_interval",
2655                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2656                 .maxlen         =       sizeof(int),
2657                 .mode           =       0644,
2658                 .proc_handler   =       proc_dointvec_jiffies,
2659         },
2660         {
2661                 .procname       =       "gc_elasticity",
2662                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2663                 .maxlen         =       sizeof(int),
2664                 .mode           =       0644,
2665                 .proc_handler   =       proc_dointvec,
2666         },
2667         {
2668                 .procname       =       "mtu_expires",
2669                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2670                 .maxlen         =       sizeof(int),
2671                 .mode           =       0644,
2672                 .proc_handler   =       proc_dointvec_jiffies,
2673         },
2674         {
2675                 .procname       =       "min_adv_mss",
2676                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2677                 .maxlen         =       sizeof(int),
2678                 .mode           =       0644,
2679                 .proc_handler   =       proc_dointvec,
2680         },
2681         {
2682                 .procname       =       "gc_min_interval_ms",
2683                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2684                 .maxlen         =       sizeof(int),
2685                 .mode           =       0644,
2686                 .proc_handler   =       proc_dointvec_ms_jiffies,
2687         },
2688         { }
2689 };
2690
2691 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2692 {
2693         struct ctl_table *table;
2694
2695         table = kmemdup(ipv6_route_table_template,
2696                         sizeof(ipv6_route_table_template),
2697                         GFP_KERNEL);
2698
2699         if (table) {
2700                 table[0].data = &net->ipv6.sysctl.flush_delay;
2701                 table[0].extra1 = net;
2702                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2703                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2704                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2705                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2706                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2707                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2708                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2709                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2710                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2711         }
2712
2713         return table;
2714 }
2715 #endif
2716
2717 static int __net_init ip6_route_net_init(struct net *net)
2718 {
2719         int ret = -ENOMEM;
2720
2721         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2722                sizeof(net->ipv6.ip6_dst_ops));
2723
2724         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2725                 goto out_ip6_dst_ops;
2726
2727         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2728                                            sizeof(*net->ipv6.ip6_null_entry),
2729                                            GFP_KERNEL);
2730         if (!net->ipv6.ip6_null_entry)
2731                 goto out_ip6_dst_entries;
2732         net->ipv6.ip6_null_entry->dst.path =
2733                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2734         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2735         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2736                          ip6_template_metrics, true);
2737
2738 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2739         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2740                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2741                                                GFP_KERNEL);
2742         if (!net->ipv6.ip6_prohibit_entry)
2743                 goto out_ip6_null_entry;
2744         net->ipv6.ip6_prohibit_entry->dst.path =
2745                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2746         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2747         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2748                          ip6_template_metrics, true);
2749
2750         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2751                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2752                                                GFP_KERNEL);
2753         if (!net->ipv6.ip6_blk_hole_entry)
2754                 goto out_ip6_prohibit_entry;
2755         net->ipv6.ip6_blk_hole_entry->dst.path =
2756                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2757         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2758         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2759                          ip6_template_metrics, true);
2760 #endif
2761
2762         net->ipv6.sysctl.flush_delay = 0;
2763         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2764         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2765         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2766         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2767         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2768         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2769         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2770
2771 #ifdef CONFIG_PROC_FS
2772         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2773         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2774 #endif
2775         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2776
2777         ret = 0;
2778 out:
2779         return ret;
2780
2781 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2782 out_ip6_prohibit_entry:
2783         kfree(net->ipv6.ip6_prohibit_entry);
2784 out_ip6_null_entry:
2785         kfree(net->ipv6.ip6_null_entry);
2786 #endif
2787 out_ip6_dst_entries:
2788         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2789 out_ip6_dst_ops:
2790         goto out;
2791 }
2792
2793 static void __net_exit ip6_route_net_exit(struct net *net)
2794 {
2795 #ifdef CONFIG_PROC_FS
2796         proc_net_remove(net, "ipv6_route");
2797         proc_net_remove(net, "rt6_stats");
2798 #endif
2799         kfree(net->ipv6.ip6_null_entry);
2800 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2801         kfree(net->ipv6.ip6_prohibit_entry);
2802         kfree(net->ipv6.ip6_blk_hole_entry);
2803 #endif
2804         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2805 }
2806
2807 static struct pernet_operations ip6_route_net_ops = {
2808         .init = ip6_route_net_init,
2809         .exit = ip6_route_net_exit,
2810 };
2811
2812 static struct notifier_block ip6_route_dev_notifier = {
2813         .notifier_call = ip6_route_dev_notify,
2814         .priority = 0,
2815 };
2816
2817 int __init ip6_route_init(void)
2818 {
2819         int ret;
2820
2821         ret = -ENOMEM;
2822         ip6_dst_ops_template.kmem_cachep =
2823                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2824                                   SLAB_HWCACHE_ALIGN, NULL);
2825         if (!ip6_dst_ops_template.kmem_cachep)
2826                 goto out;
2827
2828         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2829         if (ret)
2830                 goto out_kmem_cache;
2831
2832         ret = register_pernet_subsys(&ip6_route_net_ops);
2833         if (ret)
2834                 goto out_dst_entries;
2835
2836         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2837
2838         /* Registering of the loopback is done before this portion of code,
2839          * the loopback reference in rt6_info will not be taken, do it
2840          * manually for init_net */
2841         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2842         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2843   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2844         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2845         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2846         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2847         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2848   #endif
2849         ret = fib6_init();
2850         if (ret)
2851                 goto out_register_subsys;
2852
2853         ret = xfrm6_init();
2854         if (ret)
2855                 goto out_fib6_init;
2856
2857         ret = fib6_rules_init();
2858         if (ret)
2859                 goto xfrm6_init;
2860
2861         ret = -ENOBUFS;
2862         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2863             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2864             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2865                 goto fib6_rules_init;
2866
2867         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2868         if (ret)
2869                 goto fib6_rules_init;
2870
2871 out:
2872         return ret;
2873
2874 fib6_rules_init:
2875         fib6_rules_cleanup();
2876 xfrm6_init:
2877         xfrm6_fini();
2878 out_fib6_init:
2879         fib6_gc_cleanup();
2880 out_register_subsys:
2881         unregister_pernet_subsys(&ip6_route_net_ops);
2882 out_dst_entries:
2883         dst_entries_destroy(&ip6_dst_blackhole_ops);
2884 out_kmem_cache:
2885         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2886         goto out;
2887 }
2888
2889 void ip6_route_cleanup(void)
2890 {
2891         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2892         fib6_rules_cleanup();
2893         xfrm6_fini();
2894         fib6_gc_cleanup();
2895         unregister_pernet_subsys(&ip6_route_net_ops);
2896         dst_entries_destroy(&ip6_dst_blackhole_ops);
2897         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2898 }