2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
65 /* Set to 3 to get tracing. */
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
73 #define RT6_TRACE(x...) do { ; } while (0)
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77 const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void ip6_dst_destroy(struct dst_entry *);
83 static void ip6_dst_ifdown(struct dst_entry *,
84 struct net_device *dev, int how);
85 static int ip6_dst_gc(struct dst_ops *ops);
87 static int ip6_pkt_discard(struct sk_buff *skb);
88 static int ip6_pkt_discard_out(struct sk_buff *skb);
89 static void ip6_link_failure(struct sk_buff *skb);
90 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94 const struct in6_addr *prefix, int prefixlen,
95 const struct in6_addr *gwaddr, int ifindex,
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98 const struct in6_addr *prefix, int prefixlen,
99 const struct in6_addr *gwaddr, int ifindex);
/*
 * dst_ops.cow_metrics: make this route's metrics writable by copying the
 * (read-only) template metrics into a per-peer block, then swinging
 * dst->_metrics over to it with cmpxchg to resolve races.
 * (Elided excerpt: the leading number on each line is the original file's
 * line number; intervening lines are missing from this view.)
 */
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
104 struct rt6_info *rt = (struct rt6_info *) dst;
105 struct inet_peer *peer;
108 if (!(rt->dst.flags & DST_HOST))
112 rt6_bind_peer(rt, 1); /* make sure rt->rt6i_peer exists (may create) */
114 peer = rt->rt6i_peer;
116 u32 *old_p = __DST_METRICS_PTR(old);
117 unsigned long prev, new;
120 if (inet_metrics_new(peer))
121 memcpy(p, old_p, sizeof(u32) * RTAX_MAX); /* seed from old metrics */
123 new = (unsigned long) p;
124 prev = cmpxchg(&dst->_metrics, old, new); /* lost race => keep winner's */
127 p = __DST_METRICS_PTR(prev);
128 if (prev & DST_METRICS_READ_ONLY)
/*
 * dst_ops.neigh_lookup: resolve @daddr through the IPv6 neighbour
 * discovery table (nd_tbl) on this dst's device. (Elided excerpt.)
 */
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
137 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
/*
 * Template dst_ops for the per-namespace IPv6 routing cache; copied into
 * each struct net at init (not visible here). (Elided excerpt.)
 */
140 static struct dst_ops ip6_dst_ops_template = {
142 .protocol = cpu_to_be16(ETH_P_IPV6),
145 .check = ip6_dst_check,
146 .default_advmss = ip6_default_advmss,
148 .cow_metrics = ipv6_cow_metrics,
149 .destroy = ip6_dst_destroy,
150 .ifdown = ip6_dst_ifdown,
151 .negative_advice = ip6_negative_advice,
152 .link_failure = ip6_link_failure,
153 .update_pmtu = ip6_rt_update_pmtu,
154 .local_out = __ip6_local_out,
155 .neigh_lookup = ip6_neigh_lookup,
/*
 * MTU for blackhole routes: the raw RTAX_MTU metric if set, otherwise
 * fall back to the device MTU. (Elided excerpt.)
 */
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
160 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
162 return mtu ? : dst->dev->mtu;
/*
 * Blackhole variants of update_pmtu and cow_metrics; bodies are not
 * visible in this elided excerpt (presumably no-ops — verify upstream).
 */
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/*
 * dst_ops for blackhole routes (created by ip6_blackhole_route); packet
 * paths are stubbed out while cache bookkeeping still works. (Elided
 * excerpt.)
 */
175 static struct dst_ops ip6_dst_blackhole_ops = {
177 .protocol = cpu_to_be16(ETH_P_IPV6),
178 .destroy = ip6_dst_destroy,
179 .check = ip6_dst_check,
180 .mtu = ip6_blackhole_mtu,
181 .default_advmss = ip6_default_advmss,
182 .update_pmtu = ip6_rt_blackhole_update_pmtu,
183 .cow_metrics = ip6_rt_blackhole_cow_metrics,
184 .neigh_lookup = ip6_neigh_lookup,
/* Read-only metrics shared by the template route entries below. */
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188 [RTAX_HOPLIMIT - 1] = 0,
/*
 * Template for the per-namespace null route: rejects everything with
 * -ENETUNREACH; used as the "no route" sentinel. (Elided excerpt.)
 */
191 static struct rt6_info ip6_null_entry_template = {
193 .__refcnt = ATOMIC_INIT(1),
196 .error = -ENETUNREACH,
197 .input = ip6_pkt_discard,
198 .output = ip6_pkt_discard_out,
200 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
201 .rt6i_protocol = RTPROT_KERNEL,
202 .rt6i_metric = ~(u32) 0, /* worst possible metric */
203 .rt6i_ref = ATOMIC_INIT(1),
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
208 static int ip6_pkt_prohibit(struct sk_buff *skb);
209 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
/*
 * Template for the per-namespace prohibit route (policy routing only,
 * CONFIG_IPV6_MULTIPLE_TABLES). (Elided excerpt.)
 */
211 static struct rt6_info ip6_prohibit_entry_template = {
213 .__refcnt = ATOMIC_INIT(1),
217 .input = ip6_pkt_prohibit,
218 .output = ip6_pkt_prohibit_out,
220 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
221 .rt6i_protocol = RTPROT_KERNEL,
222 .rt6i_metric = ~(u32) 0,
223 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Template for the per-namespace blackhole route: silently discards
 * traffic in both directions. (Elided excerpt.)
 */
226 static struct rt6_info ip6_blk_hole_entry_template = {
228 .__refcnt = ATOMIC_INIT(1),
232 .input = dst_discard,
233 .output = dst_discard,
235 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
236 .rt6i_protocol = RTPROT_KERNEL,
237 .rt6i_metric = ~(u32) 0,
238 .rt6i_ref = ATOMIC_INIT(1),
243 /* allocate dst with ip6_dst_ops */
/*
 * Allocate an rt6_info and zero everything after the embedded
 * struct dst_entry (rt6i_table is the first non-dst member). (Elided
 * excerpt.)
 */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245 struct net_device *dev,
248 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
251 memset(&rt->rt6i_table, 0,
252 sizeof(*rt) - sizeof(struct dst_entry));
/*
 * dst_ops.destroy: drop generic metrics (non-host routes), and release
 * the idev and inet_peer references held by this route. (Elided
 * excerpt.)
 */
257 static void ip6_dst_destroy(struct dst_entry *dst)
259 struct rt6_info *rt = (struct rt6_info *)dst;
260 struct inet6_dev *idev = rt->rt6i_idev;
261 struct inet_peer *peer = rt->rt6i_peer;
263 if (!(rt->dst.flags & DST_HOST))
264 dst_destroy_metrics_generic(dst);
267 rt->rt6i_idev = NULL; /* clear before dropping the reference */
271 rt->rt6i_peer = NULL;
/*
 * Generation counter for peer bindings; routes compare their cached
 * rt6i_peer_genid against this to detect stale peer state.
 */
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
278 static u32 rt6_peer_genid(void)
280 return atomic_read(&__rt6_peer_genid);
/*
 * Bind an inet_peer for this route's destination. cmpxchg guards
 * against a concurrent binder; on losing the race the fresh peer is
 * dropped (that path is elided from this view). (Elided excerpt.)
 */
283 void rt6_bind_peer(struct rt6_info *rt, int create)
285 struct inet_peer *peer;
287 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
291 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops.ifdown: when @dev goes away, re-home the route's idev onto the
 * namespace loopback device so the dst stays valid. (Elided excerpt;
 * the release of the old idev reference is not visible here.)
 */
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
297 struct rt6_info *rt = (struct rt6_info *)dst;
298 struct inet6_dev *idev = rt->rt6i_idev;
299 struct net_device *loopback_dev =
300 dev_net(dev)->loopback_dev;
302 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303 struct inet6_dev *loopback_idev =
304 in6_dev_get(loopback_dev);
305 if (loopback_idev != NULL) {
306 rt->rt6i_idev = loopback_idev;
/* True if the route carries RTF_EXPIRES and its expiry time has passed. */
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
314 return (rt->rt6i_flags & RTF_EXPIRES) &&
315 time_after(jiffies, rt->rt6i_expires);
/*
 * Destinations whose scope is bound to an interface (multicast,
 * link-local, loopback) require strict oif matching.
 */
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
320 return ipv6_addr_type(daddr) &
321 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
325 * Route lookup. Any table->tb6_lock is implied.
/*
 * Walk the sibling list of @rt and pick the entry matching the requested
 * outgoing interface (or source address when no oif is given); loopback
 * entries are remembered as a fallback. Returns ip6_null_entry when
 * strict interface matching fails. (Elided excerpt: several branch
 * bodies and the tail of the loop are missing from this view.)
 */
328 static inline struct rt6_info *rt6_device_match(struct net *net,
330 const struct in6_addr *saddr,
334 struct rt6_info *local = NULL;
335 struct rt6_info *sprt;
337 if (!oif && ipv6_addr_any(saddr)) /* nothing to match against */
340 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 struct net_device *dev = sprt->rt6i_dev;
344 if (dev->ifindex == oif)
346 if (dev->flags & IFF_LOOPBACK) {
347 if (sprt->rt6i_idev == NULL ||
348 sprt->rt6i_idev->dev->ifindex != oif) {
349 if (flags & RT6_LOOKUP_F_IFACE && oif)
351 if (local && (!oif ||
352 local->rt6i_idev->dev->ifindex == oif))
358 if (ipv6_chk_addr(net, saddr, dev,
359 flags & RT6_LOOKUP_F_IFACE))
368 if (flags & RT6_LOOKUP_F_IFACE)
369 return net->ipv6.ip6_null_entry; /* strict match failed */
375 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing (RFC 4191-style router preference): send a
 * unicast-solicit NS to a router whose neighbour entry is not NUD_VALID,
 * rate-limited via neigh->updated + rtr_probe_interval. The no-op stub
 * at the bottom is the !CONFIG_IPV6_ROUTER_PREF variant. (Elided
 * excerpt: #else/#endif and closing braces are missing from this view.)
 */
376 static void rt6_probe(struct rt6_info *rt)
378 struct neighbour *neigh;
380 * Okay, this does not seem to be appropriate
381 * for now, however, we need to check if it
382 * is really so; aka Router Reachability Probing.
384 * Router Reachability Probe MUST be rate-limited
385 * to no more than one per minute.
388 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
389 if (!neigh || (neigh->nud_state & NUD_VALID))
391 read_lock_bh(&neigh->lock);
392 if (!(neigh->nud_state & NUD_VALID) &&
393 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 struct in6_addr mcaddr;
395 struct in6_addr *target;
397 neigh->updated = jiffies; /* restart the rate-limit window */
398 read_unlock_bh(&neigh->lock);
400 target = (struct in6_addr *)&neigh->primary_key;
401 addrconf_addr_solict_mult(target, &mcaddr);
402 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
404 read_unlock_bh(&neigh->lock);
410 static inline void rt6_probe(struct rt6_info *rt)
416 * Default Router Selection (RFC 2461 6.3.6)
/*
 * Interface-match score for router selection: nonzero when the route's
 * device (or, for loopback routes, its idev's device) matches @oif, or
 * when no oif is requested. (Elided excerpt; return values not visible.)
 */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * Reachability score for router selection, based on the neighbour cache
 * state of the route's nexthop (RTF_NONEXTHOP / non-gateway routes are
 * handled before the state checks). (Elided excerpt: the scores returned
 * on each branch are not visible in this view.)
 */
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
435 neigh = dst_get_neighbour(&rt->dst);
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
449 read_unlock_bh(&neigh->lock);
/*
 * Combine the interface match (rt6_check_dev), the administratively
 * configured router preference (bits 2+), and neighbour reachability
 * (rt6_check_neigh) into a single comparable score; strict lookups bail
 * out when a required component is zero. (Elided excerpt.)
 */
456 static int rt6_score_route(struct rt6_info *rt, int oif,
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * Score @rt and keep it as the running best match when it beats *mpri;
 * expired routes are skipped. Used by find_rr_leaf. (Elided excerpt:
 * the comparison against *mpri and the return paths are not visible.)
 */
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
478 if (rt6_check_expired(rt))
481 m = rt6_score_route(rt, oif, strict)
486 if (strict & RT6_LOOKUP_F_REACHABLE)
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * Scan all routes of equal @metric in round-robin order: first from
 * @rr_head to the end of the run, then from the leaf head back up to
 * @rr_head, keeping the best-scoring entry. (Elided excerpt.)
 */
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
/*
 * Default router selection for a fib6 node: start the scan at the
 * node's round-robin pointer (fn->rr_ptr, seeded from fn->leaf) and, if
 * nothing reachable matched, advance the rr pointer to the next route of
 * equal metric so subsequent lookups rotate through the candidates.
 * Falls back to ip6_null_entry. (Elided excerpt: the rr-advance body
 * and several braces are missing from this view.)
 */
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
542 RT6_TRACE("%s() => %p\n",
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
549 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received in a Router Advertisement
 * (RFC 4191): validate length/prefix_len, decode preference and
 * lifetime, then add/update/delete the corresponding route. A zero
 * lifetime deletes; prefix_len 0 maps to the default-router entry.
 * (Elided excerpt: error returns inside the sanity checks and several
 * closing braces are missing from this view.)
 */
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
557 unsigned long lifetime;
560 if (len < sizeof(struct route_info)) {
564 /* Sanity check for prefix_len and length */
565 if (rinfo->length > 3) {
567 } else if (rinfo->prefix_len > 128) {
569 } else if (rinfo->prefix_len > 64) {
570 if (rinfo->length < 2) {
573 } else if (rinfo->prefix_len > 0) {
574 if (rinfo->length < 1) {
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
592 prefix = &prefix_buf;
595 if (rinfo->prefix_len == 0)
596 rt = rt6_get_dflt_router(gwaddr, dev);
598 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
599 gwaddr, dev->ifindex);
601 if (rt && !lifetime) { /* zero lifetime => withdraw the route */
607 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
610 rt->rt6i_flags = RTF_ROUTEINFO |
611 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
614 if (!addrconf_finite_timeout(lifetime)) {
615 rt->rt6i_flags &= ~RTF_EXPIRES; /* infinite lifetime */
617 rt->rt6i_expires = jiffies + HZ * lifetime;
618 rt->rt6i_flags |= RTF_EXPIRES;
620 dst_release(&rt->dst);
/*
 * BACKTRACK: when a lookup landed on ip6_null_entry, climb back up the
 * fib6 tree (descending into source-routed subtrees where present)
 * until a node carrying route info is found or the tree root is hit.
 * Expands inside the lookup functions below, using their local `rt`,
 * `fn` and `out` labels (the goto targets are elided from this view).
 */
626 #define BACKTRACK(__net, saddr) \
628 if (rt == __net->ipv6.ip6_null_entry) { \
629 struct fib6_node *pn; \
631 if (fn->fn_flags & RTN_TL_ROOT) \
634 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
635 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
638 if (fn->fn_flags & RTN_RTINFO) \
/*
 * Simple (no cloning) table lookup under tb6_lock: find the fib6 node,
 * filter by device/interface, backtrack on miss, and take a usage
 * reference before unlocking. (Elided excerpt.)
 */
644 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
645 struct fib6_table *table,
646 struct flowi6 *fl6, int flags)
648 struct fib6_node *fn;
651 read_lock_bh(&table->tb6_lock);
652 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
655 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
656 BACKTRACK(net, &fl6->saddr);
658 dst_use(&rt->dst, jiffies); /* hold + touch lastuse under the lock */
659 read_unlock_bh(&table->tb6_lock);
/*
 * Public route lookup: build a flowi6 for (daddr, saddr, oif) and run
 * the policy lookup with ip6_pol_route_lookup. @strict enables
 * interface matching. (Elided excerpt: the flowi6 initializer fields
 * and the error path are not visible in this view.)
 */
664 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
665 const struct in6_addr *saddr, int oif, int strict)
667 struct flowi6 fl6 = {
671 struct dst_entry *dst;
672 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
675 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
676 flags |= RT6_LOOKUP_F_HAS_SADDR;
679 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
681 return (struct rt6_info *) dst;
688 EXPORT_SYMBOL(rt6_lookup);
690 /* ip6_ins_rt is called with FREE table->tb6_lock.
691 It takes new route entry, the addition fails by any reason the
692 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its table under the table write lock. */
696 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
699 struct fib6_table *table;
701 table = rt->rt6i_table;
702 write_lock_bh(&table->tb6_lock);
703 err = fib6_add(&table->tb6_root, rt, info);
704 write_unlock_bh(&table->tb6_lock);
/* Insert @rt with a default nl_info bound to the route's namespace. */
709 int ip6_ins_rt(struct rt6_info *rt)
711 struct nl_info info = {
712 .nl_net = dev_net(rt->rt6i_dev),
714 return __ip6_ins_rt(rt, &info);
/*
 * Clone @ort into a host-specific RTF_CACHE route for (daddr, saddr)
 * and bind a neighbour entry for its gateway. On neighbour-table
 * overflow, force a GC pass with temporarily relaxed gc sysctls and
 * retry once (attempts starts at 1 outside softirq, 0 inside).
 * (Elided excerpt: the retry goto, error paths and closing braces are
 * missing from this view.)
 */
717 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
718 const struct in6_addr *daddr,
719 const struct in6_addr *saddr)
727 rt = ip6_rt_copy(ort, daddr);
730 struct neighbour *neigh;
731 int attempts = !in_softirq(); /* no blocking retry from softirq */
733 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
734 if (ort->rt6i_dst.plen != 128 &&
735 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
736 rt->rt6i_flags |= RTF_ANYCAST;
737 ipv6_addr_copy(&rt->rt6i_gateway, daddr); /* on-link: gw == dest */
740 rt->rt6i_flags |= RTF_CACHE;
742 #ifdef CONFIG_IPV6_SUBTREES
743 if (rt->rt6i_src.plen && saddr) {
744 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
745 rt->rt6i_src.plen = 128;
750 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
752 struct net *net = dev_net(rt->rt6i_dev);
753 int saved_rt_min_interval =
754 net->ipv6.sysctl.ip6_rt_gc_min_interval;
755 int saved_rt_elasticity =
756 net->ipv6.sysctl.ip6_rt_gc_elasticity;
758 if (attempts-- > 0) {
759 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
760 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
762 ip6_dst_gc(&net->ipv6.ip6_dst_ops); /* make room, then retry */
764 net->ipv6.sysctl.ip6_rt_gc_elasticity =
766 net->ipv6.sysctl.ip6_rt_gc_min_interval =
767 saved_rt_min_interval;
773 "ipv6: Neighbour table overflow.\n");
777 dst_set_neighbour(&rt->dst, neigh);
/*
 * Lightweight clone for non-gateway routes: copy @ort for @daddr, mark
 * it RTF_CACHE, and share (clone) the original's neighbour entry —
 * no new neighbour lookup needed. (Elided excerpt.)
 */
784 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
785 const struct in6_addr *daddr)
787 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
790 rt->rt6i_flags |= RTF_CACHE;
791 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
/*
 * Core policy routing lookup shared by the input and output paths:
 * select a route under tb6_lock (reachability is only required when
 * forwarding is off), then — unless it is already a cache entry or the
 * null route — clone it into an RTF_CACHE entry (rt6_alloc_cow when a
 * neighbour must be bound, rt6_alloc_clone for non-host routes) and
 * insert it, relooking up on insert race. (Elided excerpt: the goto
 * labels, the relookup loop and several braces are missing from this
 * view.)
 */
796 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
797 struct flowi6 *fl6, int flags, bool input)
799 struct fib6_node *fn;
800 struct rt6_info *rt, *nrt;
804 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805 int local = RTF_NONEXTHOP;
807 strict |= flags & RT6_LOOKUP_F_IFACE;
812 read_lock_bh(&table->tb6_lock);
815 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
818 rt = rt6_select(fn, oif, strict | reachable);
820 BACKTRACK(net, &fl6->saddr);
821 if (rt == net->ipv6.ip6_null_entry ||
822 rt->rt6i_flags & RTF_CACHE)
826 read_unlock_bh(&table->tb6_lock);
828 if (!dst_get_neighbour_raw(&rt->dst)
829 && !(rt->rt6i_flags & local))
830 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
831 else if (!(rt->dst.flags & DST_HOST))
832 nrt = rt6_alloc_clone(rt, &fl6->daddr);
836 dst_release(&rt->dst);
837 rt = nrt ? : net->ipv6.ip6_null_entry; /* clone failure => null route */
841 err = ip6_ins_rt(nrt);
850 * Race condition! In the gap, when table->tb6_lock was
851 * released someone could insert this route. Relookup.
853 dst_release(&rt->dst);
862 read_unlock_bh(&table->tb6_lock);
864 rt->dst.lastuse = jiffies;
/* Input-path wrapper: route on the incoming interface (flowi6_iif). */
870 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
871 struct flowi6 *fl6, int flags)
873 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
/*
 * Route an incoming packet: build a flowi6 from the IPv6 header and
 * attach the looked-up dst to the skb. Strict interface matching is
 * required for scoped destinations, except on PIM register devices.
 * (Elided excerpt: daddr/saddr flowi6 fields are not visible here.)
 */
876 void ip6_route_input(struct sk_buff *skb)
878 const struct ipv6hdr *iph = ipv6_hdr(skb);
879 struct net *net = dev_net(skb->dev);
880 int flags = RT6_LOOKUP_F_HAS_SADDR;
881 struct flowi6 fl6 = {
882 .flowi6_iif = skb->dev->ifindex,
885 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
886 .flowi6_mark = skb->mark,
887 .flowi6_proto = iph->nexthdr,
890 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
891 flags |= RT6_LOOKUP_F_IFACE;
893 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
/* Output-path wrapper: route on the outgoing interface (flowi6_oif). */
896 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
897 struct flowi6 *fl6, int flags)
899 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
/*
 * Route an outgoing flow: strict interface matching when the socket is
 * bound to a device or the destination is scoped; source-address
 * preferences are folded in from the socket. (Elided excerpt: the
 * flags initialization and the sk NULL-check branch are not visible.)
 */
902 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
907 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
908 flags |= RT6_LOOKUP_F_IFACE;
910 if (!ipv6_addr_any(&fl6->saddr))
911 flags |= RT6_LOOKUP_F_HAS_SADDR;
913 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
915 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
918 EXPORT_SYMBOL(ip6_route_output);
/*
 * Convert @dst_orig into a blackhole dst (used e.g. by xfrm when a
 * policy says "hold packets"): copy the route's addressing, idev and
 * metrics onto a dst whose input/output discard, then release the
 * original. Consumes the caller's reference on @dst_orig; returns
 * ERR_PTR(-ENOMEM) on allocation failure. (Elided excerpt: several
 * braces and the in6_dev_hold guard structure are not fully visible.)
 */
920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
922 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
923 struct dst_entry *new = NULL;
925 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
927 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
932 new->input = dst_discard;
933 new->output = dst_discard;
935 if (dst_metrics_read_only(&ort->dst))
936 new->_metrics = ort->dst._metrics; /* share the read-only block */
938 dst_copy_metrics(new, &ort->dst);
939 rt->rt6i_idev = ort->rt6i_idev;
941 in6_dev_hold(rt->rt6i_idev);
942 rt->rt6i_expires = 0;
944 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
945 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
948 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
949 #ifdef CONFIG_IPV6_SUBTREES
950 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
956 dst_release(dst_orig);
957 return new ? new : ERR_PTR(-ENOMEM);
961 * Destination cache support functions
/*
 * dst_ops.check: a cached route is still valid while its fib6 node's
 * serial number matches the cookie; refresh a stale peer binding on the
 * way. (Elided excerpt: the return statements are not visible here.)
 */
964 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
968 rt = (struct rt6_info *) dst;
970 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
971 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
973 rt6_bind_peer(rt, 0);
974 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops.negative_advice: called when the cached route looks bad;
 * expired cache entries are presumably deleted here (the body doing so
 * is elided from this view — verify upstream).
 */
981 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
983 struct rt6_info *rt = (struct rt6_info *) dst;
986 if (rt->rt6i_flags & RTF_CACHE) {
987 if (rt6_check_expired(rt)) {
/*
 * dst_ops.link_failure: report address-unreachable to the sender, then
 * expire the cache entry (or invalidate the fib6 node's serial number
 * for default routes so lookups re-run). (Elided excerpt.)
 */
999 static void ip6_link_failure(struct sk_buff *skb)
1001 struct rt6_info *rt;
1003 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1005 rt = (struct rt6_info *) skb_dst(skb);
1007 if (rt->rt6i_flags&RTF_CACHE) {
1008 dst_set_expires(&rt->dst, 0); /* expire immediately */
1009 rt->rt6i_flags |= RTF_EXPIRES;
1010 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1011 rt->rt6i_node->fn_sernum = -1; /* force relookup via cookie miss */
/*
 * dst_ops.update_pmtu: lower the route's MTU on host (plen==128) cache
 * entries only; below IPV6_MIN_MTU, keep the minimum and turn on
 * ALLFRAG instead of shrinking further. (Elided excerpt.)
 */
1015 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1017 struct rt6_info *rt6 = (struct rt6_info*)dst;
1019 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1020 rt6->rt6i_flags |= RTF_MODIFIED;
1021 if (mtu < IPV6_MIN_MTU) {
1022 u32 features = dst_metric(dst, RTAX_FEATURES);
1024 features |= RTAX_FEATURE_ALLFRAG;
1025 dst_metric_set(dst, RTAX_FEATURES, features);
1027 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * dst_ops.default_advmss: advertised MSS = path MTU minus IPv6+TCP
 * headers, clamped below by the ip6_rt_min_advmss sysctl and above by
 * the maximal non-jumbo payload. (Elided excerpt: the final clamp
 * assignment and return are not visible here.)
 */
1031 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1033 struct net_device *dev = dst->dev;
1034 unsigned int mtu = dst_mtu(dst);
1035 struct net *net = dev_net(dev);
1037 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1039 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1040 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1043 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1044 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1045 * IPV6_MAXPLEN is also valid and means: "any MSS,
1046 * rely only on pmtu discovery"
1048 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * dst_ops.mtu: the raw RTAX_MTU metric when set; otherwise the idev's
 * configured mtu6, capped at IP6_MAX_MTU. (Elided excerpt.)
 */
1053 static unsigned int ip6_mtu(const struct dst_entry *dst)
1055 struct inet6_dev *idev;
1056 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1064 idev = __in6_dev_get(dst->dev);
1066 mtu = idev->cnf.mtu6;
1070 return min_t(unsigned int, mtu, IP6_MAX_MTU);
/* Singly-linked list of ICMPv6 dsts awaiting GC, guarded by the lock. */
1073 static struct dst_entry *icmp6_dst_gc_list;
1074 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * Allocate a standalone host dst for sending an ICMPv6 packet to @addr.
 * These dsts are not inserted into the fib; they are chained onto
 * icmp6_dst_gc_list and reclaimed by icmp6_dst_gc once their refcount
 * drops. Hoplimit metric 0 means "use the idev default" (resolved in
 * ip6_dst_hoplimit). (Elided excerpt: error unwinding and the final
 * return are not visible in this view.)
 */
1076 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1077 struct neighbour *neigh,
1078 const struct in6_addr *addr)
1080 struct rt6_info *rt;
1081 struct inet6_dev *idev = in6_dev_get(dev);
1082 struct net *net = dev_net(dev);
1084 if (unlikely(idev == NULL))
1087 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1088 if (unlikely(rt == NULL)) {
1096 neigh = ndisc_get_neigh(dev, addr); /* no neighbour supplied: look one up */
1101 rt->dst.flags |= DST_HOST;
1102 rt->dst.output = ip6_output;
1103 dst_set_neighbour(&rt->dst, neigh);
1104 atomic_set(&rt->dst.__refcnt, 1);
1105 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1106 rt->rt6i_dst.plen = 128;
1107 rt->rt6i_idev = idev;
1108 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1110 spin_lock_bh(&icmp6_dst_lock);
1111 rt->dst.next = icmp6_dst_gc_list;
1112 icmp6_dst_gc_list = &rt->dst;
1113 spin_unlock_bh(&icmp6_dst_lock);
1115 fib6_force_start_gc(net);
/*
 * Walk icmp6_dst_gc_list and free entries whose refcount has dropped to
 * zero; others stay chained. (Elided excerpt: the unlink/free body and
 * the "more work" return value are not visible here.)
 */
1121 int icmp6_dst_gc(void)
1123 struct dst_entry *dst, **pprev;
1126 spin_lock_bh(&icmp6_dst_lock);
1127 pprev = &icmp6_dst_gc_list;
1129 while ((dst = *pprev) != NULL) {
1130 if (!atomic_read(&dst->__refcnt)) {
1139 spin_unlock_bh(&icmp6_dst_lock);
/*
 * Walk the ICMPv6 dst list and apply @func to each entry; entries for
 * which @func returns nonzero are presumably unlinked (that body is
 * elided from this view). (Elided excerpt.)
 */
1144 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1147 struct dst_entry *dst, **pprev;
1149 spin_lock_bh(&icmp6_dst_lock);
1150 pprev = &icmp6_dst_gc_list;
1151 while ((dst = *pprev) != NULL) {
1152 struct rt6_info *rt = (struct rt6_info *) dst;
1153 if (func(rt, arg)) {
1160 spin_unlock_bh(&icmp6_dst_lock);
/*
 * dst_ops.gc: run fib6 garbage collection, rate-limited by the
 * ip6_rt_gc_min_interval sysctl unless the cache has exceeded
 * ip6_rt_max_size. ip6_rt_gc_expire adapts: it grows each pass and
 * decays by 1/2^elasticity when GC brought entries under gc_thresh.
 * Returns nonzero while the cache is still over the size limit.
 * (Elided excerpt: the early-return for the rate-limited case is not
 * visible here.)
 */
1163 static int ip6_dst_gc(struct dst_ops *ops)
1165 unsigned long now = jiffies;
1166 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1167 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1168 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1169 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1170 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1171 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1174 entries = dst_entries_get_fast(ops);
1175 if (time_after(rt_last_gc + rt_min_interval, now) &&
1176 entries <= rt_max_size
1179 net->ipv6.ip6_rt_gc_expire++;
1180 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1181 net->ipv6.ip6_rt_last_gc = now;
1182 entries = dst_entries_get_slow(ops);
1183 if (entries < ops->gc_thresh)
1184 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1186 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1187 return entries > rt_max_size;
1190 /* Clean host part of a prefix. Not necessary in radix tree,
1191 but results in cleaner routing tables.
1193 Remove it only when all the things will work!
/*
 * Effective hop limit for @dst: the RTAX_HOPLIMIT metric if nonzero,
 * else the egress idev's configured hop_limit, else the namespace-wide
 * default. (Elided excerpt.)
 */
1196 int ip6_dst_hoplimit(struct dst_entry *dst)
1198 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1199 if (hoplimit == 0) {
1200 struct net_device *dev = dst->dev;
1201 struct inet6_dev *idev;
1204 idev = __in6_dev_get(dev);
1206 hoplimit = idev->cnf.hop_limit;
1208 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1213 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * Add a route described by @cfg (netlink RTM_NEWROUTE / ioctl path):
 * validate prefix lengths, resolve the device/table, allocate and
 * populate an rt6_info (expiry, protocol, input/output handlers,
 * metrics, gateway validation via a recursive rt6_lookup, preferred
 * source, neighbour binding), then insert it with __ip6_ins_rt.
 * True routes via loopback are promoted to reject routes. (Elided
 * excerpt: the error-unwind `goto out` paths, several closing braces,
 * and the intermediate `err` checks are missing from this view.)
 */
1219 int ip6_route_add(struct fib6_config *cfg)
1222 struct net *net = cfg->fc_nlinfo.nl_net;
1223 struct rt6_info *rt = NULL;
1224 struct net_device *dev = NULL;
1225 struct inet6_dev *idev = NULL;
1226 struct fib6_table *table;
1229 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1231 #ifndef CONFIG_IPV6_SUBTREES
1232 if (cfg->fc_src_len) /* source routing not compiled in */
1235 if (cfg->fc_ifindex) {
1237 dev = dev_get_by_index(net, cfg->fc_ifindex);
1240 idev = in6_dev_get(dev);
1245 if (cfg->fc_metric == 0)
1246 cfg->fc_metric = IP6_RT_PRIO_USER;
1248 table = fib6_new_table(net, cfg->fc_table);
1249 if (table == NULL) {
1254 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1261 rt->dst.obsolete = -1;
1262 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1263 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1266 if (cfg->fc_protocol == RTPROT_UNSPEC)
1267 cfg->fc_protocol = RTPROT_BOOT;
1268 rt->rt6i_protocol = cfg->fc_protocol;
1270 addr_type = ipv6_addr_type(&cfg->fc_dst);
1272 if (addr_type & IPV6_ADDR_MULTICAST)
1273 rt->dst.input = ip6_mc_input;
1274 else if (cfg->fc_flags & RTF_LOCAL)
1275 rt->dst.input = ip6_input;
1277 rt->dst.input = ip6_forward;
1279 rt->dst.output = ip6_output;
1281 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1282 rt->rt6i_dst.plen = cfg->fc_dst_len;
1283 if (rt->rt6i_dst.plen == 128)
1284 rt->dst.flags |= DST_HOST;
1286 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1287 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1292 dst_init_metrics(&rt->dst, metrics, 0); /* private writable metrics */
1294 #ifdef CONFIG_IPV6_SUBTREES
1295 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1296 rt->rt6i_src.plen = cfg->fc_src_len;
1299 rt->rt6i_metric = cfg->fc_metric;
1301 /* We cannot add true routes via loopback here,
1302 they would result in kernel looping; promote them to reject routes
1304 if ((cfg->fc_flags & RTF_REJECT) ||
1305 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1306 && !(cfg->fc_flags&RTF_LOCAL))) {
1307 /* hold loopback dev/idev if we haven't done so. */
1308 if (dev != net->loopback_dev) {
1313 dev = net->loopback_dev;
1315 idev = in6_dev_get(dev);
1321 rt->dst.output = ip6_pkt_discard_out;
1322 rt->dst.input = ip6_pkt_discard;
1323 rt->dst.error = -ENETUNREACH;
1324 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1328 if (cfg->fc_flags & RTF_GATEWAY) {
1329 const struct in6_addr *gw_addr;
1332 gw_addr = &cfg->fc_gateway;
1333 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1334 gwa_type = ipv6_addr_type(gw_addr);
1336 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1337 struct rt6_info *grt;
1339 /* IPv6 strictly inhibits using not link-local
1340 addresses as nexthop address.
1341 Otherwise, router will not able to send redirects.
1342 It is very good, but in some (rare!) circumstances
1343 (SIT, PtP, NBMA NOARP links) it is handy to allow
1344 some exceptions. --ANK
1347 if (!(gwa_type&IPV6_ADDR_UNICAST))
1350 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1352 err = -EHOSTUNREACH;
1356 if (dev != grt->rt6i_dev) {
1357 dst_release(&grt->dst);
1361 dev = grt->rt6i_dev; /* inherit device/idev from the gw route */
1362 idev = grt->rt6i_idev;
1364 in6_dev_hold(grt->rt6i_idev);
1366 if (!(grt->rt6i_flags&RTF_GATEWAY))
1368 dst_release(&grt->dst);
1374 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1382 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1383 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1387 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1388 rt->rt6i_prefsrc.plen = 128;
1390 rt->rt6i_prefsrc.plen = 0;
1392 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1393 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1398 dst_set_neighbour(&rt->dst, n);
1401 rt->rt6i_flags = cfg->fc_flags;
1408 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1409 int type = nla_type(nla);
1412 if (type > RTAX_MAX) {
1417 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1423 rt->rt6i_idev = idev;
1424 rt->rt6i_table = table;
1426 cfg->fc_nlinfo.nl_net = dev_net(dev);
1428 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * Delete @rt from its table under the table write lock; the null-entry
 * sentinel is refused. Drops the caller's reference on @rt. (Elided
 * excerpt: the error value set on the null-entry branch is not visible.)
 */
1440 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1443 struct fib6_table *table;
1444 struct net *net = dev_net(rt->rt6i_dev);
1446 if (rt == net->ipv6.ip6_null_entry) {
1451 table = rt->rt6i_table;
1452 write_lock_bh(&table->tb6_lock);
1453 err = fib6_del(rt, info);
1454 write_unlock_bh(&table->tb6_lock);
1457 dst_release(&rt->dst);
/* Delete @rt with a default nl_info bound to the route's namespace. */
1461 int ip6_del_rt(struct rt6_info *rt)
1463 struct nl_info info = {
1464 .nl_net = dev_net(rt->rt6i_dev),
1466 return __ip6_del_rt(rt, &info);
/*
 * Delete the route matching @cfg: locate the fib6 node for the
 * destination/source prefixes, then scan its leaf list for an entry
 * matching the requested ifindex, gateway and metric (each filter only
 * applies when the config specifies it); hold the match, drop the read
 * lock, and delete via __ip6_del_rt. (Elided excerpt: the dst_hold on
 * the match and the -ESRCH fallthrough are not visible here.)
 */
1469 static int ip6_route_del(struct fib6_config *cfg)
1471 struct fib6_table *table;
1472 struct fib6_node *fn;
1473 struct rt6_info *rt;
1476 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1480 read_lock_bh(&table->tb6_lock);
1482 fn = fib6_locate(&table->tb6_root,
1483 &cfg->fc_dst, cfg->fc_dst_len,
1484 &cfg->fc_src, cfg->fc_src_len);
1487 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1488 if (cfg->fc_ifindex &&
1489 (rt->rt6i_dev == NULL ||
1490 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1492 if (cfg->fc_flags & RTF_GATEWAY &&
1493 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1495 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1498 read_unlock_bh(&table->tb6_lock);
1500 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1503 read_unlock_bh(&table->tb6_lock);
/* flowi6 extended with the redirecting router's address (embedded flowi6
 * field is elided from this view). */
1511 struct ip6rd_flowi {
1513 struct in6_addr gateway;
/*
 * Find the route a redirect applies to: per RFC 2461 the redirect must
 * come from the current nexthop, so scan the leaf for a non-expired
 * gateway route on the receiving interface whose gateway equals the
 * sender. Falls back to ip6_null_entry + BACKTRACK on miss. (Elided
 * excerpt: the dst_hold on the result is not visible here.)
 */
1516 static struct rt6_info *__ip6_route_redirect(struct net *net,
1517 struct fib6_table *table,
1521 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1522 struct rt6_info *rt;
1523 struct fib6_node *fn;
1526 * Get the "current" route for this destination and
1527 * check if the redirect has come from approriate router.
1529 * RFC 2461 specifies that redirects should only be
1530 * accepted if they come from the nexthop to the target.
1531 * Due to the way the routes are chosen, this notion
1532 * is a bit fuzzy and one might need to check all possible
1536 read_lock_bh(&table->tb6_lock);
1537 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1539 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1541 * Current route is on-link; redirect is always invalid.
1543 * Seems, previous statement is not true. It could
1544 * be node, which looks for us as on-link (f.e. proxy ndisc)
1545 * But then router serving it might decide, that we should
1546 * know truth 8)8) --ANK (980726).
1548 if (rt6_check_expired(rt))
1550 if (!(rt->rt6i_flags & RTF_GATEWAY))
1552 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1554 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1560 rt = net->ipv6.ip6_null_entry;
1561 BACKTRACK(net, &fl6->saddr);
1565 read_unlock_bh(&table->tb6_lock);
/*
 * Wrapper for __ip6_route_redirect: build an ip6rd_flowi carrying the
 * redirecting router's address and run the policy lookup. (Elided
 * excerpt: the daddr/saddr initializer fields are not visible here.)
 */
1570 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1571 const struct in6_addr *src,
1572 const struct in6_addr *gateway,
1573 struct net_device *dev)
1575 int flags = RT6_LOOKUP_F_HAS_SADDR;
1576 struct net *net = dev_net(dev);
1577 struct ip6rd_flowi rdfl = {
1579 .flowi6_oif = dev->ifindex,
1585 ipv6_addr_copy(&rdfl.gateway, gateway);
1587 if (rt6_need_strict(dest))
1588 flags |= RT6_LOOKUP_F_IFACE;
1590 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1591 flags, __ip6_route_redirect);
/*
 * Process an accepted ICMPv6 Redirect: update the neighbour cache for
 * the new next hop and install a cloned RTF_CACHE route pointing at it.
 * @on_link: non-zero if the redirect says the target is directly
 * reachable (no router bit in the neighbour update then).
 */
1594 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1595 const struct in6_addr *saddr,
1596 struct neighbour *neigh, u8 *lladdr, int on_link)
1598 struct rt6_info *rt, *nrt = NULL;
1599 struct netevent_redirect netevent;
1600 struct net *net = dev_net(neigh->dev);
1602 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
/* Null entry means __ip6_route_redirect() rejected the sender as a
 * plausible next hop for this destination: ignore the redirect. */
1604 if (rt == net->ipv6.ip6_null_entry) {
1605 if (net_ratelimit())
1606 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1607 "for redirect target\n");
1612 * We have finally decided to accept it.
1615 neigh_update(neigh, lladdr, NUD_STALE,
1616 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1617 NEIGH_UPDATE_F_OVERRIDE|
1618 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1619 NEIGH_UPDATE_F_ISROUTER))
1623 * Redirect received -> path was valid.
1624 * Look, redirects are sent only in response to data packets,
1625 * so that this nexthop apparently is reachable. --ANK
1627 dst_confirm(&rt->dst);
1629 /* Duplicate redirect: silently ignore. */
1630 if (neigh == dst_get_neighbour_raw(&rt->dst))
/* Clone the matched route into a host cache entry for @dest. */
1633 nrt = ip6_rt_copy(rt, dest);
1637 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1639 nrt->rt6i_flags &= ~RTF_GATEWAY;
1641 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1642 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1644 if (ip6_ins_rt(nrt))
/* Tell interested subsystems (e.g. netfilter, offload) that the
 * path for this destination changed. */
1647 netevent.old = &rt->dst;
1648 netevent.new = &nrt->dst;
1649 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1651 if (rt->rt6i_flags&RTF_CACHE) {
1657 dst_release(&rt->dst);
1661 * Handle ICMP "packet too big" messages
1662 * i.e. Path MTU discovery
/*
 * Apply a Packet Too Big report for (daddr, saddr) on @ifindex
 * (0 = any interface).  Either updates the MTU of an existing cache
 * route in place, or clones/COWs a new host route carrying the
 * reduced PMTU with an expiry timer.
 */
1665 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1666 struct net *net, u32 pmtu, int ifindex)
1668 struct rt6_info *rt, *nrt;
1671 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1675 if (rt6_check_expired(rt)) {
/* Reported MTU is not smaller than what we already use: nothing to do. */
1680 if (pmtu >= dst_mtu(&rt->dst))
1683 if (pmtu < IPV6_MIN_MTU) {
1685 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1686 * MTU (1280) and a fragment header should always be included
1687 * after a node receiving Too Big message reporting PMTU is
1688 * less than the IPv6 Minimum Link MTU.
1690 pmtu = IPV6_MIN_MTU;
1694 /* New mtu received -> path was valid.
1695 They are sent only in response to data packets,
1696 so that this nexthop apparently is reachable. --ANK
1698 dst_confirm(&rt->dst);
1700 /* Host route. If it is static, it would be better
1701 not to override it, but add new one, so that
1702 when cache entry will expire old pmtu
1703 would return automatically.
1705 if (rt->rt6i_flags & RTF_CACHE) {
1706 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
/* Below minimum MTU we must add fragment headers to everything. */
1708 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1709 features |= RTAX_FEATURE_ALLFRAG;
1710 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1712 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1713 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1718 Two cases are possible:
1719 1. It is connected route. Action: COW
1720 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1722 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1723 nrt = rt6_alloc_cow(rt, daddr, saddr);
1725 nrt = rt6_alloc_clone(rt, daddr);
1728 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1730 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1731 features |= RTAX_FEATURE_ALLFRAG;
1732 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1735 /* According to RFC 1981, detecting PMTU increase shouldn't be
1736 * happened within 5 mins, the recommended timer is 10 mins.
1737 * Here this route expiration time is set to ip6_rt_mtu_expires
1738 * which is 10 mins. After 10 mins the decreased pmtu is expired
1739 * and detecting PMTU increase will be automatically happened.
1741 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1742 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1747 dst_release(&rt->dst);
/*
 * Entry point for Packet Too Big handling: update the PMTU both for
 * the "any interface" route and for the route out of the receiving
 * device, since we cannot tell which interface sent the original
 * packet.
 */
1750 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1751 struct net_device *dev, u32 pmtu)
1753 struct net *net = dev_net(dev);
1756 * RFC 1981 states that a node "MUST reduce the size of the packets it
1757 * is sending along the path" that caused the Packet Too Big message.
1758 * Since it's not possible in the general case to determine which
1759 * interface was used to send the original packet, we update the MTU
1760 * on the interface that will be used to send future packets. We also
1761 * update the MTU on the interface that received the Packet Too Big in
1762 * case the original packet was forced out that interface with
1763 * SO_BINDTODEVICE or similar. This is the next best thing to the
1764 * correct behaviour, which would be to update the MTU on all
1767 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1768 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1772 * Misc support functions
/*
 * Allocate a new rt6_info that duplicates @ort but is keyed as a /128
 * host route to @dest.  Used when cloning a route for redirects and
 * similar cache entries.  Copies metrics, device/idev references,
 * gateway and flags (clearing RTF_EXPIRES); metric is reset to 0.
 * Returns NULL on allocation failure (allocation check not visible in
 * this fragment).
 */
1775 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1776 const struct in6_addr *dest)
1778 struct net *net = dev_net(ort->rt6i_dev);
1779 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1783 rt->dst.input = ort->dst.input;
1784 rt->dst.output = ort->dst.output;
1785 rt->dst.flags |= DST_HOST;
1787 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1788 rt->rt6i_dst.plen = 128;
1789 dst_copy_metrics(&rt->dst, &ort->dst);
1790 rt->dst.error = ort->dst.error;
1791 rt->rt6i_idev = ort->rt6i_idev;
1793 in6_dev_hold(rt->rt6i_idev);
1794 rt->dst.lastuse = jiffies;
1795 rt->rt6i_expires = 0;
1797 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1798 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1799 rt->rt6i_metric = 0;
1801 #ifdef CONFIG_IPV6_SUBTREES
1802 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1804 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1805 rt->rt6i_table = ort->rt6i_table;
1810 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Find an existing RA Route Information entry in RT6_TABLE_INFO that
 * matches prefix/prefixlen, gateway and interface.  Returns the route
 * (reference handling not visible in this fragment) or NULL.
 */
1811 static struct rt6_info *rt6_get_route_info(struct net *net,
1812 const struct in6_addr *prefix, int prefixlen,
1813 const struct in6_addr *gwaddr, int ifindex)
1815 struct fib6_node *fn;
1816 struct rt6_info *rt = NULL;
1817 struct fib6_table *table;
1819 table = fib6_get_table(net, RT6_TABLE_INFO);
1823 write_lock_bh(&table->tb6_lock);
1824 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1828 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1829 if (rt->rt6i_dev->ifindex != ifindex)
/* Must be a gatewayed route installed from RA route info. */
1831 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1833 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1839 write_unlock_bh(&table->tb6_lock);
/*
 * Install a route learned from an RA Route Information option and
 * return the resulting table entry (re-looked-up after insertion).
 */
1843 static struct rt6_info *rt6_add_route_info(struct net *net,
1844 const struct in6_addr *prefix, int prefixlen,
1845 const struct in6_addr *gwaddr, int ifindex,
1848 struct fib6_config cfg = {
1849 .fc_table = RT6_TABLE_INFO,
1850 .fc_metric = IP6_RT_PRIO_USER,
1851 .fc_ifindex = ifindex,
1852 .fc_dst_len = prefixlen,
1853 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1854 RTF_UP | RTF_PREF(pref),
1856 .fc_nlinfo.nlh = NULL,
1857 .fc_nlinfo.nl_net = net,
1860 ipv6_addr_copy(&cfg.fc_dst, prefix);
1861 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1863 /* We should treat it as a default route if prefix length is 0. */
1865 cfg.fc_flags |= RTF_DEFAULT;
1867 ip6_route_add(&cfg);
1869 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-installed default route via gateway @addr on @dev in
 * RT6_TABLE_DFLT, or NULL if none exists.
 */
1873 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1875 struct rt6_info *rt;
1876 struct fib6_table *table;
1878 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1882 write_lock_bh(&table->tb6_lock);
/* Default routes all hang off the root node's leaf chain. */
1883 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1884 if (dev == rt->rt6i_dev &&
1885 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1886 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1891 write_unlock_bh(&table->tb6_lock);
/*
 * Install a default route to @gwaddr learned from a Router
 * Advertisement and return the resulting table entry.
 */
1895 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1896 struct net_device *dev,
1899 struct fib6_config cfg = {
1900 .fc_table = RT6_TABLE_DFLT,
1901 .fc_metric = IP6_RT_PRIO_USER,
1902 .fc_ifindex = dev->ifindex,
1903 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1904 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1906 .fc_nlinfo.nlh = NULL,
1907 .fc_nlinfo.nl_net = dev_net(dev),
1910 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1912 ip6_route_add(&cfg);
1914 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Remove all RA-installed default routes, except on interfaces whose
 * accept_ra sysctl is 2 (accept RAs even when forwarding).  Drops the
 * table lock per deletion, so the scan restarts after each removal
 * (restart not visible in this fragment).
 */
1917 void rt6_purge_dflt_routers(struct net *net)
1919 struct rt6_info *rt;
1920 struct fib6_table *table;
1922 /* NOTE: Keep consistent with rt6_get_dflt_router */
1923 table = fib6_get_table(net, RT6_TABLE_DFLT);
1928 read_lock_bh(&table->tb6_lock);
1929 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1930 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1931 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1933 read_unlock_bh(&table->tb6_lock);
1938 read_unlock_bh(&table->tb6_lock);
/*
 * Translate a legacy ioctl struct in6_rtmsg into the internal
 * fib6_config representation used by ip6_route_add/del.  Routes from
 * the ioctl interface always go to the main table.
 */
1941 static void rtmsg_to_fib6_config(struct net *net,
1942 struct in6_rtmsg *rtmsg,
1943 struct fib6_config *cfg)
1945 memset(cfg, 0, sizeof(*cfg));
1947 cfg->fc_table = RT6_TABLE_MAIN;
1948 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1949 cfg->fc_metric = rtmsg->rtmsg_metric;
1950 cfg->fc_expires = rtmsg->rtmsg_info;
1951 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1952 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1953 cfg->fc_flags = rtmsg->rtmsg_flags;
1955 cfg->fc_nlinfo.nl_net = net;
1957 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1958 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1959 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * ioctl interface for adding/deleting IPv6 routes (SIOCADDRT /
 * SIOCDELRT).  Requires CAP_NET_ADMIN; copies the user's in6_rtmsg,
 * converts it and dispatches to ip6_route_add() or ip6_route_del().
 */
1962 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1964 struct fib6_config cfg;
1965 struct in6_rtmsg rtmsg;
1969 case SIOCADDRT: /* Add a route */
1970 case SIOCDELRT: /* Delete a route */
1971 if (!capable(CAP_NET_ADMIN))
1973 err = copy_from_user(&rtmsg, arg,
1974 sizeof(struct in6_rtmsg));
1978 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1983 err = ip6_route_add(&cfg);
1986 err = ip6_route_del(&cfg);
2000 * Drop the packet on the floor
/*
 * Common drop path for null/prohibit routes: bump the right MIB
 * counter (distinguishing unroutable-source from no-route) and send
 * an ICMPv6 Destination Unreachable with @code back to the sender.
 */
2003 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2006 struct dst_entry *dst = skb_dst(skb);
2007 switch (ipstats_mib_noroutes) {
2008 case IPSTATS_MIB_INNOROUTES:
2009 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
/* Unspecified destination counts as an address error, not no-route. */
2010 if (type == IPV6_ADDR_ANY) {
2011 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2012 IPSTATS_MIB_INADDRERRORS);
2016 case IPSTATS_MIB_OUTNOROUTES:
2017 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2018 ipstats_mib_noroutes);
2021 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler of the null entry: drop with "no route". */
2026 static int ip6_pkt_discard(struct sk_buff *skb)
2028 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output handler of the null entry. */
2031 static int ip6_pkt_discard_out(struct sk_buff *skb)
2033 skb->dev = skb_dst(skb)->dev;
2034 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2037 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* dst.input handler of the prohibit entry: drop with "administratively
 * prohibited". */
2039 static int ip6_pkt_prohibit(struct sk_buff *skb)
2041 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output handler of the prohibit entry. */
2044 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2046 skb->dev = skb_dst(skb)->dev;
2047 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2053 * Allocate a dst for local (unicast / anycast) address.
/*
 * Build the host route installed for every local address: a /128
 * entry in RT6_TABLE_LOCAL routed through the loopback device, with
 * RTF_LOCAL or RTF_ANYCAST set as appropriate.  Returns the new route
 * with one reference held, or an ERR_PTR on failure.
 */
2056 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2057 const struct in6_addr *addr,
2060 struct net *net = dev_net(idev->dev);
2061 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2062 net->loopback_dev, DST_NOCOUNT);
2063 struct neighbour *neigh;
2066 return ERR_PTR(-ENOMEM);
2070 rt->dst.flags |= DST_HOST;
2071 rt->dst.input = ip6_input;
2072 rt->dst.output = ip6_output;
2073 rt->rt6i_idev = idev;
2074 rt->dst.obsolete = -1;
2076 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2078 rt->rt6i_flags |= RTF_ANYCAST;
2080 rt->rt6i_flags |= RTF_LOCAL;
2081 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2082 if (IS_ERR(neigh)) {
2085 return ERR_CAST(neigh);
2087 dst_set_neighbour(&rt->dst, neigh);
2089 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2090 rt->rt6i_dst.plen = 128;
2091 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2093 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Choose a source address for traffic using route @rt towards @daddr.
 * A preferred source configured on the route wins; otherwise fall
 * back to normal source address selection on the route's device.
 */
2098 int ip6_route_get_saddr(struct net *net,
2099 struct rt6_info *rt,
2100 const struct in6_addr *daddr,
2102 struct in6_addr *saddr)
2104 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2106 if (rt->rt6i_prefsrc.plen)
2107 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2109 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2110 daddr, prefs, saddr);
2114 /* remove deleted ip from prefsrc entries */
/* Argument bundle for the fib6_clean_all() walk below. */
2115 struct arg_dev_net_ip {
2116 struct net_device *dev;
2118 struct in6_addr *addr;
/*
 * Per-route callback: clear the preferred-source setting of any route
 * (on the given device, or any device if dev == NULL) whose prefsrc
 * equals the address being removed.
 */
2121 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2123 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2124 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2125 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2127 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2128 rt != net->ipv6.ip6_null_entry &&
2129 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2130 /* remove prefsrc entry */
2131 rt->rt6i_prefsrc.plen = 0;
/*
 * Called when the address @ifp is deleted: scrub it from every route
 * that named it as preferred source.
 */
2136 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2138 struct net *net = dev_net(ifp->idev->dev);
2139 struct arg_dev_net_ip adni = {
2140 .dev = ifp->idev->dev,
2144 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Argument bundle for the ifdown cleanup walk. */
2147 struct arg_dev_net {
2148 struct net_device *dev;
/*
 * Per-route callback: select for deletion every route on @dev (or on
 * any device when dev == NULL) except the permanent null entry.
 */
2152 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2154 const struct arg_dev_net *adn = arg;
2155 const struct net_device *dev = adn->dev;
2157 if ((rt->rt6i_dev == dev || dev == NULL) &&
2158 rt != adn->net->ipv6.ip6_null_entry) {
2159 RT6_TRACE("deleted by ifdown %p\n", rt);
/*
 * Flush routes (FIB and ICMP-cloned) referencing @dev when it goes
 * down or is unregistered.
 */
2165 void rt6_ifdown(struct net *net, struct net_device *dev)
2167 struct arg_dev_net adn = {
2172 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2173 icmp6_clean_all(fib6_ifdown, &adn);
/* Argument bundle for the MTU-change walk below. */
2176 struct rt6_mtu_change_arg
2178 struct net_device *dev;
/*
 * Per-route callback invoked when a device's MTU is administratively
 * changed: propagate the new MTU into route metrics where it is safe
 * to do so (see the RFC 1981 discussion below).
 */
2182 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2184 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2185 struct inet6_dev *idev;
2187 /* In IPv6 pmtu discovery is not optional,
2188 so that RTAX_MTU lock cannot disable it.
2189 We still use this lock to block changes
2190 caused by addrconf/ndisc.
2193 idev = __in6_dev_get(arg->dev);
2197 /* For administrative MTU increase, there is no way to discover
2198 IPv6 PMTU increase, so PMTU increase should be updated here.
2199 Since RFC 1981 doesn't include administrative MTU increase
2200 update PMTU increase is a MUST. (i.e. jumbo frame)
2203 If new MTU is less than route PMTU, this new MTU will be the
2204 lowest MTU in the path, update the route PMTU to reflect PMTU
2205 decreases; if new MTU is greater than route PMTU, and the
2206 old MTU is the lowest MTU in the path, update the route PMTU
2207 to reflect the increase. In this case if the other nodes' MTU
2208 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2211 if (rt->rt6i_dev == arg->dev &&
2212 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2213 (dst_mtu(&rt->dst) >= arg->mtu ||
2214 (dst_mtu(&rt->dst) < arg->mtu &&
2215 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2216 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * Walk all routes and update their MTU metric after @dev's MTU
 * changed to @mtu.
 */
2221 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2223 struct rt6_mtu_change_arg arg = {
2228 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE. */
2231 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2232 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2233 [RTA_OIF] = { .type = NLA_U32 },
2234 [RTA_IIF] = { .type = NLA_U32 },
2235 [RTA_PRIORITY] = { .type = NLA_U32 },
2236 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * Parse an rtnetlink route message (header + attributes) into a
 * fib6_config.  Validates attribute lengths against the prefix
 * lengths claimed in the rtmsg header; RTA_TABLE, if present,
 * overrides rtm_table.
 */
2239 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2240 struct fib6_config *cfg)
2243 struct nlattr *tb[RTA_MAX+1];
2246 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2251 rtm = nlmsg_data(nlh);
2252 memset(cfg, 0, sizeof(*cfg));
2254 cfg->fc_table = rtm->rtm_table;
2255 cfg->fc_dst_len = rtm->rtm_dst_len;
2256 cfg->fc_src_len = rtm->rtm_src_len;
2257 cfg->fc_flags = RTF_UP;
2258 cfg->fc_protocol = rtm->rtm_protocol;
2260 if (rtm->rtm_type == RTN_UNREACHABLE)
2261 cfg->fc_flags |= RTF_REJECT;
2263 if (rtm->rtm_type == RTN_LOCAL)
2264 cfg->fc_flags |= RTF_LOCAL;
2266 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2267 cfg->fc_nlinfo.nlh = nlh;
2268 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2270 if (tb[RTA_GATEWAY]) {
2271 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2272 cfg->fc_flags |= RTF_GATEWAY;
/* Destination attribute must cover at least rtm_dst_len bits. */
2276 int plen = (rtm->rtm_dst_len + 7) >> 3;
2278 if (nla_len(tb[RTA_DST]) < plen)
2281 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2285 int plen = (rtm->rtm_src_len + 7) >> 3;
2287 if (nla_len(tb[RTA_SRC]) < plen)
2290 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2293 if (tb[RTA_PREFSRC])
2294 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2297 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2299 if (tb[RTA_PRIORITY])
2300 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2302 if (tb[RTA_METRICS]) {
2303 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2304 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2308 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* rtnetlink RTM_DELROUTE handler: parse the message and delete. */
2315 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2317 struct fib6_config cfg;
2320 err = rtm_to_fib6_config(skb, nlh, &cfg);
2324 return ip6_route_del(&cfg);
/* rtnetlink RTM_NEWROUTE handler: parse the message and add. */
2327 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2329 struct fib6_config cfg;
2332 err = rtm_to_fib6_config(skb, nlh, &cfg);
2336 return ip6_route_add(&cfg);
/*
 * Worst-case netlink message size for one route, used to size the
 * skb in inet6_rt_notify().  Must be kept in sync with the
 * attributes emitted by rt6_fill_node().
 */
2339 static inline size_t rt6_nlmsg_size(void)
2341 return NLMSG_ALIGN(sizeof(struct rtmsg))
2342 + nla_total_size(16) /* RTA_SRC */
2343 + nla_total_size(16) /* RTA_DST */
2344 + nla_total_size(16) /* RTA_GATEWAY */
2345 + nla_total_size(16) /* RTA_PREFSRC */
2346 + nla_total_size(4) /* RTA_TABLE */
2347 + nla_total_size(4) /* RTA_IIF */
2348 + nla_total_size(4) /* RTA_OIF */
2349 + nla_total_size(4) /* RTA_PRIORITY */
2350 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2351 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Serialize one rt6_info into a netlink RTM_* message on @skb.
 * @dst/@src: if non-NULL, report a /128 cloned lookup result for
 *            these addresses instead of the route's own prefix.
 * @iif:      input interface to report (getroute replies).
 * @prefix:   when set, emit only RTF_PREFIX_RT routes (dump filter).
 * @nowait:   passed through to ip6mr_get_route() for mroute lookups.
 * Returns the message length, or a negative errno / nla_put failure.
 */
2354 static int rt6_fill_node(struct net *net,
2355 struct sk_buff *skb, struct rt6_info *rt,
2356 struct in6_addr *dst, struct in6_addr *src,
2357 int iif, int type, u32 pid, u32 seq,
2358 int prefix, int nowait, unsigned int flags)
2361 struct nlmsghdr *nlh;
2364 struct neighbour *n;
2366 if (prefix) { /* user wants prefix routes only */
2367 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2368 /* success since this is not a prefix route */
2373 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2377 rtm = nlmsg_data(nlh);
2378 rtm->rtm_family = AF_INET6;
2379 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2380 rtm->rtm_src_len = rt->rt6i_src.plen;
2383 table = rt->rt6i_table->tb6_id;
2385 table = RT6_TABLE_UNSPEC;
2386 rtm->rtm_table = table;
2387 NLA_PUT_U32(skb, RTA_TABLE, table);
/* Map route flags to the rtnetlink route type. */
2388 if (rt->rt6i_flags&RTF_REJECT)
2389 rtm->rtm_type = RTN_UNREACHABLE;
2390 else if (rt->rt6i_flags&RTF_LOCAL)
2391 rtm->rtm_type = RTN_LOCAL;
2392 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2393 rtm->rtm_type = RTN_LOCAL;
2395 rtm->rtm_type = RTN_UNICAST;
2397 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2398 rtm->rtm_protocol = rt->rt6i_protocol;
2399 if (rt->rt6i_flags&RTF_DYNAMIC)
2400 rtm->rtm_protocol = RTPROT_REDIRECT;
2401 else if (rt->rt6i_flags & RTF_ADDRCONF)
2402 rtm->rtm_protocol = RTPROT_KERNEL;
2403 else if (rt->rt6i_flags&RTF_DEFAULT)
2404 rtm->rtm_protocol = RTPROT_RA;
2406 if (rt->rt6i_flags&RTF_CACHE)
2407 rtm->rtm_flags |= RTM_F_CLONED;
2410 NLA_PUT(skb, RTA_DST, 16, dst);
2411 rtm->rtm_dst_len = 128;
2412 } else if (rtm->rtm_dst_len)
2413 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2414 #ifdef CONFIG_IPV6_SUBTREES
2416 NLA_PUT(skb, RTA_SRC, 16, src);
2417 rtm->rtm_src_len = 128;
2418 } else if (rtm->rtm_src_len)
2419 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2422 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through the mroute table. */
2423 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2424 int err = ip6mr_get_route(net, skb, rtm, nowait);
2429 goto nla_put_failure;
2431 if (err == -EMSGSIZE)
2432 goto nla_put_failure;
2437 NLA_PUT_U32(skb, RTA_IIF, iif);
2439 struct in6_addr saddr_buf;
2440 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2441 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2444 if (rt->rt6i_prefsrc.plen) {
2445 struct in6_addr saddr_buf;
2446 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2447 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2450 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2451 goto nla_put_failure;
2454 n = dst_get_neighbour(&rt->dst);
2456 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2458 goto nla_put_failure;
2464 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2466 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* Clamp the remaining lifetime into the cacheinfo's int field. */
2468 if (!(rt->rt6i_flags & RTF_EXPIRES))
2470 else if (rt->rt6i_expires - jiffies < INT_MAX)
2471 expires = rt->rt6i_expires - jiffies;
2475 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2476 expires, rt->dst.error) < 0)
2477 goto nla_put_failure;
2479 return nlmsg_end(skb, nlh);
2482 nlmsg_cancel(skb, nlh);
/*
 * Per-route callback for RTM_GETROUTE dumps: honour the RTM_F_PREFIX
 * filter from the request and emit the route via rt6_fill_node().
 */
2486 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2488 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2491 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2492 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2493 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2497 return rt6_fill_node(arg->net,
2498 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2499 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2500 prefix, 0, NLM_F_MULTI);
/*
 * rtnetlink RTM_GETROUTE handler: perform a route lookup for the
 * requested src/dst/iif/oif and unicast the serialized result back
 * to the requester.
 */
2503 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2505 struct net *net = sock_net(in_skb->sk);
2506 struct nlattr *tb[RTA_MAX+1];
2507 struct rt6_info *rt;
2508 struct sk_buff *skb;
2513 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2518 memset(&fl6, 0, sizeof(fl6));
2521 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2524 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2528 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2531 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2535 iif = nla_get_u32(tb[RTA_IIF]);
2538 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
/* An input interface was given: validate it exists. */
2541 struct net_device *dev;
2542 dev = __dev_get_by_index(net, iif);
2549 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2555 /* Reserve room for dummy headers, this skb can pass
2556 through good chunk of routing engine.
2558 skb_reset_mac_header(skb);
2559 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2561 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2562 skb_dst_set(skb, &rt->dst);
2564 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2565 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2566 nlh->nlmsg_seq, 0, 0, 0);
2572 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Broadcast a route change (@event is RTM_NEWROUTE/RTM_DELROUTE) to
 * RTNLGRP_IPV6_ROUTE listeners.  On failure, records the error on the
 * rtnl socket so listeners learn they missed an event.
 */
2577 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2579 struct sk_buff *skb;
2580 struct net *net = info->nl_net;
2585 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2587 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2591 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2592 event, info->pid, seq, 0, 0, 0);
2594 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2595 WARN_ON(err == -EMSGSIZE);
2599 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2600 info->nlh, gfp_any());
2604 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Netdevice notifier: when a namespace's loopback device registers,
 * attach it (and its inet6_dev) to the namespace's special route
 * entries (null, and with multiple tables also prohibit/blackhole).
 */
2607 static int ip6_route_dev_notify(struct notifier_block *this,
2608 unsigned long event, void *data)
2610 struct net_device *dev = (struct net_device *)data;
2611 struct net *net = dev_net(dev);
2613 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2614 net->ipv6.ip6_null_entry->dst.dev = dev;
2615 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2616 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2617 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2618 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2619 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2620 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2631 #ifdef CONFIG_PROC_FS
/*
 * Per-route callback for /proc/net/ipv6_route: print one line in the
 * historical fixed-column format (dst, src, next hop, metric, refcnt,
 * use count, flags, device).
 */
2642 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2644 struct seq_file *m = p_arg;
2645 struct neighbour *n;
2647 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2649 #ifdef CONFIG_IPV6_SUBTREES
2650 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2652 seq_puts(m, "00000000000000000000000000000000 00 ");
2655 n = dst_get_neighbour(&rt->dst);
2657 seq_printf(m, "%pi6", n->primary_key);
2659 seq_puts(m, "00000000000000000000000000000000");
2662 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2663 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2664 rt->dst.__use, rt->rt6i_flags,
2665 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/* seq_file show: walk every route in the namespace's FIB. */
2669 static int ipv6_route_show(struct seq_file *m, void *v)
2671 struct net *net = (struct net *)m->private;
2672 fib6_clean_all(net, rt6_info_route, 0, m);
2676 static int ipv6_route_open(struct inode *inode, struct file *file)
2678 return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route. */
2681 static const struct file_operations ipv6_route_proc_fops = {
2682 .owner = THIS_MODULE,
2683 .open = ipv6_route_open,
2685 .llseek = seq_lseek,
2686 .release = single_release_net,
/* seq_file show for /proc/net/rt6_stats: FIB statistics counters. */
2689 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2691 struct net *net = (struct net *)seq->private;
2692 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2693 net->ipv6.rt6_stats->fib_nodes,
2694 net->ipv6.rt6_stats->fib_route_nodes,
2695 net->ipv6.rt6_stats->fib_rt_alloc,
2696 net->ipv6.rt6_stats->fib_rt_entries,
2697 net->ipv6.rt6_stats->fib_rt_cache,
2698 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2699 net->ipv6.rt6_stats->fib_discarded_routes);
2704 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2706 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats. */
2709 static const struct file_operations rt6_stats_seq_fops = {
2710 .owner = THIS_MODULE,
2711 .open = rt6_stats_seq_open,
2713 .llseek = seq_lseek,
2714 .release = single_release_net,
2716 #endif /* CONFIG_PROC_FS */
2718 #ifdef CONFIG_SYSCTL
/*
 * Handler for net.ipv6.route.flush: any write triggers a garbage
 * collection of the routing cache, using the written value as delay
 * (<= 0 means flush everything now).
 */
2721 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2722 void __user *buffer, size_t *lenp, loff_t *ppos)
2729 net = (struct net *)ctl->extra1;
2730 delay = net->ipv6.sysctl.flush_delay;
2731 proc_dointvec(ctl, write, buffer, lenp, ppos);
2732 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-namespace net.ipv6.route.* sysctl table; the
 * .data pointers are rewritten per-namespace in
 * ipv6_route_sysctl_init() below (keep the entry order in sync).
 */
2736 ctl_table ipv6_route_table_template[] = {
2738 .procname = "flush",
2739 .data = &init_net.ipv6.sysctl.flush_delay,
2740 .maxlen = sizeof(int),
2742 .proc_handler = ipv6_sysctl_rtcache_flush
2745 .procname = "gc_thresh",
2746 .data = &ip6_dst_ops_template.gc_thresh,
2747 .maxlen = sizeof(int),
2749 .proc_handler = proc_dointvec,
2752 .procname = "max_size",
2753 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2754 .maxlen = sizeof(int),
2756 .proc_handler = proc_dointvec,
2759 .procname = "gc_min_interval",
2760 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2761 .maxlen = sizeof(int),
2763 .proc_handler = proc_dointvec_jiffies,
2766 .procname = "gc_timeout",
2767 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2768 .maxlen = sizeof(int),
2770 .proc_handler = proc_dointvec_jiffies,
2773 .procname = "gc_interval",
2774 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2775 .maxlen = sizeof(int),
2777 .proc_handler = proc_dointvec_jiffies,
2780 .procname = "gc_elasticity",
2781 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2782 .maxlen = sizeof(int),
2784 .proc_handler = proc_dointvec,
2787 .procname = "mtu_expires",
2788 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2789 .maxlen = sizeof(int),
2791 .proc_handler = proc_dointvec_jiffies,
2794 .procname = "min_adv_mss",
2795 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2796 .maxlen = sizeof(int),
2798 .proc_handler = proc_dointvec,
2801 .procname = "gc_min_interval_ms",
2802 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2803 .maxlen = sizeof(int),
2805 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Duplicate the sysctl template for a new namespace and retarget each
 * entry's .data at that namespace's fields.  The indices here must
 * match the entry order of ipv6_route_table_template.
 */
2810 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2812 struct ctl_table *table;
2814 table = kmemdup(ipv6_route_table_template,
2815 sizeof(ipv6_route_table_template),
2819 table[0].data = &net->ipv6.sysctl.flush_delay;
2820 table[0].extra1 = net;
2821 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2822 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2823 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2824 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2825 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2826 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2827 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2828 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2829 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * Per-namespace setup: clone the dst_ops template, allocate the
 * special route entries (null, and with multiple tables also
 * prohibit/blackhole) and set the routing sysctl defaults.  Error
 * paths unwind in reverse allocation order via gotos.
 */
2836 static int __net_init ip6_route_net_init(struct net *net)
2840 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2841 sizeof(net->ipv6.ip6_dst_ops));
2843 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2844 goto out_ip6_dst_ops;
2846 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2847 sizeof(*net->ipv6.ip6_null_entry),
2849 if (!net->ipv6.ip6_null_entry)
2850 goto out_ip6_dst_entries;
2851 net->ipv6.ip6_null_entry->dst.path =
2852 (struct dst_entry *)net->ipv6.ip6_null_entry;
2853 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2854 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2855 ip6_template_metrics, true);
2857 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2858 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2859 sizeof(*net->ipv6.ip6_prohibit_entry),
2861 if (!net->ipv6.ip6_prohibit_entry)
2862 goto out_ip6_null_entry;
2863 net->ipv6.ip6_prohibit_entry->dst.path =
2864 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2865 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2866 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2867 ip6_template_metrics, true);
2869 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2870 sizeof(*net->ipv6.ip6_blk_hole_entry),
2872 if (!net->ipv6.ip6_blk_hole_entry)
2873 goto out_ip6_prohibit_entry;
2874 net->ipv6.ip6_blk_hole_entry->dst.path =
2875 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2876 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2877 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2878 ip6_template_metrics, true);
/* Default tunables; overridable through net.ipv6.route.* sysctls. */
2881 net->ipv6.sysctl.flush_delay = 0;
2882 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2883 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2884 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2885 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2886 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2887 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2888 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2890 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2896 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2897 out_ip6_prohibit_entry:
2898 kfree(net->ipv6.ip6_prohibit_entry);
2900 kfree(net->ipv6.ip6_null_entry);
2902 out_ip6_dst_entries:
2903 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-namespace teardown: free what ip6_route_net_init() allocated. */
2908 static void __net_exit ip6_route_net_exit(struct net *net)
2910 kfree(net->ipv6.ip6_null_entry);
2911 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2912 kfree(net->ipv6.ip6_prohibit_entry);
2913 kfree(net->ipv6.ip6_blk_hole_entry);
2915 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2918 static int __net_init ip6_route_net_init_late(struct net *net)
2920 #ifdef CONFIG_PROC_FS
2921 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2922 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/*
 * Late per-netns teardown: remove the proc entries registered by
 * ip6_route_net_init_late() for this namespace.
 *
 * NOTE(review): fragment — braces and #endif are missing from the
 * visible text.
 */
2927 static void __net_exit ip6_route_net_exit_late(struct net *net)
2929 #ifdef CONFIG_PROC_FS
2930 	proc_net_remove(net, "ipv6_route");
2931 	proc_net_remove(net, "rt6_stats");
/*
 * Main pernet operations: per-namespace setup/teardown of the IPv6
 * routing state (template entries, dst accounting, sysctl defaults).
 * NOTE(review): the terminating "};" line is missing from this fragment.
 */
2935 static struct pernet_operations ip6_route_net_ops = {
2936 	.init = ip6_route_net_init,
2937 	.exit = ip6_route_net_exit,
/*
 * Late pernet operations: registered after fib6/rules are up so the
 * proc files only appear once the routing core is functional.
 * NOTE(review): the terminating "};" line is missing from this fragment.
 */
2940 static struct pernet_operations ip6_route_net_late_ops = {
2941 	.init = ip6_route_net_init_late,
2942 	.exit = ip6_route_net_exit_late,
/*
 * Netdevice event notifier; handler is defined elsewhere in this file.
 * NOTE(review): the terminating "};" line is missing from this fragment.
 */
2945 static struct notifier_block ip6_route_dev_notifier = {
2946 	.notifier_call = ip6_route_dev_notify,
/*
 * Module init for the IPv6 routing subsystem. Order matters:
 *   1. create the rt6_info slab cache shared by all namespaces;
 *   2. init dst-entry accounting for the blackhole ops;
 *   3. register the main pernet subsystem (allocates per-netns templates);
 *   4. patch init_net's template entries to reference the loopback device
 *      (loopback was registered before this code ran — see comment below);
 *   5. fib6 rules, late pernet subsystem, rtnetlink handlers, and finally
 *      the netdevice notifier.
 * Errors unwind in reverse via the goto labels at the bottom.
 *
 * NOTE(review): this fragment is missing many lines — the opening brace,
 * the "int ret;" declaration, every "if (ret)" / "if (ret < 0)" check,
 * several goto labels (out, out_dst_entries, out_kmem_cache,
 * fib6_rules_init) and return statements. Do not treat the visible
 * control flow as complete.
 */
2950 int __init ip6_route_init(void)
2955 	ip6_dst_ops_template.kmem_cachep =
2956 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2957 				  SLAB_HWCACHE_ALIGN, NULL);
2958 	if (!ip6_dst_ops_template.kmem_cachep)
2961 	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2963 		goto out_kmem_cache;
2965 	ret = register_pernet_subsys(&ip6_route_net_ops);
2967 		goto out_dst_entries;
	/* blackhole dsts come from the same slab as regular rt6_info */
2969 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2971 	/* Registering of the loopback is done before this portion of code,
2972 	 * the loopback reference in rt6_info will not be taken, do it
2973 	 * manually for init_net */
2974 	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2975 	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2977 	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2978 	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2979 	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2980 	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2984 		goto out_register_subsys;
2990 	ret = fib6_rules_init();
2994 	ret = register_pernet_subsys(&ip6_route_net_late_ops);
2996 		goto fib6_rules_init;
	/* NULL doit/dumpit slots: handlers registered by fib6 code elsewhere */
2999 	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3000 	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3001 	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3002 		goto out_register_late_subsys;
3004 	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3006 		goto out_register_late_subsys;
	/* error unwinding — strictly reverse order of the setup above */
3011 out_register_late_subsys:
3012 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3014 	fib6_rules_cleanup();
3019 out_register_subsys:
3020 	unregister_pernet_subsys(&ip6_route_net_ops);
3022 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3024 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * Module cleanup: undo ip6_route_init() in exact reverse order —
 * notifier, late pernet subsystem, fib6 rules, main pernet subsystem,
 * blackhole dst accounting, and finally the rt6_info slab cache.
 *
 * NOTE(review): fragment — the opening brace, an xfrm6/fib6 cleanup
 * line or two, and the closing brace are not present in the visible
 * text.
 */
3028 void ip6_route_cleanup(void)
3030 	unregister_netdevice_notifier(&ip6_route_dev_notifier);
3031 	unregister_pernet_subsys(&ip6_route_net_late_ops);
3032 	fib6_rules_cleanup();
3035 	unregister_pernet_subsys(&ip6_route_net_ops);
3036 	dst_entries_destroy(&ip6_dst_blackhole_ops);
3037 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);