2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
/* Debug tracing macros.  The two RT6_TRACE definitions are the enabled
 * (printk) and disabled (no-op) variants; the #if/#else directives that
 * select between them are missing from this extract. */
65 /* Set to 3 to get tracing. */
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
73 #define RT6_TRACE(x...) do { ; } while (0)
/* Forward declarations for the dst_ops callbacks and helpers defined below.
 * NOTE(review): several declarations appear truncated (missing argument
 * lines) in this extract. */
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77 const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void ip6_dst_destroy(struct dst_entry *);
83 static void ip6_dst_ifdown(struct dst_entry *,
84 struct net_device *dev, int how);
85 static int ip6_dst_gc(struct dst_ops *ops);
87 static int ip6_pkt_discard(struct sk_buff *skb);
88 static int ip6_pkt_discard_out(struct sk_buff *skb);
89 static void ip6_link_failure(struct sk_buff *skb);
90 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94 const struct in6_addr *prefix, int prefixlen,
95 const struct in6_addr *gwaddr, int ifindex,
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98 const struct in6_addr *prefix, int prefixlen,
99 const struct in6_addr *gwaddr, int ifindex);
/* Copy-on-write of routing metrics.  Non-DST_HOST routes fall back to the
 * generic COW path; host routes bind an inet_peer and move the metrics
 * there, publishing the new pointer with cmpxchg so a concurrent writer
 * wins cleanly.  NOTE(review): several lines (null checks, the final
 * return, closing braces) are missing from this extract. */
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
104 struct rt6_info *rt = (struct rt6_info *) dst;
105 struct inet_peer *peer;
108 if (!(rt->dst.flags & DST_HOST))
109 return dst_cow_metrics_generic(dst, old);
112 rt6_bind_peer(rt, 1);
114 peer = rt->rt6i_peer;
116 u32 *old_p = __DST_METRICS_PTR(old);
117 unsigned long prev, new;
120 if (inet_metrics_new(peer))
/* Seed the peer's metrics from the route's current values. */
121 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
123 new = (unsigned long) p;
124 prev = cmpxchg(&dst->_metrics, old, new);
127 p = __DST_METRICS_PTR(prev);
128 if (prev & DST_METRICS_READ_ONLY)
/* dst_ops->neigh_lookup: resolve (or create) the ndisc neighbour entry
 * for @daddr on the dst's output device. */
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
137 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
/* dst_ops template for ordinary IPv6 routes; cloned per-netns. */
140 static struct dst_ops ip6_dst_ops_template = {
142 .protocol = cpu_to_be16(ETH_P_IPV6),
145 .check = ip6_dst_check,
146 .default_advmss = ip6_default_advmss,
148 .cow_metrics = ipv6_cow_metrics,
149 .destroy = ip6_dst_destroy,
150 .ifdown = ip6_dst_ifdown,
151 .negative_advice = ip6_negative_advice,
152 .link_failure = ip6_link_failure,
153 .update_pmtu = ip6_rt_update_pmtu,
154 .local_out = __ip6_local_out,
155 .neigh_lookup = ip6_neigh_lookup,
/* Blackhole-route callbacks: report the stored MTU (or the device MTU when
 * no metric is set), and ignore PMTU updates / metric writes — the
 * update_pmtu and cow_metrics bodies are empty stubs (their bodies are
 * elided in this extract). */
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
160 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
162 return mtu ? : dst->dev->mtu;
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops for blackhole routes (see ip6_blackhole_route()). */
175 static struct dst_ops ip6_dst_blackhole_ops = {
177 .protocol = cpu_to_be16(ETH_P_IPV6),
178 .destroy = ip6_dst_destroy,
179 .check = ip6_dst_check,
180 .mtu = ip6_blackhole_mtu,
181 .default_advmss = ip6_default_advmss,
182 .update_pmtu = ip6_rt_blackhole_update_pmtu,
183 .cow_metrics = ip6_rt_blackhole_cow_metrics,
184 .neigh_lookup = ip6_neigh_lookup,
/* Template metrics and the "null entry" route: the always-present
 * catch-all that rejects packets with -ENETUNREACH.  Its metric is the
 * worst possible (~0) so any real route wins. */
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188 [RTAX_HOPLIMIT - 1] = 0,
191 static struct rt6_info ip6_null_entry_template = {
193 .__refcnt = ATOMIC_INIT(1),
196 .error = -ENETUNREACH,
197 .input = ip6_pkt_discard,
198 .output = ip6_pkt_discard_out,
200 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
201 .rt6i_protocol = RTPROT_KERNEL,
202 .rt6i_metric = ~(u32) 0,
203 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing, two more terminal routes exist: "prohibit"
 * (administratively rejected) and "blackhole" (silently discarded). */
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
208 static int ip6_pkt_prohibit(struct sk_buff *skb);
209 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
211 static struct rt6_info ip6_prohibit_entry_template = {
213 .__refcnt = ATOMIC_INIT(1),
217 .input = ip6_pkt_prohibit,
218 .output = ip6_pkt_prohibit_out,
220 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
221 .rt6i_protocol = RTPROT_KERNEL,
222 .rt6i_metric = ~(u32) 0,
223 .rt6i_ref = ATOMIC_INIT(1),
226 static struct rt6_info ip6_blk_hole_entry_template = {
228 .__refcnt = ATOMIC_INIT(1),
/* Blackhole drops traffic without generating any ICMP error. */
232 .input = dst_discard,
233 .output = dst_discard,
235 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
236 .rt6i_protocol = RTPROT_KERNEL,
237 .rt6i_metric = ~(u32) 0,
238 .rt6i_ref = ATOMIC_INIT(1),
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245 struct net_device *dev,
248 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
/* Zero everything past the embedded dst_entry (rt6i_table onward). */
251 memset(&rt->rt6i_table, 0,
252 sizeof(*rt) - sizeof(struct dst_entry));
/* dst_ops->destroy: drop metrics (non-host routes), the inet6_dev
 * reference and the inet_peer reference held by this route.
 * NOTE(review): the in6_dev_put/inet_putpeer calls are elided here. */
257 static void ip6_dst_destroy(struct dst_entry *dst)
259 struct rt6_info *rt = (struct rt6_info *)dst;
260 struct inet6_dev *idev = rt->rt6i_idev;
261 struct inet_peer *peer = rt->rt6i_peer;
263 if (!(rt->dst.flags & DST_HOST))
264 dst_destroy_metrics_generic(dst);
267 rt->rt6i_idev = NULL;
271 rt->rt6i_peer = NULL;
/* Global generation counter for peer bindings; ip6_dst_check() uses it to
 * notice that a route's peer binding is stale. */
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
278 static u32 rt6_peer_genid(void)
280 return atomic_read(&__rt6_peer_genid);
/* Bind an inet_peer to the route; if another CPU raced us and already
 * bound one, the cmpxchg fails and the extra peer ref is dropped
 * (the inet_putpeer call is elided in this extract). */
283 void rt6_bind_peer(struct rt6_info *rt, int create)
285 struct inet_peer *peer;
287 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
291 rt->rt6i_peer_genid = rt6_peer_genid();
/* dst_ops->ifdown: the route's device is going away, so re-point its
 * inet6_dev reference at the netns loopback device to keep the dst valid
 * until it is garbage-collected. */
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
297 struct rt6_info *rt = (struct rt6_info *)dst;
298 struct inet6_dev *idev = rt->rt6i_idev;
299 struct net_device *loopback_dev =
300 dev_net(dev)->loopback_dev;
302 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303 struct inet6_dev *loopback_idev =
304 in6_dev_get(loopback_dev);
305 if (loopback_idev != NULL) {
306 rt->rt6i_idev = loopback_idev;
/* True when the route carries RTF_EXPIRES and its deadline has passed. */
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
314 return (rt->rt6i_flags & RTF_EXPIRES) &&
315 time_after(jiffies, rt->rt6i_expires);
/* Scope-sensitive destinations (multicast/link-local/loopback) require a
 * strict interface match during lookup. */
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
320 return ipv6_addr_type(daddr) &
321 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
/*
 * Walk the sibling list starting at @rt and pick the entry whose device
 * matches @oif (or whose source address check passes when no oif is
 * given).  Loopback routes are tracked as a fallback in @local.
 * NOTE(review): several branches and the final returns are elided in
 * this extract.
 */
325 * Route lookup. Any table->tb6_lock is implied.
328 static inline struct rt6_info *rt6_device_match(struct net *net,
330 const struct in6_addr *saddr,
334 struct rt6_info *local = NULL;
335 struct rt6_info *sprt;
337 if (!oif && ipv6_addr_any(saddr))
340 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 struct net_device *dev = sprt->rt6i_dev;
344 if (dev->ifindex == oif)
346 if (dev->flags & IFF_LOOPBACK) {
347 if (sprt->rt6i_idev == NULL ||
348 sprt->rt6i_idev->dev->ifindex != oif) {
349 if (flags & RT6_LOOKUP_F_IFACE && oif)
351 if (local && (!oif ||
352 local->rt6i_idev->dev->ifindex == oif))
358 if (ipv6_chk_addr(net, saddr, dev,
359 flags & RT6_LOOKUP_F_IFACE))
/* Strict interface lookup with no match ends at the null entry. */
368 if (flags & RT6_LOOKUP_F_IFACE)
369 return net->ipv6.ip6_null_entry;
/* Router Reachability Probing (RFC 4191): if the route's neighbour is not
 * in a VALID state and the probe interval has elapsed, send a unicast-
 * solicitation NS to the router.  The empty inline at the bottom is the
 * !CONFIG_IPV6_ROUTER_PREF stub (the #else directive is elided here). */
375 #ifdef CONFIG_IPV6_ROUTER_PREF
376 static void rt6_probe(struct rt6_info *rt)
378 struct neighbour *neigh;
380 * Okay, this does not seem to be appropriate
381 * for now, however, we need to check if it
382 * is really so; aka Router Reachability Probing.
384 * Router Reachability Probe MUST be rate-limited
385 * to no more than one per minute.
388 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
389 if (!neigh || (neigh->nud_state & NUD_VALID))
391 read_lock_bh(&neigh->lock);
392 if (!(neigh->nud_state & NUD_VALID) &&
393 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 struct in6_addr mcaddr;
395 struct in6_addr *target;
397 neigh->updated = jiffies;
398 read_unlock_bh(&neigh->lock);
400 target = (struct in6_addr *)&neigh->primary_key;
401 addrconf_addr_solict_mult(target, &mcaddr);
402 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
404 read_unlock_bh(&neigh->lock);
410 static inline void rt6_probe(struct rt6_info *rt)
/*
 * rt6_check_dev: does @rt's device satisfy the requested oif?  A loopback
 * route also matches when its idev belongs to the requested interface.
 * rt6_check_neigh: grade the route's nexthop reachability from neighbour
 * state (return values are elided in this extract).
 */
416 * Default Router Selection (RFC 2461 6.3.6)
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
435 neigh = dst_get_neighbour(&rt->dst);
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
449 read_unlock_bh(&neigh->lock);
/* Combine device match, router preference (RFC 4191) and neighbour
 * reachability into a single comparable score for rt6_select(). */
456 static int rt6_score_route(struct rt6_info *rt, int oif,
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/* Score @rt and keep it as the new best match when it beats *mpri;
 * expired routes are skipped.  (The probe/update branches are elided.) */
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
478 if (rt6_check_expired(rt))
481 m = rt6_score_route(rt, oif, strict)
486 if (strict & RT6_LOOKUP_F_REACHABLE)
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/* Round-robin scan: first from rr_head to the end of the equal-metric
 * run, then wrap from the node's leaf back up to rr_head. */
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
/* Default router selection (RFC 4861 6.3.6): pick the best route among the
 * node's equal-metric siblings, advancing the round-robin pointer when no
 * reachable router was found.  Falls back to the null entry. */
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
542 RT6_TRACE("%s() => %p\n",
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
/* Process a Route Information Option from a Router Advertisement
 * (RFC 4191): validate length/prefix_len, look up or add the
 * corresponding RTF_ROUTEINFO route, and apply preference and lifetime.
 * A zero lifetime deletes the route (that branch is elided here). */
549 #ifdef CONFIG_IPV6_ROUTE_INFO
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
557 unsigned long lifetime;
560 if (len < sizeof(struct route_info)) {
564 /* Sanity check for prefix_len and length */
565 if (rinfo->length > 3) {
567 } else if (rinfo->prefix_len > 128) {
569 } else if (rinfo->prefix_len > 64) {
570 if (rinfo->length < 2) {
573 } else if (rinfo->prefix_len > 0) {
574 if (rinfo->length < 1) {
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means the option carries a full 128-bit prefix. */
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
592 prefix = &prefix_buf;
595 if (rinfo->prefix_len == 0)
596 rt = rt6_get_dflt_router(gwaddr, dev);
598 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
599 gwaddr, dev->ifindex);
601 if (rt && !lifetime) {
607 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
610 rt->rt6i_flags = RTF_ROUTEINFO |
611 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
614 if (!addrconf_finite_timeout(lifetime)) {
615 rt->rt6i_flags &= ~RTF_EXPIRES;
617 rt->rt6i_expires = jiffies + HZ * lifetime;
618 rt->rt6i_flags |= RTF_EXPIRES;
620 dst_release(&rt->dst);
/* When a lookup lands on the null entry, walk back up the fib6 tree
 * (descending into source-address subtrees) until a node with real route
 * info is found.  Used inside the route lookup functions below; expects
 * `rt', `fn' and a `restart' label in the enclosing scope. */
626 #define BACKTRACK(__net, saddr) \
628 if (rt == __net->ipv6.ip6_null_entry) { \
629 struct fib6_node *pn; \
631 if (fn->fn_flags & RTN_TL_ROOT) \
634 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
635 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
638 if (fn->fn_flags & RTN_RTINFO) \
/* Simple (non-caching) table lookup under tb6_lock: find the node for the
 * flow, match the device, backtrack if needed, and take a dst reference. */
644 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
645 struct fib6_table *table,
646 struct flowi6 *fl6, int flags)
648 struct fib6_node *fn;
651 read_lock_bh(&table->tb6_lock);
652 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
655 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
656 BACKTRACK(net, &fl6->saddr);
658 dst_use(&rt->dst, jiffies);
659 read_unlock_bh(&table->tb6_lock);
/* Public lookup helper: build a flowi6 for (daddr, saddr, oif) and run it
 * through the policy-routing rules with ip6_pol_route_lookup. */
664 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
665 const struct in6_addr *saddr, int oif, int strict)
667 struct flowi6 fl6 = {
671 struct dst_entry *dst;
672 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
675 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
676 flags |= RT6_LOOKUP_F_HAS_SADDR;
679 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
681 return (struct rt6_info *) dst;
688 EXPORT_SYMBOL(rt6_lookup);
690 /* ip6_ins_rt is called with FREE table->tb6_lock.
691 It takes new route entry, the addition fails by any reason the
692 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its fib6 table under the table write lock. */
696 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
699 struct fib6_table *table;
701 table = rt->rt6i_table;
702 write_lock_bh(&table->tb6_lock);
703 err = fib6_add(&table->tb6_root, rt, info);
704 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper supplying default netlink info for kernel-originated
 * insertions. */
709 int ip6_ins_rt(struct rt6_info *rt)
711 struct nl_info info = {
712 .nl_net = dev_net(rt->rt6i_dev),
714 return __ip6_ins_rt(rt, &info);
/* Clone @ort into a host (RTF_CACHE) route for @daddr and resolve its
 * neighbour entry.  On neighbour-table overflow, temporarily tighten the
 * GC sysctls and retry (once, and only outside softirq context). */
717 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
718 const struct in6_addr *daddr,
719 const struct in6_addr *saddr)
727 rt = ip6_rt_copy(ort, daddr);
730 struct neighbour *neigh;
731 int attempts = !in_softirq();
733 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
/* On-link clone of a non-/128 route whose prefix equals daddr is an
 * anycast destination. */
734 if (ort->rt6i_dst.plen != 128 &&
735 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
736 rt->rt6i_flags |= RTF_ANYCAST;
737 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
740 rt->rt6i_flags |= RTF_CACHE;
742 #ifdef CONFIG_IPV6_SUBTREES
743 if (rt->rt6i_src.plen && saddr) {
744 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
745 rt->rt6i_src.plen = 128;
750 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
752 struct net *net = dev_net(rt->rt6i_dev);
753 int saved_rt_min_interval =
754 net->ipv6.sysctl.ip6_rt_gc_min_interval;
755 int saved_rt_elasticity =
756 net->ipv6.sysctl.ip6_rt_gc_elasticity;
758 if (attempts-- > 0) {
/* Force an aggressive GC pass, then restore the saved sysctls. */
759 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
760 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
762 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
764 net->ipv6.sysctl.ip6_rt_gc_elasticity =
766 net->ipv6.sysctl.ip6_rt_gc_min_interval =
767 saved_rt_min_interval;
773 "ipv6: Neighbour table overflow.\n");
777 dst_set_neighbour(&rt->dst, neigh);
/* Clone @ort into a cached host route for @daddr, sharing (cloning) the
 * parent's neighbour entry instead of resolving a new one. */
784 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
785 const struct in6_addr *daddr)
787 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
790 rt->rt6i_flags |= RTF_CACHE;
791 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
/* Core routing slow path shared by input and output lookups: select the
 * best route under tb6_lock, then — outside the lock — create a cached
 * clone (COW for gateway/neighbour-less routes, plain clone for non-host
 * routes) and insert it, re-looking-up on insert races.
 * NOTE(review): the retry/relookup labels and several branches are elided
 * in this extract. */
796 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
797 struct flowi6 *fl6, int flags, bool input)
799 struct fib6_node *fn;
800 struct rt6_info *rt, *nrt;
/* Reachability is only required when forwarding is disabled (host role). */
804 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805 int local = RTF_NONEXTHOP;
807 strict |= flags & RT6_LOOKUP_F_IFACE;
812 read_lock_bh(&table->tb6_lock);
815 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
818 rt = rt6_select(fn, oif, strict | reachable);
820 BACKTRACK(net, &fl6->saddr);
821 if (rt == net->ipv6.ip6_null_entry ||
822 rt->rt6i_flags & RTF_CACHE)
826 read_unlock_bh(&table->tb6_lock);
828 if (!dst_get_neighbour_raw(&rt->dst)
829 && !(rt->rt6i_flags & local))
830 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
831 else if (!(rt->dst.flags & DST_HOST))
832 nrt = rt6_alloc_clone(rt, &fl6->daddr);
836 dst_release(&rt->dst);
837 rt = nrt ? : net->ipv6.ip6_null_entry;
841 err = ip6_ins_rt(nrt);
850 * Race condition! In the gap, when table->tb6_lock was
851 * released someone could insert this route. Relookup.
853 dst_release(&rt->dst);
862 read_unlock_bh(&table->tb6_lock);
864 rt->dst.lastuse = jiffies;
/* Input-path lookup: route by the incoming interface. */
870 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
871 struct flowi6 *fl6, int flags)
873 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
/* Route an incoming skb: build the flow key from the IPv6 header and
 * attach the resulting dst to the skb. */
876 void ip6_route_input(struct sk_buff *skb)
878 const struct ipv6hdr *iph = ipv6_hdr(skb);
879 struct net *net = dev_net(skb->dev);
880 int flags = RT6_LOOKUP_F_HAS_SADDR;
881 struct flowi6 fl6 = {
882 .flowi6_iif = skb->dev->ifindex,
885 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
886 .flowi6_mark = skb->mark,
887 .flowi6_proto = iph->nexthdr,
/* PIM register devices are exempt from strict-interface matching. */
890 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
891 flags |= RT6_LOOKUP_F_IFACE;
893 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
/* Output-path lookup: route by the outgoing interface in the flow key. */
896 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
897 struct flowi6 *fl6, int flags)
899 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
/* Public output route lookup; honours socket device binding and source-
 * address preferences. */
902 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
907 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
908 flags |= RT6_LOOKUP_F_IFACE;
910 if (!ipv6_addr_any(&fl6->saddr))
911 flags |= RT6_LOOKUP_F_HAS_SADDR;
913 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
915 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
918 EXPORT_SYMBOL(ip6_route_output);
/* Create a blackhole copy of @dst_orig (used e.g. for IPsec): same
 * addressing information, but input/output discard every packet.
 * Always releases @dst_orig; returns -ENOMEM on allocation failure. */
920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
922 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
923 struct dst_entry *new = NULL;
925 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
927 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
932 new->input = dst_discard;
933 new->output = dst_discard;
/* Read-only metrics can be shared by pointer; otherwise copy them. */
935 if (dst_metrics_read_only(&ort->dst))
936 new->_metrics = ort->dst._metrics;
938 dst_copy_metrics(new, &ort->dst);
939 rt->rt6i_idev = ort->rt6i_idev;
941 in6_dev_hold(rt->rt6i_idev);
942 rt->rt6i_expires = 0;
944 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
945 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
948 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
949 #ifdef CONFIG_IPV6_SUBTREES
950 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
956 dst_release(dst_orig);
957 return new ? new : ERR_PTR(-ENOMEM);
/*
 * ip6_dst_check: validate a cached dst against the fib6 cookie; refresh
 * a stale peer binding while at it.  ip6_negative_advice: called when a
 * socket gives up on the route — expired cached entries are dropped
 * (the deletion path is elided in this extract).
 */
961 * Destination cache support functions
964 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
969 rt = (struct rt6_info *) dst;
971 if (rt6_get_cookie_safe(rt, &rt_cookie) && rt_cookie == cookie) {
972 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
974 rt6_bind_peer(rt, 0);
975 rt->rt6i_peer_genid = rt6_peer_genid();
982 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
984 struct rt6_info *rt = (struct rt6_info *) dst;
987 if (rt->rt6i_flags & RTF_CACHE) {
988 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure: report unreachability to the sender via ICMPv6
 * and expire the cached/default route that carried the packet. */
1000 static void ip6_link_failure(struct sk_buff *skb)
1002 struct rt6_info *rt;
1004 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1006 rt = (struct rt6_info *) skb_dst(skb);
1008 if (rt->rt6i_flags&RTF_CACHE) {
1009 dst_set_expires(&rt->dst, 0);
1010 rt->rt6i_flags |= RTF_EXPIRES;
1012 struct fib6_node *fn;
1015 fn = rcu_dereference(rt->rt6i_node);
1016 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* dst_ops->update_pmtu: record a smaller path MTU on host routes, marking
 * the route RTF_MODIFIED.  Values below IPV6_MIN_MTU trigger the
 * fragmentation-header workaround (that branch is elided here). */
1023 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1025 struct rt6_info *rt6 = (struct rt6_info*)dst;
1027 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1028 rt6->rt6i_flags |= RTF_MODIFIED;
1029 if (mtu < IPV6_MIN_MTU)
1032 dst_metric_set(dst, RTAX_MTU, mtu);
/* dst_ops->default_advmss: derive the advertised MSS from the path MTU,
 * clamped below by the ip6_rt_min_advmss sysctl and above by the maximal
 * non-jumbo payload. */
1036 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1038 struct net_device *dev = dst->dev;
1039 unsigned int mtu = dst_mtu(dst);
1040 struct net *net = dev_net(dev);
1042 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1044 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1045 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1048 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1049 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1050 * IPV6_MAXPLEN is also valid and means: "any MSS,
1051 * rely only on pmtu discovery"
1053 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu: use the explicit RTAX_MTU metric if set, otherwise the
 * interface's configured IPv6 MTU, capped at IP6_MAX_MTU. */
1058 static unsigned int ip6_mtu(const struct dst_entry *dst)
1060 struct inet6_dev *idev;
1061 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1069 idev = __in6_dev_get(dst->dev);
1071 mtu = idev->cnf.mtu6;
1075 return min_t(unsigned int, mtu, IP6_MAX_MTU);
/* ICMPv6 replies use throwaway dsts that live on a private list (below)
 * rather than in the fib6 tree; icmp6_dst_gc() reaps them. */
1078 static struct dst_entry *icmp6_dst_gc_list;
1079 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Allocate a host dst for sending an ICMPv6 packet to @addr via @dev,
 * resolving a neighbour if the caller did not supply one, and chain it
 * onto the GC list. */
1081 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1082 struct neighbour *neigh,
1083 const struct in6_addr *addr)
1085 struct rt6_info *rt;
1086 struct inet6_dev *idev = in6_dev_get(dev);
1087 struct net *net = dev_net(dev);
1089 if (unlikely(idev == NULL))
1092 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1093 if (unlikely(rt == NULL)) {
1101 neigh = ndisc_get_neigh(dev, addr);
1106 rt->dst.flags |= DST_HOST;
1107 rt->dst.output = ip6_output;
1108 dst_set_neighbour(&rt->dst, neigh);
1109 atomic_set(&rt->dst.__refcnt, 1);
1110 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1111 rt->rt6i_dst.plen = 128;
1112 rt->rt6i_idev = idev;
/* HOPLIMIT 0 means "use the interface default" (see ip6_dst_hoplimit). */
1113 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1115 spin_lock_bh(&icmp6_dst_lock);
1116 rt->dst.next = icmp6_dst_gc_list;
1117 icmp6_dst_gc_list = &rt->dst;
1118 spin_unlock_bh(&icmp6_dst_lock);
1120 fib6_force_start_gc(net);
/* Reap unreferenced entries from the ICMPv6 dst list (icmp6_dst_gc) and
 * apply a predicate-driven sweep over it (icmp6_clean_all); both walk the
 * singly linked list with a pointer-to-pointer so unlinking is O(1).
 * NOTE(review): the unlink/free statements are elided in this extract. */
1126 int icmp6_dst_gc(void)
1128 struct dst_entry *dst, **pprev;
1131 spin_lock_bh(&icmp6_dst_lock);
1132 pprev = &icmp6_dst_gc_list;
1134 while ((dst = *pprev) != NULL) {
1135 if (!atomic_read(&dst->__refcnt)) {
1144 spin_unlock_bh(&icmp6_dst_lock);
1149 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1152 struct dst_entry *dst, **pprev;
1154 spin_lock_bh(&icmp6_dst_lock);
1155 pprev = &icmp6_dst_gc_list;
1156 while ((dst = *pprev) != NULL) {
1157 struct rt6_info *rt = (struct rt6_info *) dst;
1158 if (func(rt, arg)) {
1165 spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops->gc: rate-limited garbage collection of the routing cache,
 * driven by the ip6_rt_* sysctls.  Returns nonzero when the cache is
 * still over rt_max_size after collection (signals allocation failure). */
1168 static int ip6_dst_gc(struct dst_ops *ops)
1170 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1171 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1172 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1173 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1174 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1175 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
/* Skip GC entirely when inside the min interval and under the cap. */
1178 entries = dst_entries_get_fast(ops);
1179 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1180 entries <= rt_max_size)
1183 net->ipv6.ip6_rt_gc_expire++;
1184 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size);
1185 entries = dst_entries_get_slow(ops);
1186 if (entries < ops->gc_thresh)
1187 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
/* Decay the aggressiveness of future passes by the elasticity factor. */
1189 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1190 return entries > rt_max_size;
1193 /* Clean host part of a prefix. Not necessary in radix tree,
1194 but results in cleaner routing tables.
1196 Remove it only when all the things will work!
/* Effective hop limit for a dst: the RTAX_HOPLIMIT metric if nonzero,
 * else the interface's configured hop limit, else the netns default. */
1199 int ip6_dst_hoplimit(struct dst_entry *dst)
1201 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1202 if (hoplimit == 0) {
1203 struct net_device *dev = dst->dev;
1204 struct inet6_dev *idev;
1207 idev = __in6_dev_get(dev);
1209 hoplimit = idev->cnf.hop_limit;
1211 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1216 EXPORT_SYMBOL(ip6_dst_hoplimit);
/* Add a route described by @cfg (from netlink or ioctl): validate the
 * config, allocate and populate an rt6_info, resolve the gateway/device,
 * apply metrics, and insert it into the proper table.
 * NOTE(review): many error-path lines (gotos, out labels, cleanup) are
 * elided in this extract. */
1222 int ip6_route_add(struct fib6_config *cfg)
1225 struct net *net = cfg->fc_nlinfo.nl_net;
1226 struct rt6_info *rt = NULL;
1227 struct net_device *dev = NULL;
1228 struct inet6_dev *idev = NULL;
1229 struct fib6_table *table;
/* Prefix lengths beyond 128 bits are invalid. */
1232 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1234 #ifndef CONFIG_IPV6_SUBTREES
1235 if (cfg->fc_src_len)
1238 if (cfg->fc_ifindex) {
1240 dev = dev_get_by_index(net, cfg->fc_ifindex);
1243 idev = in6_dev_get(dev);
1248 if (cfg->fc_metric == 0)
1249 cfg->fc_metric = IP6_RT_PRIO_USER;
1251 table = fib6_new_table(net, cfg->fc_table);
1252 if (table == NULL) {
1257 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1264 rt->dst.obsolete = -1;
1265 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1266 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1269 if (cfg->fc_protocol == RTPROT_UNSPEC)
1270 cfg->fc_protocol = RTPROT_BOOT;
1271 rt->rt6i_protocol = cfg->fc_protocol;
1273 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Pick the input handler from the destination's address class. */
1275 if (addr_type & IPV6_ADDR_MULTICAST)
1276 rt->dst.input = ip6_mc_input;
1277 else if (cfg->fc_flags & RTF_LOCAL)
1278 rt->dst.input = ip6_input;
1280 rt->dst.input = ip6_forward;
1282 rt->dst.output = ip6_output;
1284 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1285 rt->rt6i_dst.plen = cfg->fc_dst_len;
1286 if (rt->rt6i_dst.plen == 128)
1287 rt->dst.flags |= DST_HOST;
/* Non-host routes with explicit metrics need their own metrics array
 * (host routes store metrics in the inet_peer via COW). */
1289 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1290 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1295 dst_init_metrics(&rt->dst, metrics, 0);
1297 #ifdef CONFIG_IPV6_SUBTREES
1298 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1299 rt->rt6i_src.plen = cfg->fc_src_len;
1302 rt->rt6i_metric = cfg->fc_metric;
1304 /* We cannot add true routes via loopback here,
1305 they would result in kernel looping; promote them to reject routes
1307 if ((cfg->fc_flags & RTF_REJECT) ||
1308 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1309 && !(cfg->fc_flags&RTF_LOCAL))) {
1310 /* hold loopback dev/idev if we haven't done so. */
1311 if (dev != net->loopback_dev) {
1316 dev = net->loopback_dev;
1318 idev = in6_dev_get(dev);
1324 rt->dst.output = ip6_pkt_discard_out;
1325 rt->dst.input = ip6_pkt_discard;
1326 rt->dst.error = -ENETUNREACH;
1327 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1331 if (cfg->fc_flags & RTF_GATEWAY) {
1332 const struct in6_addr *gw_addr;
1335 gw_addr = &cfg->fc_gateway;
1336 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1337 gwa_type = ipv6_addr_type(gw_addr);
1339 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1340 struct rt6_info *grt;
1342 /* IPv6 strictly inhibits using not link-local
1343 addresses as nexthop address.
1344 Otherwise, router will not able to send redirects.
1345 It is very good, but in some (rare!) circumstances
1346 (SIT, PtP, NBMA NOARP links) it is handy to allow
1347 some exceptions. --ANK
1350 if (!(gwa_type&IPV6_ADDR_UNICAST))
/* The gateway itself must be reachable via a non-gateway route. */
1353 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1355 err = -EHOSTUNREACH;
1359 if (dev != grt->rt6i_dev) {
1360 dst_release(&grt->dst);
1364 dev = grt->rt6i_dev;
1365 idev = grt->rt6i_idev;
1367 in6_dev_hold(grt->rt6i_idev);
1369 if (!(grt->rt6i_flags&RTF_GATEWAY))
1371 dst_release(&grt->dst);
1377 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
/* Preferred source address, if given, must be assigned to the device. */
1385 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1386 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1390 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1391 rt->rt6i_prefsrc.plen = 128;
1393 rt->rt6i_prefsrc.plen = 0;
1395 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1396 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1401 dst_set_neighbour(&rt->dst, n);
1404 rt->rt6i_flags = cfg->fc_flags;
/* Apply netlink-supplied RTAX_* metrics. */
1411 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1412 int type = nla_type(nla);
1415 if (type > RTAX_MAX) {
1420 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1426 rt->rt6i_idev = idev;
1427 rt->rt6i_table = table;
1429 cfg->fc_nlinfo.nl_net = dev_net(dev);
1431 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/* Remove @rt from its table under the write lock; deleting the null entry
 * is refused.  ip6_del_rt() is the wrapper with default netlink info. */
1443 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1446 struct fib6_table *table;
1447 struct net *net = dev_net(rt->rt6i_dev);
1449 if (rt == net->ipv6.ip6_null_entry) {
1454 table = rt->rt6i_table;
1455 write_lock_bh(&table->tb6_lock);
1456 err = fib6_del(rt, info);
1457 write_unlock_bh(&table->tb6_lock);
1460 dst_release(&rt->dst);
1464 int ip6_del_rt(struct rt6_info *rt)
1466 struct nl_info info = {
1467 .nl_net = dev_net(rt->rt6i_dev),
1469 return __ip6_del_rt(rt, &info);
/* Delete the route matching @cfg: locate the fib6 node for the dst/src
 * prefixes, then scan its leaf list for an entry matching the requested
 * ifindex, gateway and metric. */
1472 static int ip6_route_del(struct fib6_config *cfg)
1474 struct fib6_table *table;
1475 struct fib6_node *fn;
1476 struct rt6_info *rt;
1479 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1483 read_lock_bh(&table->tb6_lock);
1485 fn = fib6_locate(&table->tb6_root,
1486 &cfg->fc_dst, cfg->fc_dst_len,
1487 &cfg->fc_src, cfg->fc_src_len);
1490 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1491 if (cfg->fc_ifindex &&
1492 (rt->rt6i_dev == NULL ||
1493 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1495 if (cfg->fc_flags & RTF_GATEWAY &&
1496 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1498 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Found: drop the read lock before taking the write lock in
 * __ip6_del_rt (the dst_hold is elided in this extract). */
1501 read_unlock_bh(&table->tb6_lock);
1503 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1506 read_unlock_bh(&table->tb6_lock);
/* Flow key extended with the redirecting router's address, so the lookup
 * below can verify the redirect came from the current nexthop. */
1514 struct ip6rd_flowi {
1516 struct in6_addr gateway;
/* Find the route the redirect applies to: a gateway route out the
 * receiving interface whose nexthop equals the redirecting router. */
1519 static struct rt6_info *__ip6_route_redirect(struct net *net,
1520 struct fib6_table *table,
1524 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1525 struct rt6_info *rt;
1526 struct fib6_node *fn;
1529 * Get the "current" route for this destination and
1530 * check if the redirect has come from approriate router.
1532 * RFC 2461 specifies that redirects should only be
1533 * accepted if they come from the nexthop to the target.
1534 * Due to the way the routes are chosen, this notion
1535 * is a bit fuzzy and one might need to check all possible
1539 read_lock_bh(&table->tb6_lock);
1540 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1542 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1544 * Current route is on-link; redirect is always invalid.
1546 * Seems, previous statement is not true. It could
1547 * be node, which looks for us as on-link (f.e. proxy ndisc)
1548 * But then router serving it might decide, that we should
1549 * know truth 8)8) --ANK (980726).
1551 if (rt6_check_expired(rt))
1553 if (!(rt->rt6i_flags & RTF_GATEWAY))
1555 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1557 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1563 rt = net->ipv6.ip6_null_entry;
1564 BACKTRACK(net, &fl6->saddr);
1568 read_unlock_bh(&table->tb6_lock);
/* Build the extended redirect flow key and run it through the policy
 * rules with __ip6_route_redirect as the lookup function. */
1573 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1574 const struct in6_addr *src,
1575 const struct in6_addr *gateway,
1576 struct net_device *dev)
1578 int flags = RT6_LOOKUP_F_HAS_SADDR;
1579 struct net *net = dev_net(dev);
1580 struct ip6rd_flowi rdfl = {
1582 .flowi6_oif = dev->ifindex,
1588 ipv6_addr_copy(&rdfl.gateway, gateway);
1590 if (rt6_need_strict(dest))
1591 flags |= RT6_LOOKUP_F_IFACE;
1593 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1594 flags, __ip6_route_redirect);
/*
 * Process a received ICMPv6 Redirect: validate the sender via
 * ip6_route_redirect(), update the neighbour cache with the new
 * link-layer address, then install a RTF_CACHE clone of the route
 * pointing at the new next hop and notify netevent listeners.
 */
1597 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1598 const struct in6_addr *saddr,
1599 struct neighbour *neigh, u8 *lladdr, int on_link)
1601 struct rt6_info *rt, *nrt = NULL;
1602 struct netevent_redirect netevent;
1603 struct net *net = dev_net(neigh->dev);
1605 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
/* Redirect did not come from the current next hop: reject it. */
1607 if (rt == net->ipv6.ip6_null_entry) {
1608 if (net_ratelimit())
1609 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1610 "for redirect target\n");
1615 * We have finally decided to accept it.
1618 neigh_update(neigh, lladdr, NUD_STALE,
1619 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1620 NEIGH_UPDATE_F_OVERRIDE|
1621 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1622 NEIGH_UPDATE_F_ISROUTER))
1626 * Redirect received -> path was valid.
1627 * Look, redirects are sent only in response to data packets,
1628 * so that this nexthop apparently is reachable. --ANK
1630 dst_confirm(&rt->dst);
1632 /* Duplicate redirect: silently ignore. */
1633 if (neigh == dst_get_neighbour_raw(&rt->dst))
/* Clone the route and repoint it at the advertised gateway. */
1636 nrt = ip6_rt_copy(rt, dest);
1640 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1642 nrt->rt6i_flags &= ~RTF_GATEWAY;
1644 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1645 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1647 if (ip6_ins_rt(nrt))
1650 netevent.old = &rt->dst;
1651 netevent.new = &nrt->dst;
1652 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* The superseded cache entry can be deleted immediately. */
1654 if (rt->rt6i_flags&RTF_CACHE) {
1660 dst_release(&rt->dst);
1664 * Handle ICMP "packet too big" messages
1665 * i.e. Path MTU discovery
/*
 * Apply a Packet Too Big report: look up the route to daddr (optionally
 * restricted to ifindex) and lower its path MTU, either in place for a
 * cache entry or by creating a host-route clone with RTF_EXPIRES set.
 */
1668 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1669 struct net *net, u32 pmtu, int ifindex)
1671 struct rt6_info *rt, *nrt;
1674 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1678 if (rt6_check_expired(rt)) {
/* Only a smaller MTU is a valid PMTU update. */
1683 if (pmtu >= dst_mtu(&rt->dst))
1686 if (pmtu < IPV6_MIN_MTU) {
1688 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1689 * MTU (1280) and a fragment header should always be included
1690 * after a node receiving Too Big message reporting PMTU is
1691 * less than the IPv6 Minimum Link MTU.
1693 pmtu = IPV6_MIN_MTU;
1697 /* New mtu received -> path was valid.
1698 They are sent only in response to data packets,
1699 so that this nexthop apparently is reachable. --ANK
1701 dst_confirm(&rt->dst);
1703 /* Host route. If it is static, it would be better
1704 not to override it, but add new one, so that
1705 when cache entry will expire old pmtu
1706 would return automatically.
1708 if (rt->rt6i_flags & RTF_CACHE) {
1709 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
/* Below IPV6_MIN_MTU all packets need a fragment header. */
1711 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1712 features |= RTAX_FEATURE_ALLFRAG;
1713 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1715 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1716 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1721 Two cases are possible:
1722 1. It is connected route. Action: COW
1723 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1725 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1726 nrt = rt6_alloc_cow(rt, daddr, saddr);
1728 nrt = rt6_alloc_clone(rt, daddr);
1731 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1733 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1734 features |= RTAX_FEATURE_ALLFRAG;
1735 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1738 /* According to RFC 1981, detecting PMTU increase shouldn't be
1739 * happened within 5 mins, the recommended timer is 10 mins.
1740 * Here this route expiration time is set to ip6_rt_mtu_expires
1741 * which is 10 mins. After 10 mins the decreased pmtu is expired
1742 * and detecting PMTU increase will be automatically happened.
1744 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1745 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1750 dst_release(&rt->dst);
/*
 * Entry point for a received Packet Too Big: apply the PMTU update both
 * without an interface restriction and for the receiving device, since
 * the original egress interface cannot be determined in general.
 */
1753 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1754 struct net_device *dev, u32 pmtu)
1756 struct net *net = dev_net(dev);
1759 * RFC 1981 states that a node "MUST reduce the size of the packets it
1760 * is sending along the path" that caused the Packet Too Big message.
1761 * Since it's not possible in the general case to determine which
1762 * interface was used to send the original packet, we update the MTU
1763 * on the interface that will be used to send future packets. We also
1764 * update the MTU on the interface that received the Packet Too Big in
1765 * case the original packet was forced out that interface with
1766 * SO_BINDTODEVICE or similar. This is the next best thing to the
1767 * correct behaviour, which would be to update the MTU on all
1770 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1771 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1775 * Misc support functions
/*
 * Allocate a new rt6_info duplicating 'ort' but keyed as a /128 host
 * route to 'dest'. Metrics, flags (minus RTF_EXPIRES), gateway and
 * idev are copied; the cached-entry metric is reset to 0.
 */
1778 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1779 const struct in6_addr *dest)
1781 struct net *net = dev_net(ort->rt6i_dev);
1782 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1786 rt->dst.input = ort->dst.input;
1787 rt->dst.output = ort->dst.output;
1788 rt->dst.flags |= DST_HOST;
1790 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1791 rt->rt6i_dst.plen = 128;
1792 dst_copy_metrics(&rt->dst, &ort->dst);
1793 rt->dst.error = ort->dst.error;
1794 rt->rt6i_idev = ort->rt6i_idev;
/* The copy holds its own reference on the inet6 device. */
1796 in6_dev_hold(rt->rt6i_idev);
1797 rt->dst.lastuse = jiffies;
1798 rt->rt6i_expires = 0;
1800 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1801 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1802 rt->rt6i_metric = 0;
1804 #ifdef CONFIG_IPV6_SUBTREES
1805 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1807 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1808 rt->rt6i_table = ort->rt6i_table;
1813 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Find an RA Route Information route (RTF_ROUTEINFO|RTF_GATEWAY) for
 * the given prefix, learned from 'gwaddr' via interface 'ifindex', in
 * the RT6_TABLE_INFO table. Returns NULL if not present.
 */
1814 static struct rt6_info *rt6_get_route_info(struct net *net,
1815 const struct in6_addr *prefix, int prefixlen,
1816 const struct in6_addr *gwaddr, int ifindex)
1818 struct fib6_node *fn;
1819 struct rt6_info *rt = NULL;
1820 struct fib6_table *table;
1822 table = fib6_get_table(net, RT6_TABLE_INFO);
1826 write_lock_bh(&table->tb6_lock);
1827 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1831 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1832 if (rt->rt6i_dev->ifindex != ifindex)
1834 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1836 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1842 write_unlock_bh(&table->tb6_lock);
/*
 * Install a route learned from an RA Route Information option and
 * return the resulting table entry (looked up again after insertion).
 */
1846 static struct rt6_info *rt6_add_route_info(struct net *net,
1847 const struct in6_addr *prefix, int prefixlen,
1848 const struct in6_addr *gwaddr, int ifindex,
1851 struct fib6_config cfg = {
1852 .fc_table = RT6_TABLE_INFO,
1853 .fc_metric = IP6_RT_PRIO_USER,
1854 .fc_ifindex = ifindex,
1855 .fc_dst_len = prefixlen,
1856 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1857 RTF_UP | RTF_PREF(pref),
1859 .fc_nlinfo.nlh = NULL,
1860 .fc_nlinfo.nl_net = net,
1863 ipv6_addr_copy(&cfg.fc_dst, prefix);
1864 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1866 /* We should treat it as a default route if prefix length is 0. */
1868 cfg.fc_flags |= RTF_DEFAULT;
1870 ip6_route_add(&cfg);
1872 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-learned default route via gateway 'addr' on 'dev' in the
 * default-router table, or NULL if no such entry exists.
 */
1876 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1878 struct rt6_info *rt;
1879 struct fib6_table *table;
1881 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1885 write_lock_bh(&table->tb6_lock);
1886 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1887 if (dev == rt->rt6i_dev &&
1888 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1889 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1894 write_unlock_bh(&table->tb6_lock);
/*
 * Install a default route learned from a Router Advertisement and
 * return the resulting entry (re-looked-up after ip6_route_add).
 */
1898 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1899 struct net_device *dev,
1902 struct fib6_config cfg = {
1903 .fc_table = RT6_TABLE_DFLT,
1904 .fc_metric = IP6_RT_PRIO_USER,
1905 .fc_ifindex = dev->ifindex,
1906 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1907 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1909 .fc_nlinfo.nlh = NULL,
1910 .fc_nlinfo.nl_net = dev_net(dev),
1913 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1915 ip6_route_add(&cfg);
1917 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Delete all RA-learned default routes, except on interfaces whose
 * accept_ra sysctl is 2 (accept RAs even when forwarding).
 */
1920 void rt6_purge_dflt_routers(struct net *net)
1922 struct rt6_info *rt;
1923 struct fib6_table *table;
1925 /* NOTE: Keep consistent with rt6_get_dflt_router */
1926 table = fib6_get_table(net, RT6_TABLE_DFLT);
1931 read_lock_bh(&table->tb6_lock);
1932 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1933 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1934 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
/* Drop the lock before deleting; deletion takes the write lock. */
1936 read_unlock_bh(&table->tb6_lock);
1941 read_unlock_bh(&table->tb6_lock);
/*
 * Translate the legacy ioctl in6_rtmsg structure into a fib6_config,
 * targeting the main routing table.
 */
1944 static void rtmsg_to_fib6_config(struct net *net,
1945 struct in6_rtmsg *rtmsg,
1946 struct fib6_config *cfg)
1948 memset(cfg, 0, sizeof(*cfg));
1950 cfg->fc_table = RT6_TABLE_MAIN;
1951 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1952 cfg->fc_metric = rtmsg->rtmsg_metric;
1953 cfg->fc_expires = rtmsg->rtmsg_info;
1954 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1955 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1956 cfg->fc_flags = rtmsg->rtmsg_flags;
1958 cfg->fc_nlinfo.nl_net = net;
1960 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1961 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1962 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * Handle the legacy SIOCADDRT/SIOCDELRT route ioctls: copy the user's
 * in6_rtmsg, convert it to a fib6_config and add or delete the route.
 * Requires CAP_NET_ADMIN.
 */
1965 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1967 struct fib6_config cfg;
1968 struct in6_rtmsg rtmsg;
1972 case SIOCADDRT: /* Add a route */
1973 case SIOCDELRT: /* Delete a route */
1974 if (!capable(CAP_NET_ADMIN))
1976 err = copy_from_user(&rtmsg, arg,
1977 sizeof(struct in6_rtmsg));
1981 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1986 err = ip6_route_add(&cfg);
1989 err = ip6_route_del(&cfg);
2003 * Drop the packet on the floor
/*
 * Common drop path for unroutable packets: bump the matching SNMP
 * counter, send an ICMPv6 Destination Unreachable with 'code', and
 * free the skb. Used as dst input/output for the null/prohibit routes.
 */
2006 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2009 struct dst_entry *dst = skb_dst(skb);
2010 switch (ipstats_mib_noroutes) {
2011 case IPSTATS_MIB_INNOROUTES:
2012 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
/* Unspecified destination counts as an address error, not no-route. */
2013 if (type == IPV6_ADDR_ANY) {
2014 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2015 IPSTATS_MIB_INADDRERRORS);
2019 case IPSTATS_MIB_OUTNOROUTES:
2020 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2021 ipstats_mib_noroutes);
2024 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* Input-side no-route drop (null entry). */
2029 static int ip6_pkt_discard(struct sk_buff *skb)
2031 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* Output-side no-route drop; set skb->dev for the ICMP reply path. */
2034 static int ip6_pkt_discard_out(struct sk_buff *skb)
2036 skb->dev = skb_dst(skb)->dev;
2037 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2040 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Administratively-prohibited variants used by policy routing. */
2042 static int ip6_pkt_prohibit(struct sk_buff *skb)
2044 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2047 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2049 skb->dev = skb_dst(skb)->dev;
2050 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2056 * Allocate a dst for local (unicast / anycast) address.
/*
 * Allocate the local (host) route for a unicast or anycast address on
 * 'idev', bound to the loopback device, resolve its neighbour entry,
 * and return it with one reference held. Returns ERR_PTR on failure.
 */
2059 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2060 const struct in6_addr *addr,
2063 struct net *net = dev_net(idev->dev);
2064 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2065 net->loopback_dev, DST_NOCOUNT);
2066 struct neighbour *neigh;
2069 return ERR_PTR(-ENOMEM);
2073 rt->dst.flags |= DST_HOST;
2074 rt->dst.input = ip6_input;
2075 rt->dst.output = ip6_output;
2076 rt->rt6i_idev = idev;
2077 rt->dst.obsolete = -1;
2079 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2081 rt->rt6i_flags |= RTF_ANYCAST;
2083 rt->rt6i_flags |= RTF_LOCAL;
2084 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2085 if (IS_ERR(neigh)) {
2088 return ERR_CAST(neigh);
2090 dst_set_neighbour(&rt->dst, neigh);
2092 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2093 rt->rt6i_dst.plen = 128;
2094 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2096 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Pick a source address for traffic using route 'rt': prefer the
 * route's configured prefsrc; otherwise fall back to standard source
 * address selection on the route's device.
 */
2101 int ip6_route_get_saddr(struct net *net,
2102 struct rt6_info *rt,
2103 const struct in6_addr *daddr,
2105 struct in6_addr *saddr)
2107 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2109 if (rt->rt6i_prefsrc.plen)
2110 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2112 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2113 daddr, prefs, saddr);
2117 /* remove deleted ip from prefsrc entries */
/* Walker argument: device + netns + the address being removed. */
2118 struct arg_dev_net_ip {
2119 struct net_device *dev;
2121 struct in6_addr *addr;
/*
 * fib6_clean_all callback: clear the prefsrc of any route (on the
 * given device, or any device if dev is NULL) whose preferred source
 * equals the deleted address.
 */
2124 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2126 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2127 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2128 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2130 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2131 rt != net->ipv6.ip6_null_entry &&
2132 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2133 /* remove prefsrc entry */
2134 rt->rt6i_prefsrc.plen = 0;
/* Called when address 'ifp' is deleted: purge it as prefsrc. */
2139 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2141 struct net *net = dev_net(ifp->idev->dev);
2142 struct arg_dev_net_ip adni = {
2143 .dev = ifp->idev->dev,
2147 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Walker argument for interface-down cleanup. */
2150 struct arg_dev_net {
2151 struct net_device *dev;
/*
 * fib6_clean_all callback: mark for deletion every route on 'dev'
 * (or every route when dev is NULL), sparing the null entry.
 */
2155 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2157 const struct arg_dev_net *adn = arg;
2158 const struct net_device *dev = adn->dev;
2160 if ((rt->rt6i_dev == dev || dev == NULL) &&
2161 rt != adn->net->ipv6.ip6_null_entry) {
2162 RT6_TRACE("deleted by ifdown %p\n", rt);
/* Purge routes (and ICMP rate-limit dsts) for a device going down. */
2168 void rt6_ifdown(struct net *net, struct net_device *dev)
2170 struct arg_dev_net adn = {
2175 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2176 icmp6_clean_all(fib6_ifdown, &adn);
/* Walker argument: device whose MTU changed plus the new MTU. */
2179 struct rt6_mtu_change_arg
2181 struct net_device *dev;
/*
 * fib6_clean_all callback: update RTAX_MTU on routes over arg->dev
 * when the change is a decrease, or an increase from a PMTU that was
 * pinned at the old device MTU. Locked MTU metrics are left alone.
 */
2185 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2187 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2188 struct inet6_dev *idev;
2190 /* In IPv6 pmtu discovery is not optional,
2191 so that RTAX_MTU lock cannot disable it.
2192 We still use this lock to block changes
2193 caused by addrconf/ndisc.
2196 idev = __in6_dev_get(arg->dev);
2200 /* For administrative MTU increase, there is no way to discover
2201 IPv6 PMTU increase, so PMTU increase should be updated here.
2202 Since RFC 1981 doesn't include administrative MTU increase
2203 update PMTU increase is a MUST. (i.e. jumbo frame)
2206 If new MTU is less than route PMTU, this new MTU will be the
2207 lowest MTU in the path, update the route PMTU to reflect PMTU
2208 decreases; if new MTU is greater than route PMTU, and the
2209 old MTU is the lowest MTU in the path, update the route PMTU
2210 to reflect the increase. In this case if the other nodes' MTU
2211 also have the lowest MTU, a TOO BIG MESSAGE will be sent
2214 if (rt->rt6i_dev == arg->dev &&
2215 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2216 (dst_mtu(&rt->dst) >= arg->mtu ||
2217 (dst_mtu(&rt->dst) < arg->mtu &&
2218 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2219 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Propagate a device MTU change to all routes over that device. */
2224 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2226 struct rt6_mtu_change_arg arg = {
2231 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* requests. */
2234 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2235 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2236 [RTA_OIF] = { .type = NLA_U32 },
2237 [RTA_IIF] = { .type = NLA_U32 },
2238 [RTA_PRIORITY] = { .type = NLA_U32 },
2239 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * fib6_config: validate attributes against rtm_ipv6_policy, then copy
 * header fields, addresses, metric, interface and metrics blob.
 */
2242 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2243 struct fib6_config *cfg)
2246 struct nlattr *tb[RTA_MAX+1];
2249 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2254 rtm = nlmsg_data(nlh);
2255 memset(cfg, 0, sizeof(*cfg));
2257 cfg->fc_table = rtm->rtm_table;
2258 cfg->fc_dst_len = rtm->rtm_dst_len;
2259 cfg->fc_src_len = rtm->rtm_src_len;
2260 cfg->fc_flags = RTF_UP;
2261 cfg->fc_protocol = rtm->rtm_protocol;
2263 if (rtm->rtm_type == RTN_UNREACHABLE)
2264 cfg->fc_flags |= RTF_REJECT;
2266 if (rtm->rtm_type == RTN_LOCAL)
2267 cfg->fc_flags |= RTF_LOCAL;
2269 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2270 cfg->fc_nlinfo.nlh = nlh;
2271 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2273 if (tb[RTA_GATEWAY]) {
2274 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2275 cfg->fc_flags |= RTF_GATEWAY;
/* Destination/source prefixes are copied at their declared length. */
2279 int plen = (rtm->rtm_dst_len + 7) >> 3;
2281 if (nla_len(tb[RTA_DST]) < plen)
2284 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2288 int plen = (rtm->rtm_src_len + 7) >> 3;
2290 if (nla_len(tb[RTA_SRC]) < plen)
2293 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2296 if (tb[RTA_PREFSRC])
2297 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2300 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2302 if (tb[RTA_PRIORITY])
2303 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2305 if (tb[RTA_METRICS]) {
2306 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2307 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* Explicit RTA_TABLE overrides the header's rtm_table. */
2311 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* Netlink RTM_DELROUTE handler: parse config, delete the route. */
2318 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2320 struct fib6_config cfg;
2323 err = rtm_to_fib6_config(skb, nlh, &cfg);
2327 return ip6_route_del(&cfg);
/* Netlink RTM_NEWROUTE handler: parse config, add the route. */
2330 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2332 struct fib6_config cfg;
2335 err = rtm_to_fib6_config(skb, nlh, &cfg);
2339 return ip6_route_add(&cfg);
/*
 * Worst-case size of a route netlink message, used to size skbs for
 * notifications; must stay in sync with what rt6_fill_node() emits.
 */
2342 static inline size_t rt6_nlmsg_size(void)
2344 return NLMSG_ALIGN(sizeof(struct rtmsg))
2345 + nla_total_size(16) /* RTA_SRC */
2346 + nla_total_size(16) /* RTA_DST */
2347 + nla_total_size(16) /* RTA_GATEWAY */
2348 + nla_total_size(16) /* RTA_PREFSRC */
2349 + nla_total_size(4) /* RTA_TABLE */
2350 + nla_total_size(4) /* RTA_IIF */
2351 + nla_total_size(4) /* RTA_OIF */
2352 + nla_total_size(4) /* RTA_PRIORITY */
2353 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2354 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Serialise one rt6_info into a netlink RTM message on 'skb'.
 * 'prefix' filters to RTF_PREFIX_RT routes only (RTM_F_PREFIX dumps);
 * 'nowait' is passed through to ip6mr for multicast resolution.
 * Returns the nlmsg_end() result, or a negative error on overflow.
 */
2357 static int rt6_fill_node(struct net *net,
2358 struct sk_buff *skb, struct rt6_info *rt,
2359 struct in6_addr *dst, struct in6_addr *src,
2360 int iif, int type, u32 pid, u32 seq,
2361 int prefix, int nowait, unsigned int flags)
2364 struct nlmsghdr *nlh;
2367 struct neighbour *n;
2369 if (prefix) { /* user wants prefix routes only */
2370 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2371 /* success since this is not a prefix route */
2376 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2380 rtm = nlmsg_data(nlh);
2381 rtm->rtm_family = AF_INET6;
2382 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2383 rtm->rtm_src_len = rt->rt6i_src.plen;
2386 table = rt->rt6i_table->tb6_id;
2388 table = RT6_TABLE_UNSPEC;
2389 rtm->rtm_table = table;
2390 NLA_PUT_U32(skb, RTA_TABLE, table);
/* Map internal route flags onto the RTN_* route type. */
2391 if (rt->rt6i_flags&RTF_REJECT)
2392 rtm->rtm_type = RTN_UNREACHABLE;
2393 else if (rt->rt6i_flags&RTF_LOCAL)
2394 rtm->rtm_type = RTN_LOCAL;
2395 else if (rt->rt6i_flags & RTF_ANYCAST)
2396 rtm->rtm_type = RTN_ANYCAST;
2397 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2398 rtm->rtm_type = RTN_LOCAL;
2400 rtm->rtm_type = RTN_UNICAST;
2402 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2403 rtm->rtm_protocol = rt->rt6i_protocol;
2404 if (rt->rt6i_flags&RTF_DYNAMIC)
2405 rtm->rtm_protocol = RTPROT_REDIRECT;
2406 else if (rt->rt6i_flags & RTF_ADDRCONF)
2407 rtm->rtm_protocol = RTPROT_KERNEL;
2408 else if (rt->rt6i_flags&RTF_DEFAULT)
2409 rtm->rtm_protocol = RTPROT_RA;
2411 if (rt->rt6i_flags&RTF_CACHE)
2412 rtm->rtm_flags |= RTM_F_CLONED;
/* A caller-supplied dst/src pins a full /128 in the message. */
2415 NLA_PUT(skb, RTA_DST, 16, dst);
2416 rtm->rtm_dst_len = 128;
2417 } else if (rtm->rtm_dst_len)
2418 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2419 #ifdef CONFIG_IPV6_SUBTREES
2421 NLA_PUT(skb, RTA_SRC, 16, src);
2422 rtm->rtm_src_len = 128;
2423 } else if (rtm->rtm_src_len)
2424 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2427 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through ip6mr instead of IIF. */
2428 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2429 int err = ip6mr_get_route(net, skb, rtm, nowait,
2436 goto nla_put_failure;
2438 if (err == -EMSGSIZE)
2439 goto nla_put_failure;
2444 NLA_PUT_U32(skb, RTA_IIF, iif);
2446 struct in6_addr saddr_buf;
2447 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2448 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2451 if (rt->rt6i_prefsrc.plen) {
2452 struct in6_addr saddr_buf;
2453 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2454 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2457 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2458 goto nla_put_failure;
2461 n = dst_get_neighbour(&rt->dst);
2463 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2465 goto nla_put_failure;
2471 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2473 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* Express remaining lifetime in jiffies, clamped to INT_MAX. */
2475 if (!(rt->rt6i_flags & RTF_EXPIRES))
2477 else if (rt->rt6i_expires - jiffies < INT_MAX)
2478 expires = rt->rt6i_expires - jiffies;
2482 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2483 expires, rt->dst.error) < 0)
2484 goto nla_put_failure;
2486 return nlmsg_end(skb, nlh);
2489 nlmsg_cancel(skb, nlh);
/*
 * fib6 dump callback: emit one route as an RTM_NEWROUTE multi-part
 * message, honouring the RTM_F_PREFIX filter from the request header.
 */
2493 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2495 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2498 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2499 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2500 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2504 return rt6_fill_node(arg->net,
2505 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2506 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2507 prefix, 0, NLM_F_MULTI);
/*
 * Netlink RTM_GETROUTE handler: build a flowi6 from the request
 * attributes, perform a route lookup, serialise the result with
 * rt6_fill_node() and unicast it back to the requester.
 */
2510 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2512 struct net *net = sock_net(in_skb->sk);
2513 struct nlattr *tb[RTA_MAX+1];
2514 struct rt6_info *rt;
2515 struct sk_buff *skb;
2520 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2525 memset(&fl6, 0, sizeof(fl6));
2528 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2531 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2535 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2538 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2542 iif = nla_get_u32(tb[RTA_IIF]);
2545 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
/* An input interface must exist for an input-side lookup. */
2548 struct net_device *dev;
2549 dev = __dev_get_by_index(net, iif);
2556 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2562 /* Reserve room for dummy headers, this skb can pass
2563 through good chunk of routing engine.
2565 skb_reset_mac_header(skb);
2566 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2568 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2569 skb_dst_set(skb, &rt->dst);
2571 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2572 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2573 nlh->nlmsg_seq, 0, 0, 0);
2579 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Broadcast a route add/delete event to RTNLGRP_IPV6_ROUTE listeners;
 * on failure report the error via rtnl_set_sk_err().
 */
2584 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2586 struct sk_buff *skb;
2587 struct net *net = info->nl_net;
2592 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2594 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2598 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2599 event, info->pid, seq, 0, 0, 0);
2601 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2602 WARN_ON(err == -EMSGSIZE);
2606 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2607 info->nlh, gfp_any());
2611 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Device notifier: when the per-netns loopback registers, attach it
 * (and its inet6_dev) to the special null/prohibit/blackhole routes.
 */
2614 static int ip6_route_dev_notify(struct notifier_block *this,
2615 unsigned long event, void *data)
2617 struct net_device *dev = (struct net_device *)data;
2618 struct net *net = dev_net(dev);
2620 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2621 net->ipv6.ip6_null_entry->dst.dev = dev;
2622 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2623 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2624 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2625 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2626 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2627 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2638 #ifdef CONFIG_PROC_FS
/*
 * /proc/net/ipv6_route formatter: print one route line (dst, src,
 * gateway, metric, refcnt, use count, flags, device name).
 */
2649 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2651 struct seq_file *m = p_arg;
2652 struct neighbour *n;
2654 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2656 #ifdef CONFIG_IPV6_SUBTREES
2657 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2659 seq_puts(m, "00000000000000000000000000000000 00 ");
2662 n = dst_get_neighbour(&rt->dst);
2664 seq_printf(m, "%pi6", n->primary_key);
2666 seq_puts(m, "00000000000000000000000000000000");
2669 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2670 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2671 rt->dst.__use, rt->rt6i_flags,
2672 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/* seq_file show: dump all routes of this netns via the fib walker. */
2676 static int ipv6_route_show(struct seq_file *m, void *v)
2678 struct net *net = (struct net *)m->private;
2679 fib6_clean_all(net, rt6_info_route, 0, m);
2683 static int ipv6_route_open(struct inode *inode, struct file *file)
2685 return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route. */
2688 static const struct file_operations ipv6_route_proc_fops = {
2689 .owner = THIS_MODULE,
2690 .open = ipv6_route_open,
2692 .llseek = seq_lseek,
2693 .release = single_release_net,
/* seq_file show for /proc/net/rt6_stats: fib statistics counters. */
2696 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2698 struct net *net = (struct net *)seq->private;
2699 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2700 net->ipv6.rt6_stats->fib_nodes,
2701 net->ipv6.rt6_stats->fib_route_nodes,
2702 net->ipv6.rt6_stats->fib_rt_alloc,
2703 net->ipv6.rt6_stats->fib_rt_entries,
2704 net->ipv6.rt6_stats->fib_rt_cache,
2705 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2706 net->ipv6.rt6_stats->fib_discarded_routes);
2711 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2713 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats. */
2716 static const struct file_operations rt6_stats_seq_fops = {
2717 .owner = THIS_MODULE,
2718 .open = rt6_stats_seq_open,
2720 .llseek = seq_lseek,
2721 .release = single_release_net,
2723 #endif /* CONFIG_PROC_FS */
2725 #ifdef CONFIG_SYSCTL
/*
 * Sysctl handler for net.ipv6.route.flush: reads flush_delay and runs
 * the fib garbage collector (immediately when the delay is <= 0).
 */
2728 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2729 void __user *buffer, size_t *lenp, loff_t *ppos)
2736 net = (struct net *)ctl->extra1;
2737 delay = net->ipv6.sysctl.flush_delay;
2738 proc_dointvec(ctl, write, buffer, lenp, ppos);
2739 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for per-netns net.ipv6.route.* sysctls; the .data pointers
 * are rewritten per namespace in ipv6_route_sysctl_init(). */
2743 ctl_table ipv6_route_table_template[] = {
2745 .procname = "flush",
2746 .data = &init_net.ipv6.sysctl.flush_delay,
2747 .maxlen = sizeof(int),
2749 .proc_handler = ipv6_sysctl_rtcache_flush
2752 .procname = "gc_thresh",
2753 .data = &ip6_dst_ops_template.gc_thresh,
2754 .maxlen = sizeof(int),
2756 .proc_handler = proc_dointvec,
2759 .procname = "max_size",
2760 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2761 .maxlen = sizeof(int),
2763 .proc_handler = proc_dointvec,
2766 .procname = "gc_min_interval",
2767 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2768 .maxlen = sizeof(int),
2770 .proc_handler = proc_dointvec_jiffies,
2773 .procname = "gc_timeout",
2774 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2775 .maxlen = sizeof(int),
2777 .proc_handler = proc_dointvec_jiffies,
2780 .procname = "gc_interval",
2781 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2782 .maxlen = sizeof(int),
2784 .proc_handler = proc_dointvec_jiffies,
2787 .procname = "gc_elasticity",
2788 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2789 .maxlen = sizeof(int),
2791 .proc_handler = proc_dointvec,
2794 .procname = "mtu_expires",
2795 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2796 .maxlen = sizeof(int),
2798 .proc_handler = proc_dointvec_jiffies,
2801 .procname = "min_adv_mss",
2802 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2803 .maxlen = sizeof(int),
2805 .proc_handler = proc_dointvec,
2808 .procname = "gc_min_interval_ms",
2809 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2810 .maxlen = sizeof(int),
2812 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Clone the sysctl template for a namespace and point each entry's
 * .data at this netns's fields. Index order must match the template
 * above exactly.
 */
2817 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2819 struct ctl_table *table;
2821 table = kmemdup(ipv6_route_table_template,
2822 sizeof(ipv6_route_table_template),
2826 table[0].data = &net->ipv6.sysctl.flush_delay;
2827 table[0].extra1 = net;
2828 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2829 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2830 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2831 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2832 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2833 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2834 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2835 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2836 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * Per-netns init: set up dst ops, allocate the special null (and, with
 * multiple tables, prohibit/blackhole) route entries, and seed the
 * routing sysctl defaults. Unwinds allocations in reverse on failure.
 */
2843 static int __net_init ip6_route_net_init(struct net *net)
2847 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2848 sizeof(net->ipv6.ip6_dst_ops));
2850 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2851 goto out_ip6_dst_ops;
2853 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2854 sizeof(*net->ipv6.ip6_null_entry),
2856 if (!net->ipv6.ip6_null_entry)
2857 goto out_ip6_dst_entries;
2858 net->ipv6.ip6_null_entry->dst.path =
2859 (struct dst_entry *)net->ipv6.ip6_null_entry;
2860 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2861 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2862 ip6_template_metrics, true);
2864 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2865 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2866 sizeof(*net->ipv6.ip6_prohibit_entry),
2868 if (!net->ipv6.ip6_prohibit_entry)
2869 goto out_ip6_null_entry;
2870 net->ipv6.ip6_prohibit_entry->dst.path =
2871 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2872 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2873 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2874 ip6_template_metrics, true);
2876 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2877 sizeof(*net->ipv6.ip6_blk_hole_entry),
2879 if (!net->ipv6.ip6_blk_hole_entry)
2880 goto out_ip6_prohibit_entry;
2881 net->ipv6.ip6_blk_hole_entry->dst.path =
2882 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2883 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2884 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2885 ip6_template_metrics, true);
/* Default tunables for the routing sysctls of this namespace. */
2888 net->ipv6.sysctl.flush_delay = 0;
2889 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2890 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2891 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2892 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2893 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2894 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2895 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2897 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2903 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2904 out_ip6_prohibit_entry:
2905 kfree(net->ipv6.ip6_prohibit_entry);
2907 kfree(net->ipv6.ip6_null_entry);
2909 out_ip6_dst_entries:
2910 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns teardown: free the special route entries and dst counters. */
2915 static void __net_exit ip6_route_net_exit(struct net *net)
2917 kfree(net->ipv6.ip6_null_entry);
2918 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2919 kfree(net->ipv6.ip6_prohibit_entry);
2920 kfree(net->ipv6.ip6_blk_hole_entry);
2922 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: create the /proc/net entries. */
2925 static int __net_init ip6_route_net_init_late(struct net *net)
2927 #ifdef CONFIG_PROC_FS
2928 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2929 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2934 static void __net_exit ip6_route_net_exit_late(struct net *net)
2936 #ifdef CONFIG_PROC_FS
2937 proc_net_remove(net, "ipv6_route");
2938 proc_net_remove(net, "rt6_stats");
/* Pernet registration tables and the loopback-registration notifier. */
2942 static struct pernet_operations ip6_route_net_ops = {
2943 .init = ip6_route_net_init,
2944 .exit = ip6_route_net_exit,
2947 static struct pernet_operations ip6_route_net_late_ops = {
2948 .init = ip6_route_net_init_late,
2949 .exit = ip6_route_net_exit_late,
2952 static struct notifier_block ip6_route_dev_notifier = {
2953 .notifier_call = ip6_route_dev_notify,
2957 int __init ip6_route_init(void)
2962 ip6_dst_ops_template.kmem_cachep =
2963 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2964 SLAB_HWCACHE_ALIGN, NULL);
2965 if (!ip6_dst_ops_template.kmem_cachep)
2968 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2970 goto out_kmem_cache;
2972 ret = register_pernet_subsys(&ip6_route_net_ops);
2974 goto out_dst_entries;
2976 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2978 /* Registering of the loopback is done before this portion of code,
2979 * the loopback reference in rt6_info will not be taken, do it
2980 * manually for init_net */
2981 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2982 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2983 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2984 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2985 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2986 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2987 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2991 goto out_register_subsys;
2997 ret = fib6_rules_init();
3001 ret = register_pernet_subsys(&ip6_route_net_late_ops);
3003 goto fib6_rules_init;
3006 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3007 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3008 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3009 goto out_register_late_subsys;
3011 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3013 goto out_register_late_subsys;
3018 out_register_late_subsys:
3019 unregister_pernet_subsys(&ip6_route_net_late_ops);
3021 fib6_rules_cleanup();
3026 out_register_subsys:
3027 unregister_pernet_subsys(&ip6_route_net_ops);
3029 dst_entries_destroy(&ip6_dst_blackhole_ops);
3031 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3035 void ip6_route_cleanup(void)
3037 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3038 unregister_pernet_subsys(&ip6_route_net_late_ops);
3039 fib6_rules_cleanup();
3042 unregister_pernet_subsys(&ip6_route_net_ops);
3043 dst_entries_destroy(&ip6_dst_blackhole_ops);
3044 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);