2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
65 /* Set to 3 to get tracing. */
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
73 #define RT6_TRACE(x...) do { ; } while (0)
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77 const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void ip6_dst_destroy(struct dst_entry *);
83 static void ip6_dst_ifdown(struct dst_entry *,
84 struct net_device *dev, int how);
85 static int ip6_dst_gc(struct dst_ops *ops);
87 static int ip6_pkt_discard(struct sk_buff *skb);
88 static int ip6_pkt_discard_out(struct sk_buff *skb);
89 static void ip6_link_failure(struct sk_buff *skb);
90 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94 const struct in6_addr *prefix, int prefixlen,
95 const struct in6_addr *gwaddr, int ifindex,
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98 const struct in6_addr *prefix, int prefixlen,
99 const struct in6_addr *gwaddr, int ifindex);
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
104 struct rt6_info *rt = (struct rt6_info *) dst;
105 struct inet_peer *peer;
108 if (!(rt->dst.flags & DST_HOST))
109 return dst_cow_metrics_generic(dst, old);
112 rt6_bind_peer(rt, 1);
114 peer = rt->rt6i_peer;
116 u32 *old_p = __DST_METRICS_PTR(old);
117 unsigned long prev, new;
120 if (inet_metrics_new(peer))
121 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
123 new = (unsigned long) p;
124 prev = cmpxchg(&dst->_metrics, old, new);
127 p = __DST_METRICS_PTR(prev);
128 if (prev & DST_METRICS_READ_ONLY)
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
137 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
140 static struct dst_ops ip6_dst_ops_template = {
142 .protocol = cpu_to_be16(ETH_P_IPV6),
145 .check = ip6_dst_check,
146 .default_advmss = ip6_default_advmss,
148 .cow_metrics = ipv6_cow_metrics,
149 .destroy = ip6_dst_destroy,
150 .ifdown = ip6_dst_ifdown,
151 .negative_advice = ip6_negative_advice,
152 .link_failure = ip6_link_failure,
153 .update_pmtu = ip6_rt_update_pmtu,
154 .local_out = __ip6_local_out,
155 .neigh_lookup = ip6_neigh_lookup,
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
160 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
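/* GNU "?:" shorthand below: return the raw RTAX_MTU metric when one is
 * set, otherwise fall back to the device MTU. */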
162 return mtu ? : dst->dev->mtu;
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
175 static struct dst_ops ip6_dst_blackhole_ops = {
177 .protocol = cpu_to_be16(ETH_P_IPV6),
178 .destroy = ip6_dst_destroy,
179 .check = ip6_dst_check,
180 .mtu = ip6_blackhole_mtu,
181 .default_advmss = ip6_default_advmss,
182 .update_pmtu = ip6_rt_blackhole_update_pmtu,
183 .cow_metrics = ip6_rt_blackhole_cow_metrics,
184 .neigh_lookup = ip6_neigh_lookup,
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188 [RTAX_HOPLIMIT - 1] = 0,
191 static struct rt6_info ip6_null_entry_template = {
193 .__refcnt = ATOMIC_INIT(1),
196 .error = -ENETUNREACH,
197 .input = ip6_pkt_discard,
198 .output = ip6_pkt_discard_out,
200 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
201 .rt6i_protocol = RTPROT_KERNEL,
202 .rt6i_metric = ~(u32) 0,
203 .rt6i_ref = ATOMIC_INIT(1),
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
208 static int ip6_pkt_prohibit(struct sk_buff *skb);
209 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
211 static struct rt6_info ip6_prohibit_entry_template = {
213 .__refcnt = ATOMIC_INIT(1),
217 .input = ip6_pkt_prohibit,
218 .output = ip6_pkt_prohibit_out,
220 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
221 .rt6i_protocol = RTPROT_KERNEL,
222 .rt6i_metric = ~(u32) 0,
223 .rt6i_ref = ATOMIC_INIT(1),
226 static struct rt6_info ip6_blk_hole_entry_template = {
228 .__refcnt = ATOMIC_INIT(1),
232 .input = dst_discard,
233 .output = dst_discard,
235 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
236 .rt6i_protocol = RTPROT_KERNEL,
237 .rt6i_metric = ~(u32) 0,
238 .rt6i_ref = ATOMIC_INIT(1),
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245 struct net_device *dev,
248 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
251 memset(&rt->rt6i_table, 0,
252 sizeof(*rt) - sizeof(struct dst_entry));
257 static void ip6_dst_destroy(struct dst_entry *dst)
259 struct rt6_info *rt = (struct rt6_info *)dst;
260 struct inet6_dev *idev = rt->rt6i_idev;
261 struct inet_peer *peer = rt->rt6i_peer;
263 if (!(rt->dst.flags & DST_HOST))
264 dst_destroy_metrics_generic(dst);
267 rt->rt6i_idev = NULL;
271 rt->rt6i_peer = NULL;
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
278 static u32 rt6_peer_genid(void)
280 return atomic_read(&__rt6_peer_genid);
283 void rt6_bind_peer(struct rt6_info *rt, int create)
285 struct inet_peer *peer;
287 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
291 rt->rt6i_peer_genid = rt6_peer_genid();
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
297 struct rt6_info *rt = (struct rt6_info *)dst;
298 struct inet6_dev *idev = rt->rt6i_idev;
299 struct net_device *loopback_dev =
300 dev_net(dev)->loopback_dev;
302 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303 struct inet6_dev *loopback_idev =
304 in6_dev_get(loopback_dev);
305 if (loopback_idev != NULL) {
306 rt->rt6i_idev = loopback_idev;
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
314 return (rt->rt6i_flags & RTF_EXPIRES) &&
315 time_after(jiffies, rt->rt6i_expires);
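/* Link-local, multicast and loopback destinations are only meaningful
 * relative to a specific interface, so lookups for them must honour the
 * requested outgoing interface. */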
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
320 return ipv6_addr_type(daddr) &
321 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
325 * Route lookup. Any table->tb6_lock is implied.
328 static inline struct rt6_info *rt6_device_match(struct net *net,
330 const struct in6_addr *saddr,
334 struct rt6_info *local = NULL;
335 struct rt6_info *sprt;
337 if (!oif && ipv6_addr_any(saddr))
340 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 struct net_device *dev = sprt->rt6i_dev;
344 if (dev->ifindex == oif)
346 if (dev->flags & IFF_LOOPBACK) {
347 if (sprt->rt6i_idev == NULL ||
348 sprt->rt6i_idev->dev->ifindex != oif) {
349 if (flags & RT6_LOOKUP_F_IFACE && oif)
351 if (local && (!oif ||
352 local->rt6i_idev->dev->ifindex == oif))
358 if (ipv6_chk_addr(net, saddr, dev,
359 flags & RT6_LOOKUP_F_IFACE))
368 if (flags & RT6_LOOKUP_F_IFACE)
369 return net->ipv6.ip6_null_entry;
375 #ifdef CONFIG_IPV6_ROUTER_PREF
376 static void rt6_probe(struct rt6_info *rt)
378 struct neighbour *neigh;
380 * Okay, this does not seem to be appropriate
381 * for now, however, we need to check if it
382 * is really so; aka Router Reachability Probing.
384 * Router Reachability Probe MUST be rate-limited
385 * to no more than one per minute.
388 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
389 if (!neigh || (neigh->nud_state & NUD_VALID))
391 read_lock_bh(&neigh->lock);
392 if (!(neigh->nud_state & NUD_VALID) &&
393 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 struct in6_addr mcaddr;
395 struct in6_addr *target;
397 neigh->updated = jiffies;
398 read_unlock_bh(&neigh->lock);
400 target = (struct in6_addr *)&neigh->primary_key;
401 addrconf_addr_solict_mult(target, &mcaddr);
402 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
404 read_unlock_bh(&neigh->lock);
410 static inline void rt6_probe(struct rt6_info *rt)
416 * Default Router Selection (RFC 2461 6.3.6)
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
435 neigh = dst_get_neighbour(&rt->dst);
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
449 read_unlock_bh(&neigh->lock);
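/* Scoring of candidate routers: the interface match from rt6_check_dev()
 * sits in the low bits and, with CONFIG_IPV6_ROUTER_PREF, the decoded RA
 * preference is OR-ed in from bit 2. A candidate that fails the interface
 * or reachability requirement implied by the strict flags is rejected
 * outright. */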
456 static int rt6_score_route(struct rt6_info *rt, int oif,
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
478 if (rt6_check_expired(rt))
481 m = rt6_score_route(rt, oif, strict);
486 if (strict & RT6_LOOKUP_F_REACHABLE)
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
542 RT6_TRACE("%s() => %p\n",
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
549 #ifdef CONFIG_IPV6_ROUTE_INFO
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
557 unsigned long lifetime;
560 if (len < sizeof(struct route_info)) {
564 /* Sanity check for prefix_len and length */
565 if (rinfo->length > 3) {
567 } else if (rinfo->prefix_len > 128) {
569 } else if (rinfo->prefix_len > 64) {
570 if (rinfo->length < 2) {
573 } else if (rinfo->prefix_len > 0) {
574 if (rinfo->length < 1) {
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
592 prefix = &prefix_buf;
595 if (rinfo->prefix_len == 0)
596 rt = rt6_get_dflt_router(gwaddr, dev);
598 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
599 gwaddr, dev->ifindex);
601 if (rt && !lifetime) {
607 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
610 rt->rt6i_flags = RTF_ROUTEINFO |
611 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
614 if (!addrconf_finite_timeout(lifetime)) {
615 rt->rt6i_flags &= ~RTF_EXPIRES;
617 rt->rt6i_expires = jiffies + HZ * lifetime;
618 rt->rt6i_flags |= RTF_EXPIRES;
620 dst_release(&rt->dst);
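/* BACKTRACK: if the lookup resolved to the null entry, climb towards the
 * tree root, re-descending into a source-address subtree where one exists,
 * and retry at the first ancestor that carries route information; give up
 * once the tree root is reached. */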
626 #define BACKTRACK(__net, saddr) \
628 if (rt == __net->ipv6.ip6_null_entry) { \
629 struct fib6_node *pn; \
631 if (fn->fn_flags & RTN_TL_ROOT) \
634 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
635 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
638 if (fn->fn_flags & RTN_RTINFO) \
644 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
645 struct fib6_table *table,
646 struct flowi6 *fl6, int flags)
648 struct fib6_node *fn;
651 read_lock_bh(&table->tb6_lock);
652 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
655 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
656 BACKTRACK(net, &fl6->saddr);
658 dst_use(&rt->dst, jiffies);
659 read_unlock_bh(&table->tb6_lock);
664 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
665 const struct in6_addr *saddr, int oif, int strict)
667 struct flowi6 fl6 = {
671 struct dst_entry *dst;
672 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
675 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
676 flags |= RT6_LOOKUP_F_HAS_SADDR;
679 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
681 return (struct rt6_info *) dst;
688 EXPORT_SYMBOL(rt6_lookup);
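/*
 * Illustrative use of rt6_lookup() (not part of the original file);
 * error handling is abbreviated:
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, dev->ifindex, 1);
 *	if (rt) {
 *		... inspect rt->rt6i_dev / rt->rt6i_gateway ...
 *		dst_release(&rt->dst);
 *	}
 *
 * A non-zero last argument maps to RT6_LOOKUP_F_IFACE, i.e. the result is
 * forced onto the given interface.
 */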
690 /* ip6_ins_rt is called with FREE table->tb6_lock.
691 It takes a new route entry; if the addition fails for any reason the
692 route is freed. In any case, if the caller does not hold it, it may
696 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
699 struct fib6_table *table;
701 table = rt->rt6i_table;
702 write_lock_bh(&table->tb6_lock);
703 err = fib6_add(&table->tb6_root, rt, info);
704 write_unlock_bh(&table->tb6_lock);
709 int ip6_ins_rt(struct rt6_info *rt)
711 struct nl_info info = {
712 .nl_net = dev_net(rt->rt6i_dev),
714 return __ip6_ins_rt(rt, &info);
717 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
718 const struct in6_addr *daddr,
719 const struct in6_addr *saddr)
727 rt = ip6_rt_copy(ort, daddr);
730 struct neighbour *neigh;
731 int attempts = !in_softirq();
733 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
734 if (ort->rt6i_dst.plen != 128 &&
735 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
736 rt->rt6i_flags |= RTF_ANYCAST;
737 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
740 rt->rt6i_flags |= RTF_CACHE;
742 #ifdef CONFIG_IPV6_SUBTREES
743 if (rt->rt6i_src.plen && saddr) {
744 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
745 rt->rt6i_src.plen = 128;
750 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
752 struct net *net = dev_net(rt->rt6i_dev);
753 int saved_rt_min_interval =
754 net->ipv6.sysctl.ip6_rt_gc_min_interval;
755 int saved_rt_elasticity =
756 net->ipv6.sysctl.ip6_rt_gc_elasticity;
758 if (attempts-- > 0) {
759 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
760 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
762 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
764 net->ipv6.sysctl.ip6_rt_gc_elasticity =
766 net->ipv6.sysctl.ip6_rt_gc_min_interval =
767 saved_rt_min_interval;
773 "ipv6: Neighbour table overflow.\n");
777 dst_set_neighbour(&rt->dst, neigh);
784 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
785 const struct in6_addr *daddr)
787 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
790 rt->rt6i_flags |= RTF_CACHE;
791 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
796 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
797 struct flowi6 *fl6, int flags, bool input)
799 struct fib6_node *fn;
800 struct rt6_info *rt, *nrt;
804 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805 int local = RTF_NONEXTHOP;
807 strict |= flags & RT6_LOOKUP_F_IFACE;
812 read_lock_bh(&table->tb6_lock);
815 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
818 rt = rt6_select(fn, oif, strict | reachable);
820 BACKTRACK(net, &fl6->saddr);
821 if (rt == net->ipv6.ip6_null_entry ||
822 rt->rt6i_flags & RTF_CACHE)
826 read_unlock_bh(&table->tb6_lock);
828 if (!dst_get_neighbour_raw(&rt->dst)
829 && !(rt->rt6i_flags & local))
830 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
831 else if (!(rt->dst.flags & DST_HOST))
832 nrt = rt6_alloc_clone(rt, &fl6->daddr);
836 dst_release(&rt->dst);
837 rt = nrt ? : net->ipv6.ip6_null_entry;
841 err = ip6_ins_rt(nrt);
850 * Race condition! In the gap, when table->tb6_lock was
851 * released someone could insert this route. Relookup.
853 dst_release(&rt->dst);
862 read_unlock_bh(&table->tb6_lock);
864 rt->dst.lastuse = jiffies;
870 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
871 struct flowi6 *fl6, int flags)
873 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
876 void ip6_route_input(struct sk_buff *skb)
878 const struct ipv6hdr *iph = ipv6_hdr(skb);
879 struct net *net = dev_net(skb->dev);
880 int flags = RT6_LOOKUP_F_HAS_SADDR;
881 struct flowi6 fl6 = {
882 .flowi6_iif = skb->dev->ifindex,
885 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
886 .flowi6_mark = skb->mark,
887 .flowi6_proto = iph->nexthdr,
890 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
891 flags |= RT6_LOOKUP_F_IFACE;
893 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
896 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
897 struct flowi6 *fl6, int flags)
899 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
902 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
907 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
908 flags |= RT6_LOOKUP_F_IFACE;
910 if (!ipv6_addr_any(&fl6->saddr))
911 flags |= RT6_LOOKUP_F_HAS_SADDR;
913 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
915 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
918 EXPORT_SYMBOL(ip6_route_output);
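/*
 * Illustrative use of ip6_route_output() (not part of the original file);
 * the caller owns the returned reference and errors are reported through
 * dst->error rather than a NULL return:
 *
 *	struct flowi6 fl6 = { .daddr = *daddr };
 *	struct dst_entry *dst = ip6_route_output(net, sk, &fl6);
 *	if (dst->error) {
 *		err = dst->error;
 *		dst_release(dst);
 *		return err;
 *	}
 */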
920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
922 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
923 struct dst_entry *new = NULL;
925 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
927 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
932 new->input = dst_discard;
933 new->output = dst_discard;
935 if (dst_metrics_read_only(&ort->dst))
936 new->_metrics = ort->dst._metrics;
938 dst_copy_metrics(new, &ort->dst);
939 rt->rt6i_idev = ort->rt6i_idev;
941 in6_dev_hold(rt->rt6i_idev);
942 rt->rt6i_expires = 0;
944 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
945 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
948 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
949 #ifdef CONFIG_IPV6_SUBTREES
950 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
956 dst_release(dst_orig);
957 return new ? new : ERR_PTR(-ENOMEM);
961 * Destination cache support functions
964 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
968 rt = (struct rt6_info *) dst;
970 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
971 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
973 rt6_bind_peer(rt, 0);
974 rt->rt6i_peer_genid = rt6_peer_genid();
981 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
983 struct rt6_info *rt = (struct rt6_info *) dst;
986 if (rt->rt6i_flags & RTF_CACHE) {
987 if (rt6_check_expired(rt)) {
999 static void ip6_link_failure(struct sk_buff *skb)
1001 struct rt6_info *rt;
1003 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1005 rt = (struct rt6_info *) skb_dst(skb);
1007 if (rt->rt6i_flags&RTF_CACHE) {
1008 dst_set_expires(&rt->dst, 0);
1009 rt->rt6i_flags |= RTF_EXPIRES;
1010 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1011 rt->rt6i_node->fn_sernum = -1;
1015 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1017 struct rt6_info *rt6 = (struct rt6_info*)dst;
1019 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1020 rt6->rt6i_flags |= RTF_MODIFIED;
1021 if (mtu < IPV6_MIN_MTU)
1024 dst_metric_set(dst, RTAX_MTU, mtu);
1028 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1030 struct net_device *dev = dst->dev;
1031 unsigned int mtu = dst_mtu(dst);
1032 struct net *net = dev_net(dev);
1034 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1036 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1037 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
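/* e.g. a 1500-byte link MTU yields an advmss of 1500 - 40 - 20 = 1440 */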
1040 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1041 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1042 * IPV6_MAXPLEN is also valid and means: "any MSS,
1043 * rely only on pmtu discovery"
1045 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1050 static unsigned int ip6_mtu(const struct dst_entry *dst)
1052 struct inet6_dev *idev;
1053 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1061 idev = __in6_dev_get(dst->dev);
1063 mtu = idev->cnf.mtu6;
1067 return min_t(unsigned int, mtu, IP6_MAX_MTU);
1070 static struct dst_entry *icmp6_dst_gc_list;
1071 static DEFINE_SPINLOCK(icmp6_dst_lock);
1073 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1074 struct neighbour *neigh,
1075 const struct in6_addr *addr)
1077 struct rt6_info *rt;
1078 struct inet6_dev *idev = in6_dev_get(dev);
1079 struct net *net = dev_net(dev);
1081 if (unlikely(idev == NULL))
1084 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1085 if (unlikely(rt == NULL)) {
1093 neigh = ndisc_get_neigh(dev, addr);
1098 rt->dst.flags |= DST_HOST;
1099 rt->dst.output = ip6_output;
1100 dst_set_neighbour(&rt->dst, neigh);
1101 atomic_set(&rt->dst.__refcnt, 1);
1102 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1103 rt->rt6i_dst.plen = 128;
1104 rt->rt6i_idev = idev;
1105 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1107 spin_lock_bh(&icmp6_dst_lock);
1108 rt->dst.next = icmp6_dst_gc_list;
1109 icmp6_dst_gc_list = &rt->dst;
1110 spin_unlock_bh(&icmp6_dst_lock);
1112 fib6_force_start_gc(net);
1118 int icmp6_dst_gc(void)
1120 struct dst_entry *dst, **pprev;
1123 spin_lock_bh(&icmp6_dst_lock);
1124 pprev = &icmp6_dst_gc_list;
1126 while ((dst = *pprev) != NULL) {
1127 if (!atomic_read(&dst->__refcnt)) {
1136 spin_unlock_bh(&icmp6_dst_lock);
1141 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1144 struct dst_entry *dst, **pprev;
1146 spin_lock_bh(&icmp6_dst_lock);
1147 pprev = &icmp6_dst_gc_list;
1148 while ((dst = *pprev) != NULL) {
1149 struct rt6_info *rt = (struct rt6_info *) dst;
1150 if (func(rt, arg)) {
1157 spin_unlock_bh(&icmp6_dst_lock);
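/* ip6_dst_gc() below skips work when the previous pass ran less than
 * gc_min_interval ago and the cache is still under ip6_rt_max_size;
 * otherwise it runs fib6_run_gc() with an expiry window (ip6_rt_gc_expire)
 * that is bumped on every forced pass, reset to half of gc_timeout once
 * the entry count drops under gc_thresh, and decayed by gc_elasticity. */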
1160 static int ip6_dst_gc(struct dst_ops *ops)
1162 unsigned long now = jiffies;
1163 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1164 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1165 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1166 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1167 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1168 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1171 entries = dst_entries_get_fast(ops);
1172 if (time_after(rt_last_gc + rt_min_interval, now) &&
1173 entries <= rt_max_size)
1176 net->ipv6.ip6_rt_gc_expire++;
1177 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1178 net->ipv6.ip6_rt_last_gc = now;
1179 entries = dst_entries_get_slow(ops);
1180 if (entries < ops->gc_thresh)
1181 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1183 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1184 return entries > rt_max_size;
1187 /* Clean host part of a prefix. Not necessary in radix tree,
1188 but results in cleaner routing tables.
1190 Remove it only when everything else is known to work!
1193 int ip6_dst_hoplimit(struct dst_entry *dst)
1195 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1196 if (hoplimit == 0) {
1197 struct net_device *dev = dst->dev;
1198 struct inet6_dev *idev;
1201 idev = __in6_dev_get(dev);
1203 hoplimit = idev->cnf.hop_limit;
1205 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1210 EXPORT_SYMBOL(ip6_dst_hoplimit);
1216 int ip6_route_add(struct fib6_config *cfg)
1219 struct net *net = cfg->fc_nlinfo.nl_net;
1220 struct rt6_info *rt = NULL;
1221 struct net_device *dev = NULL;
1222 struct inet6_dev *idev = NULL;
1223 struct fib6_table *table;
1226 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1228 #ifndef CONFIG_IPV6_SUBTREES
1229 if (cfg->fc_src_len)
1232 if (cfg->fc_ifindex) {
1234 dev = dev_get_by_index(net, cfg->fc_ifindex);
1237 idev = in6_dev_get(dev);
1242 if (cfg->fc_metric == 0)
1243 cfg->fc_metric = IP6_RT_PRIO_USER;
1245 table = fib6_new_table(net, cfg->fc_table);
1246 if (table == NULL) {
1251 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1258 rt->dst.obsolete = -1;
1259 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1260 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1263 if (cfg->fc_protocol == RTPROT_UNSPEC)
1264 cfg->fc_protocol = RTPROT_BOOT;
1265 rt->rt6i_protocol = cfg->fc_protocol;
1267 addr_type = ipv6_addr_type(&cfg->fc_dst);
1269 if (addr_type & IPV6_ADDR_MULTICAST)
1270 rt->dst.input = ip6_mc_input;
1271 else if (cfg->fc_flags & RTF_LOCAL)
1272 rt->dst.input = ip6_input;
1274 rt->dst.input = ip6_forward;
1276 rt->dst.output = ip6_output;
1278 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1279 rt->rt6i_dst.plen = cfg->fc_dst_len;
1280 if (rt->rt6i_dst.plen == 128)
1281 rt->dst.flags |= DST_HOST;
1283 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1284 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1289 dst_init_metrics(&rt->dst, metrics, 0);
1291 #ifdef CONFIG_IPV6_SUBTREES
1292 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1293 rt->rt6i_src.plen = cfg->fc_src_len;
1296 rt->rt6i_metric = cfg->fc_metric;
1298 /* We cannot add true routes via loopback here,
1299 they would result in kernel looping; promote them to reject routes
1301 if ((cfg->fc_flags & RTF_REJECT) ||
1302 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1303 && !(cfg->fc_flags&RTF_LOCAL))) {
1304 /* hold loopback dev/idev if we haven't done so. */
1305 if (dev != net->loopback_dev) {
1310 dev = net->loopback_dev;
1312 idev = in6_dev_get(dev);
1318 rt->dst.output = ip6_pkt_discard_out;
1319 rt->dst.input = ip6_pkt_discard;
1320 rt->dst.error = -ENETUNREACH;
1321 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1325 if (cfg->fc_flags & RTF_GATEWAY) {
1326 const struct in6_addr *gw_addr;
1329 gw_addr = &cfg->fc_gateway;
1330 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1331 gwa_type = ipv6_addr_type(gw_addr);
1333 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1334 struct rt6_info *grt;
1336 /* IPv6 strictly prohibits using non-link-local
1337 addresses as nexthop addresses.
1338 Otherwise, the router will not be able to send redirects.
1339 It is very good, but in some (rare!) circumstances
1340 (SIT, PtP, NBMA NOARP links) it is handy to allow
1341 some exceptions. --ANK
1344 if (!(gwa_type&IPV6_ADDR_UNICAST))
1347 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1349 err = -EHOSTUNREACH;
1353 if (dev != grt->rt6i_dev) {
1354 dst_release(&grt->dst);
1358 dev = grt->rt6i_dev;
1359 idev = grt->rt6i_idev;
1361 in6_dev_hold(grt->rt6i_idev);
1363 if (!(grt->rt6i_flags&RTF_GATEWAY))
1365 dst_release(&grt->dst);
1371 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1379 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1380 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1384 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1385 rt->rt6i_prefsrc.plen = 128;
1387 rt->rt6i_prefsrc.plen = 0;
1389 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1390 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1395 dst_set_neighbour(&rt->dst, n);
1398 rt->rt6i_flags = cfg->fc_flags;
1405 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1406 int type = nla_type(nla);
1409 if (type > RTAX_MAX) {
1414 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1420 rt->rt6i_idev = idev;
1421 rt->rt6i_table = table;
1423 cfg->fc_nlinfo.nl_net = dev_net(dev);
1425 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1437 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1440 struct fib6_table *table;
1441 struct net *net = dev_net(rt->rt6i_dev);
1443 if (rt == net->ipv6.ip6_null_entry) {
1448 table = rt->rt6i_table;
1449 write_lock_bh(&table->tb6_lock);
1450 err = fib6_del(rt, info);
1451 write_unlock_bh(&table->tb6_lock);
1454 dst_release(&rt->dst);
1458 int ip6_del_rt(struct rt6_info *rt)
1460 struct nl_info info = {
1461 .nl_net = dev_net(rt->rt6i_dev),
1463 return __ip6_del_rt(rt, &info);
1466 static int ip6_route_del(struct fib6_config *cfg)
1468 struct fib6_table *table;
1469 struct fib6_node *fn;
1470 struct rt6_info *rt;
1473 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1477 read_lock_bh(&table->tb6_lock);
1479 fn = fib6_locate(&table->tb6_root,
1480 &cfg->fc_dst, cfg->fc_dst_len,
1481 &cfg->fc_src, cfg->fc_src_len);
1484 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1485 if (cfg->fc_ifindex &&
1486 (rt->rt6i_dev == NULL ||
1487 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1489 if (cfg->fc_flags & RTF_GATEWAY &&
1490 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1492 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1495 read_unlock_bh(&table->tb6_lock);
1497 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1500 read_unlock_bh(&table->tb6_lock);
1508 struct ip6rd_flowi {
1510 struct in6_addr gateway;
1513 static struct rt6_info *__ip6_route_redirect(struct net *net,
1514 struct fib6_table *table,
1518 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1519 struct rt6_info *rt;
1520 struct fib6_node *fn;
1523 * Get the "current" route for this destination and
1524 * check if the redirect has come from an appropriate router.
1526 * RFC 2461 specifies that redirects should only be
1527 * accepted if they come from the nexthop to the target.
1528 * Due to the way the routes are chosen, this notion
1529 * is a bit fuzzy and one might need to check all possible
1533 read_lock_bh(&table->tb6_lock);
1534 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1536 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1538 * Current route is on-link; redirect is always invalid.
1540 * It seems the previous statement is not true. It could
1541 * be a node which regards us as on-link (e.g. proxy ndisc),
1542 * but then the router serving it might decide that we should
1543 * know the truth 8)8) --ANK (980726).
1545 if (rt6_check_expired(rt))
1547 if (!(rt->rt6i_flags & RTF_GATEWAY))
1549 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1551 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1557 rt = net->ipv6.ip6_null_entry;
1558 BACKTRACK(net, &fl6->saddr);
1562 read_unlock_bh(&table->tb6_lock);
1567 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1568 const struct in6_addr *src,
1569 const struct in6_addr *gateway,
1570 struct net_device *dev)
1572 int flags = RT6_LOOKUP_F_HAS_SADDR;
1573 struct net *net = dev_net(dev);
1574 struct ip6rd_flowi rdfl = {
1576 .flowi6_oif = dev->ifindex,
1582 ipv6_addr_copy(&rdfl.gateway, gateway);
1584 if (rt6_need_strict(dest))
1585 flags |= RT6_LOOKUP_F_IFACE;
1587 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1588 flags, __ip6_route_redirect);
1591 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1592 const struct in6_addr *saddr,
1593 struct neighbour *neigh, u8 *lladdr, int on_link)
1595 struct rt6_info *rt, *nrt = NULL;
1596 struct netevent_redirect netevent;
1597 struct net *net = dev_net(neigh->dev);
1599 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1601 if (rt == net->ipv6.ip6_null_entry) {
1602 if (net_ratelimit())
1603 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1604 "for redirect target\n");
1609 * We have finally decided to accept it.
1612 neigh_update(neigh, lladdr, NUD_STALE,
1613 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1614 NEIGH_UPDATE_F_OVERRIDE|
1615 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1616 NEIGH_UPDATE_F_ISROUTER))
1620 * Redirect received -> path was valid.
1621 * Look, redirects are sent only in response to data packets,
1622 * so that this nexthop apparently is reachable. --ANK
1624 dst_confirm(&rt->dst);
1626 /* Duplicate redirect: silently ignore. */
1627 if (neigh == dst_get_neighbour_raw(&rt->dst))
1630 nrt = ip6_rt_copy(rt, dest);
1634 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1636 nrt->rt6i_flags &= ~RTF_GATEWAY;
1638 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1639 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1641 if (ip6_ins_rt(nrt))
1644 netevent.old = &rt->dst;
1645 netevent.new = &nrt->dst;
1646 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1648 if (rt->rt6i_flags&RTF_CACHE) {
1654 dst_release(&rt->dst);
1658 * Handle ICMP "packet too big" messages
1659 * i.e. Path MTU discovery
1662 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1663 struct net *net, u32 pmtu, int ifindex)
1665 struct rt6_info *rt, *nrt;
1668 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1672 if (rt6_check_expired(rt)) {
1677 if (pmtu >= dst_mtu(&rt->dst))
1680 if (pmtu < IPV6_MIN_MTU) {
1682 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1683 * MTU (1280) and a fragment header should always be included
1684 * after a node receiving Too Big message reporting PMTU is
1685 * less than the IPv6 Minimum Link MTU.
1687 pmtu = IPV6_MIN_MTU;
1691 /* New mtu received -> path was valid.
1692 They are sent only in response to data packets,
1693 so that this nexthop apparently is reachable. --ANK
1695 dst_confirm(&rt->dst);
1697 /* Host route. If it is static, it would be better
1698 not to override it but to add a new one, so that
1699 when the cache entry expires the old pmtu
1700 is restored automatically.
1702 if (rt->rt6i_flags & RTF_CACHE) {
1703 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1705 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1706 features |= RTAX_FEATURE_ALLFRAG;
1707 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1709 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1710 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1715 Two cases are possible:
1716 1. It is a connected route. Action: COW it.
1717 2. It is a gatewayed or NONEXTHOP route. Action: clone it.
1719 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1720 nrt = rt6_alloc_cow(rt, daddr, saddr);
1722 nrt = rt6_alloc_clone(rt, daddr);
1725 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1727 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1728 features |= RTAX_FEATURE_ALLFRAG;
1729 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1732 /* According to RFC 1981, probing for a PMTU increase shouldn't
1733 * happen within 5 minutes; the recommended timer is 10 minutes.
1734 * Here the route expiration time is set to ip6_rt_mtu_expires,
1735 * which defaults to 10 minutes. After that the decreased pmtu expires
1736 * and a PMTU increase can be detected automatically.
1738 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1739 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1744 dst_release(&rt->dst);
1747 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1748 struct net_device *dev, u32 pmtu)
1750 struct net *net = dev_net(dev);
1753 * RFC 1981 states that a node "MUST reduce the size of the packets it
1754 * is sending along the path" that caused the Packet Too Big message.
1755 * Since it's not possible in the general case to determine which
1756 * interface was used to send the original packet, we update the MTU
1757 * on the interface that will be used to send future packets. We also
1758 * update the MTU on the interface that received the Packet Too Big in
1759 * case the original packet was forced out that interface with
1760 * SO_BINDTODEVICE or similar. This is the next best thing to the
1761 * correct behaviour, which would be to update the MTU on all
1764 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1765 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1769 * Misc support functions
1772 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1773 const struct in6_addr *dest)
1775 struct net *net = dev_net(ort->rt6i_dev);
1776 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1780 rt->dst.input = ort->dst.input;
1781 rt->dst.output = ort->dst.output;
1782 rt->dst.flags |= DST_HOST;
1784 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1785 rt->rt6i_dst.plen = 128;
1786 dst_copy_metrics(&rt->dst, &ort->dst);
1787 rt->dst.error = ort->dst.error;
1788 rt->rt6i_idev = ort->rt6i_idev;
1790 in6_dev_hold(rt->rt6i_idev);
1791 rt->dst.lastuse = jiffies;
1792 rt->rt6i_expires = 0;
1794 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1795 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1796 rt->rt6i_metric = 0;
1798 #ifdef CONFIG_IPV6_SUBTREES
1799 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1801 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1802 rt->rt6i_table = ort->rt6i_table;
1807 #ifdef CONFIG_IPV6_ROUTE_INFO
1808 static struct rt6_info *rt6_get_route_info(struct net *net,
1809 const struct in6_addr *prefix, int prefixlen,
1810 const struct in6_addr *gwaddr, int ifindex)
1812 struct fib6_node *fn;
1813 struct rt6_info *rt = NULL;
1814 struct fib6_table *table;
1816 table = fib6_get_table(net, RT6_TABLE_INFO);
1820 write_lock_bh(&table->tb6_lock);
1821 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
1825 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1826 if (rt->rt6i_dev->ifindex != ifindex)
1828 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1830 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1836 write_unlock_bh(&table->tb6_lock);
1840 static struct rt6_info *rt6_add_route_info(struct net *net,
1841 const struct in6_addr *prefix, int prefixlen,
1842 const struct in6_addr *gwaddr, int ifindex,
1845 struct fib6_config cfg = {
1846 .fc_table = RT6_TABLE_INFO,
1847 .fc_metric = IP6_RT_PRIO_USER,
1848 .fc_ifindex = ifindex,
1849 .fc_dst_len = prefixlen,
1850 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1851 RTF_UP | RTF_PREF(pref),
1853 .fc_nlinfo.nlh = NULL,
1854 .fc_nlinfo.nl_net = net,
1857 ipv6_addr_copy(&cfg.fc_dst, prefix);
1858 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1860 /* We should treat it as a default route if prefix length is 0. */
1862 cfg.fc_flags |= RTF_DEFAULT;
1864 ip6_route_add(&cfg);
1866 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1870 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1872 struct rt6_info *rt;
1873 struct fib6_table *table;
1875 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1879 write_lock_bh(&table->tb6_lock);
1880 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1881 if (dev == rt->rt6i_dev &&
1882 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1883 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1888 write_unlock_bh(&table->tb6_lock);
1892 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1893 struct net_device *dev,
1896 struct fib6_config cfg = {
1897 .fc_table = RT6_TABLE_DFLT,
1898 .fc_metric = IP6_RT_PRIO_USER,
1899 .fc_ifindex = dev->ifindex,
1900 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1901 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1903 .fc_nlinfo.nlh = NULL,
1904 .fc_nlinfo.nl_net = dev_net(dev),
1907 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1909 ip6_route_add(&cfg);
1911 return rt6_get_dflt_router(gwaddr, dev);
1914 void rt6_purge_dflt_routers(struct net *net)
1916 struct rt6_info *rt;
1917 struct fib6_table *table;
1919 /* NOTE: Keep consistent with rt6_get_dflt_router */
1920 table = fib6_get_table(net, RT6_TABLE_DFLT);
1925 read_lock_bh(&table->tb6_lock);
1926 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1927 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1928 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1930 read_unlock_bh(&table->tb6_lock);
1935 read_unlock_bh(&table->tb6_lock);
1938 static void rtmsg_to_fib6_config(struct net *net,
1939 struct in6_rtmsg *rtmsg,
1940 struct fib6_config *cfg)
1942 memset(cfg, 0, sizeof(*cfg));
1944 cfg->fc_table = RT6_TABLE_MAIN;
1945 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1946 cfg->fc_metric = rtmsg->rtmsg_metric;
1947 cfg->fc_expires = rtmsg->rtmsg_info;
1948 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1949 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1950 cfg->fc_flags = rtmsg->rtmsg_flags;
1952 cfg->fc_nlinfo.nl_net = net;
1954 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1955 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1956 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1959 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1961 struct fib6_config cfg;
1962 struct in6_rtmsg rtmsg;
1966 case SIOCADDRT: /* Add a route */
1967 case SIOCDELRT: /* Delete a route */
1968 if (!capable(CAP_NET_ADMIN))
1970 err = copy_from_user(&rtmsg, arg,
1971 sizeof(struct in6_rtmsg));
1975 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1980 err = ip6_route_add(&cfg);
1983 err = ip6_route_del(&cfg);
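/*
 * Illustrative userspace counterpart of the SIOCADDRT case above (not
 * part of this file). Requires CAP_NET_ADMIN; the prefix and interface
 * name are placeholders and error handling is omitted:
 *
 *	struct in6_rtmsg rtm = {};
 *	int fd = socket(AF_INET6, SOCK_DGRAM, 0);
 *
 *	inet_pton(AF_INET6, "2001:db8::", &rtm.rtmsg_dst);
 *	rtm.rtmsg_dst_len = 32;
 *	rtm.rtmsg_metric  = 1;
 *	rtm.rtmsg_flags   = RTF_UP;
 *	rtm.rtmsg_ifindex = if_nametoindex("eth0");
 *	ioctl(fd, SIOCADDRT, &rtm);
 */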
1997 * Drop the packet on the floor
2000 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2003 struct dst_entry *dst = skb_dst(skb);
2004 switch (ipstats_mib_noroutes) {
2005 case IPSTATS_MIB_INNOROUTES:
2006 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2007 if (type == IPV6_ADDR_ANY) {
2008 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2009 IPSTATS_MIB_INADDRERRORS);
2013 case IPSTATS_MIB_OUTNOROUTES:
2014 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2015 ipstats_mib_noroutes);
2018 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2023 static int ip6_pkt_discard(struct sk_buff *skb)
2025 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2028 static int ip6_pkt_discard_out(struct sk_buff *skb)
2030 skb->dev = skb_dst(skb)->dev;
2031 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2034 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2036 static int ip6_pkt_prohibit(struct sk_buff *skb)
2038 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2041 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2043 skb->dev = skb_dst(skb)->dev;
2044 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2050 * Allocate a dst for local (unicast / anycast) address.
2053 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2054 const struct in6_addr *addr,
2057 struct net *net = dev_net(idev->dev);
2058 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2059 net->loopback_dev, DST_NOCOUNT);
2060 struct neighbour *neigh;
2063 return ERR_PTR(-ENOMEM);
2067 rt->dst.flags |= DST_HOST;
2068 rt->dst.input = ip6_input;
2069 rt->dst.output = ip6_output;
2070 rt->rt6i_idev = idev;
2071 rt->dst.obsolete = -1;
2073 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2075 rt->rt6i_flags |= RTF_ANYCAST;
2077 rt->rt6i_flags |= RTF_LOCAL;
2078 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2079 if (IS_ERR(neigh)) {
2082 return ERR_CAST(neigh);
2084 dst_set_neighbour(&rt->dst, neigh);
2086 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2087 rt->rt6i_dst.plen = 128;
2088 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2090 atomic_set(&rt->dst.__refcnt, 1);
2095 int ip6_route_get_saddr(struct net *net,
2096 struct rt6_info *rt,
2097 const struct in6_addr *daddr,
2099 struct in6_addr *saddr)
2101 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2103 if (rt->rt6i_prefsrc.plen)
2104 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2106 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2107 daddr, prefs, saddr);
2111 /* remove deleted ip from prefsrc entries */
2112 struct arg_dev_net_ip {
2113 struct net_device *dev;
2115 struct in6_addr *addr;
2118 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2120 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2121 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2122 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2124 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2125 rt != net->ipv6.ip6_null_entry &&
2126 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2127 /* remove prefsrc entry */
2128 rt->rt6i_prefsrc.plen = 0;
2133 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2135 struct net *net = dev_net(ifp->idev->dev);
2136 struct arg_dev_net_ip adni = {
2137 .dev = ifp->idev->dev,
2141 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2144 struct arg_dev_net {
2145 struct net_device *dev;
2149 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2151 const struct arg_dev_net *adn = arg;
2152 const struct net_device *dev = adn->dev;
2154 if ((rt->rt6i_dev == dev || dev == NULL) &&
2155 rt != adn->net->ipv6.ip6_null_entry) {
2156 RT6_TRACE("deleted by ifdown %p\n", rt);
2162 void rt6_ifdown(struct net *net, struct net_device *dev)
2164 struct arg_dev_net adn = {
2169 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2170 icmp6_clean_all(fib6_ifdown, &adn);
2173 struct rt6_mtu_change_arg
2175 struct net_device *dev;
2179 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2181 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2182 struct inet6_dev *idev;
2184 /* In IPv6, pmtu discovery is not optional,
2185 so the RTAX_MTU lock cannot disable it.
2186 We still use this lock to block changes
2187 caused by addrconf/ndisc.
2190 idev = __in6_dev_get(arg->dev);
2194 /* For an administrative MTU increase, there is no way to discover
2195 an IPv6 PMTU increase, so the PMTU must be updated here.
2196 Since RFC 1981 doesn't cover administrative MTU increases,
2197 updating the PMTU on increase is a MUST here (e.g. jumbo frames).
2200 If the new MTU is less than the route PMTU, the new MTU will be the
2201 lowest MTU in the path; update the route PMTU to reflect the
2202 decrease. If the new MTU is greater than the route PMTU, and the
2203 old MTU was the lowest MTU in the path, update the route PMTU
2204 to reflect the increase. In that case, if the other nodes' MTU
2205 is also the lowest in the path, a Packet Too Big message will lead to
2208 if (rt->rt6i_dev == arg->dev &&
2209 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2210 (dst_mtu(&rt->dst) >= arg->mtu ||
2211 (dst_mtu(&rt->dst) < arg->mtu &&
2212 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2213 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2218 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2220 struct rt6_mtu_change_arg arg = {
2225 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2228 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2229 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2230 [RTA_OIF] = { .type = NLA_U32 },
2231 [RTA_IIF] = { .type = NLA_U32 },
2232 [RTA_PRIORITY] = { .type = NLA_U32 },
2233 [RTA_METRICS] = { .type = NLA_NESTED },
2236 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2237 struct fib6_config *cfg)
2240 struct nlattr *tb[RTA_MAX+1];
2243 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2248 rtm = nlmsg_data(nlh);
2249 memset(cfg, 0, sizeof(*cfg));
2251 cfg->fc_table = rtm->rtm_table;
2252 cfg->fc_dst_len = rtm->rtm_dst_len;
2253 cfg->fc_src_len = rtm->rtm_src_len;
2254 cfg->fc_flags = RTF_UP;
2255 cfg->fc_protocol = rtm->rtm_protocol;
2257 if (rtm->rtm_type == RTN_UNREACHABLE)
2258 cfg->fc_flags |= RTF_REJECT;
2260 if (rtm->rtm_type == RTN_LOCAL)
2261 cfg->fc_flags |= RTF_LOCAL;
2263 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2264 cfg->fc_nlinfo.nlh = nlh;
2265 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2267 if (tb[RTA_GATEWAY]) {
2268 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2269 cfg->fc_flags |= RTF_GATEWAY;
2273 int plen = (rtm->rtm_dst_len + 7) >> 3;
2275 if (nla_len(tb[RTA_DST]) < plen)
2278 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2282 int plen = (rtm->rtm_src_len + 7) >> 3;
2284 if (nla_len(tb[RTA_SRC]) < plen)
2287 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2290 if (tb[RTA_PREFSRC])
2291 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2294 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2296 if (tb[RTA_PRIORITY])
2297 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2299 if (tb[RTA_METRICS]) {
2300 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2301 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2305 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2312 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2314 struct fib6_config cfg;
2317 err = rtm_to_fib6_config(skb, nlh, &cfg);
2321 return ip6_route_del(&cfg);
2324 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2326 struct fib6_config cfg;
2329 err = rtm_to_fib6_config(skb, nlh, &cfg);
2333 return ip6_route_add(&cfg);
2336 static inline size_t rt6_nlmsg_size(void)
2338 return NLMSG_ALIGN(sizeof(struct rtmsg))
2339 + nla_total_size(16) /* RTA_SRC */
2340 + nla_total_size(16) /* RTA_DST */
2341 + nla_total_size(16) /* RTA_GATEWAY */
2342 + nla_total_size(16) /* RTA_PREFSRC */
2343 + nla_total_size(4) /* RTA_TABLE */
2344 + nla_total_size(4) /* RTA_IIF */
2345 + nla_total_size(4) /* RTA_OIF */
2346 + nla_total_size(4) /* RTA_PRIORITY */
2347 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2348 + nla_total_size(sizeof(struct rta_cacheinfo));
2351 static int rt6_fill_node(struct net *net,
2352 struct sk_buff *skb, struct rt6_info *rt,
2353 struct in6_addr *dst, struct in6_addr *src,
2354 int iif, int type, u32 pid, u32 seq,
2355 int prefix, int nowait, unsigned int flags)
2358 struct nlmsghdr *nlh;
2361 struct neighbour *n;
2363 if (prefix) { /* user wants prefix routes only */
2364 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2365 /* success since this is not a prefix route */
2370 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2374 rtm = nlmsg_data(nlh);
2375 rtm->rtm_family = AF_INET6;
2376 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2377 rtm->rtm_src_len = rt->rt6i_src.plen;
2380 table = rt->rt6i_table->tb6_id;
2382 table = RT6_TABLE_UNSPEC;
2383 rtm->rtm_table = table;
2384 NLA_PUT_U32(skb, RTA_TABLE, table);
2385 if (rt->rt6i_flags&RTF_REJECT)
2386 rtm->rtm_type = RTN_UNREACHABLE;
2387 else if (rt->rt6i_flags&RTF_LOCAL)
2388 rtm->rtm_type = RTN_LOCAL;
2389 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2390 rtm->rtm_type = RTN_LOCAL;
2392 rtm->rtm_type = RTN_UNICAST;
2394 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2395 rtm->rtm_protocol = rt->rt6i_protocol;
2396 if (rt->rt6i_flags&RTF_DYNAMIC)
2397 rtm->rtm_protocol = RTPROT_REDIRECT;
2398 else if (rt->rt6i_flags & RTF_ADDRCONF)
2399 rtm->rtm_protocol = RTPROT_KERNEL;
2400 else if (rt->rt6i_flags&RTF_DEFAULT)
2401 rtm->rtm_protocol = RTPROT_RA;
2403 if (rt->rt6i_flags&RTF_CACHE)
2404 rtm->rtm_flags |= RTM_F_CLONED;
2407 NLA_PUT(skb, RTA_DST, 16, dst);
2408 rtm->rtm_dst_len = 128;
2409 } else if (rtm->rtm_dst_len)
2410 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2411 #ifdef CONFIG_IPV6_SUBTREES
2413 NLA_PUT(skb, RTA_SRC, 16, src);
2414 rtm->rtm_src_len = 128;
2415 } else if (rtm->rtm_src_len)
2416 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2419 #ifdef CONFIG_IPV6_MROUTE
2420 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2421 int err = ip6mr_get_route(net, skb, rtm, nowait);
2426 goto nla_put_failure;
2428 if (err == -EMSGSIZE)
2429 goto nla_put_failure;
2434 NLA_PUT_U32(skb, RTA_IIF, iif);
2436 struct in6_addr saddr_buf;
2437 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2438 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2441 if (rt->rt6i_prefsrc.plen) {
2442 struct in6_addr saddr_buf;
2443 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2444 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2447 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2448 goto nla_put_failure;
2451 n = dst_get_neighbour(&rt->dst);
2453 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2455 goto nla_put_failure;
2461 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2463 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2465 if (!(rt->rt6i_flags & RTF_EXPIRES))
2467 else if (rt->rt6i_expires - jiffies < INT_MAX)
2468 expires = rt->rt6i_expires - jiffies;
2472 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2473 expires, rt->dst.error) < 0)
2474 goto nla_put_failure;
2476 return nlmsg_end(skb, nlh);
2479 nlmsg_cancel(skb, nlh);
2483 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2485 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2488 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2489 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2490 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2494 return rt6_fill_node(arg->net,
2495 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2496 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2497 prefix, 0, NLM_F_MULTI);
2500 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2502 struct net *net = sock_net(in_skb->sk);
2503 struct nlattr *tb[RTA_MAX+1];
2504 struct rt6_info *rt;
2505 struct sk_buff *skb;
2510 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2515 memset(&fl6, 0, sizeof(fl6));
2518 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2521 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2525 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2528 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2532 iif = nla_get_u32(tb[RTA_IIF]);
2535 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2538 struct net_device *dev;
2539 dev = __dev_get_by_index(net, iif);
2546 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2552 /* Reserve room for dummy headers; this skb can pass
2553 through a good chunk of the routing engine.
2555 skb_reset_mac_header(skb);
2556 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2558 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2559 skb_dst_set(skb, &rt->dst);
2561 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2562 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2563 nlh->nlmsg_seq, 0, 0, 0);
2569 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2574 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2576 struct sk_buff *skb;
2577 struct net *net = info->nl_net;
2582 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2584 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2588 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2589 event, info->pid, seq, 0, 0, 0);
2591 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2592 WARN_ON(err == -EMSGSIZE);
2596 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2597 info->nlh, gfp_any());
2601 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2604 static int ip6_route_dev_notify(struct notifier_block *this,
2605 unsigned long event, void *data)
2607 struct net_device *dev = (struct net_device *)data;
2608 struct net *net = dev_net(dev);
2610 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2611 net->ipv6.ip6_null_entry->dst.dev = dev;
2612 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2613 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2614 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2615 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2616 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2617 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2628 #ifdef CONFIG_PROC_FS
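/* Each line that rt6_info_route() emits to /proc/net/ipv6_route is:
 *	dst dst_plen src src_plen next_hop metric refcnt use flags devname
 * with addresses printed as raw hex and metric/refcnt/use/flags as %08x. */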
static int rt6_info_route(struct rt6_info *rt, void *p_arg)
{
	struct seq_file *m = p_arg;
	struct neighbour *n;

	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
#else
	seq_puts(m, "00000000000000000000000000000000 00 ");
#endif
	rcu_read_lock();
	n = dst_get_neighbour(&rt->dst);
	if (n)
		seq_printf(m, "%pi6", n->primary_key);
	else
		seq_puts(m, "00000000000000000000000000000000");
	rcu_read_unlock();

	seq_printf(m, " %08x %08x %08x %08x %8s\n",
		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
		   rt->dst.__use, rt->rt6i_flags,
		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
	return 0;
}
static int ipv6_route_show(struct seq_file *m, void *v)
{
	struct net *net = (struct net *)m->private;

	fib6_clean_all(net, rt6_info_route, 0, m);
	return 0;
}

static int ipv6_route_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, ipv6_route_show);
}
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
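/*
 * /proc/net/rt6_stats: seven hex fields -- FIB nodes, route nodes,
 * allocated routes, route entries, cached routes, the current number
 * of dst entries and discarded routes.
 */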
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
static const struct file_operations rt6_stats_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= rt6_stats_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
#endif	/* CONFIG_PROC_FS */
#ifdef CONFIG_SYSCTL
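/*
 * net.ipv6.route.flush is write-only: writing to it kicks off a
 * garbage-collection pass over the FIB via fib6_run_gc(), using the
 * stored flush_delay value as the aging parameter.
 */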
static
int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
	return 0;
}
ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};
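/*
 * Per-namespace copy of the template above: the .data pointers are
 * re-targeted at the namespace's own sysctl storage so each netns gets
 * independent tunables.
 */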
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
	}

	return table;
}
#endif	/* CONFIG_SYSCTL */
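/*
 * Per-namespace initialisation: clone the dst_ops template, allocate
 * the special null/prohibit/blackhole route entries and seed the
 * default garbage-collection tunables.
 */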
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
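/*
 * Boot-time initialisation: set up the dst slab cache, register the
 * per-netns operations, wire init_net's special routes to the loopback
 * device, then register the rtnetlink handlers and the netdevice
 * notifier.  Failures unwind in reverse order through the labels below.
 */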
int __init ip6_route_init(void)
{
	int ret;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_dst_entries;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
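/*
 * Tear-down path: undo everything ip6_route_init() set up, in the
 * reverse order of registration.
 */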
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}