2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
/*
 * Debug tracing helpers.  NOTE(review): the original #if RT6_DEBUG >= 3 /
 * #else / #endif guards are elided in this listing (see the line-number
 * gaps), which is why two RT6_TRACE definitions appear back to back:
 * the first is the tracing variant, the second the no-op fallback.
 */
65 /* Set to 3 to get tracing. */
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
73 #define RT6_TRACE(x...) do { ; } while (0)
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77 const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void ip6_dst_destroy(struct dst_entry *);
83 static void ip6_dst_ifdown(struct dst_entry *,
84 struct net_device *dev, int how);
85 static int ip6_dst_gc(struct dst_ops *ops);
87 static int ip6_pkt_discard(struct sk_buff *skb);
88 static int ip6_pkt_discard_out(struct sk_buff *skb);
89 static void ip6_link_failure(struct sk_buff *skb);
90 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94 const struct in6_addr *prefix, int prefixlen,
95 const struct in6_addr *gwaddr, int ifindex,
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98 const struct in6_addr *prefix, int prefixlen,
99 const struct in6_addr *gwaddr, int ifindex);
/*
 * Copy-on-write of dst metrics for IPv6 routes.  Non-host routes fall back
 * to the generic COW helper; host routes bind an inet_peer and attempt to
 * install the peer's metrics via cmpxchg on dst->_metrics.
 * NOTE(review): interior lines (including the declaration/assignment of
 * 'p' and the function's braces) are elided in this listing.
 */
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
104 struct rt6_info *rt = (struct rt6_info *) dst;
105 struct inet_peer *peer;
108 if (!(rt->dst.flags & DST_HOST))
109 return dst_cow_metrics_generic(dst, old);
112 rt6_bind_peer(rt, 1);
114 peer = rt->rt6i_peer;
116 u32 *old_p = __DST_METRICS_PTR(old);
117 unsigned long prev, new;
120 if (inet_metrics_new(peer))
121 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
123 new = (unsigned long) p;
124 prev = cmpxchg(&dst->_metrics, old, new);
127 p = __DST_METRICS_PTR(prev);
128 if (prev & DST_METRICS_READ_ONLY)
/*
 * Neighbour lookup callback: resolve daddr via the IPv6 neighbour table
 * (nd_tbl) on the dst's device.
 */
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
137 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
/*
 * dst_ops vtable for ordinary IPv6 routes; per-namespace copies are made
 * from this template.  Some initializer lines are elided in this listing.
 */
140 static struct dst_ops ip6_dst_ops_template = {
142 .protocol = cpu_to_be16(ETH_P_IPV6),
145 .check = ip6_dst_check,
146 .default_advmss = ip6_default_advmss,
148 .cow_metrics = ipv6_cow_metrics,
149 .destroy = ip6_dst_destroy,
150 .ifdown = ip6_dst_ifdown,
151 .negative_advice = ip6_negative_advice,
152 .link_failure = ip6_link_failure,
153 .update_pmtu = ip6_rt_update_pmtu,
154 .local_out = __ip6_local_out,
155 .neigh_lookup = ip6_neigh_lookup,
/*
 * Blackhole dst helpers: used for routes cloned by ip6_blackhole_route().
 * The MTU callback returns the raw RTAX_MTU metric, falling back to the
 * device MTU when unset; update_pmtu and cow_metrics bodies are elided
 * here (presumably no-ops for blackhole dsts — confirm against full source).
 */
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
160 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
162 return mtu ? : dst->dev->mtu;
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops vtable for blackhole clones (some initializer lines elided). */
175 static struct dst_ops ip6_dst_blackhole_ops = {
177 .protocol = cpu_to_be16(ETH_P_IPV6),
178 .destroy = ip6_dst_destroy,
179 .check = ip6_dst_check,
180 .mtu = ip6_blackhole_mtu,
181 .default_advmss = ip6_default_advmss,
182 .update_pmtu = ip6_rt_blackhole_update_pmtu,
183 .cow_metrics = ip6_rt_blackhole_cow_metrics,
184 .neigh_lookup = ip6_neigh_lookup,
/* Metrics template shared by the special route entries below. */
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188 [RTAX_HOPLIMIT - 1] = 0,
/*
 * Template for the per-namespace null route: a reject entry returning
 * -ENETUNREACH with maximal metric so it never beats a real route.
 * Some initializer lines are elided in this listing.
 */
191 static struct rt6_info ip6_null_entry_template = {
193 .__refcnt = ATOMIC_INIT(1),
196 .error = -ENETUNREACH,
197 .input = ip6_pkt_discard,
198 .output = ip6_pkt_discard_out,
200 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
201 .rt6i_protocol = RTPROT_KERNEL,
202 .rt6i_metric = ~(u32) 0,
203 .rt6i_ref = ATOMIC_INIT(1),
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
208 static int ip6_pkt_prohibit(struct sk_buff *skb);
209 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
/*
 * Policy-routing special entries (only with multiple tables): a prohibit
 * route and a blackhole route, mirroring the null-entry template above.
 * Some initializer lines (e.g. .error) are elided in this listing.
 */
211 static struct rt6_info ip6_prohibit_entry_template = {
213 .__refcnt = ATOMIC_INIT(1),
217 .input = ip6_pkt_prohibit,
218 .output = ip6_pkt_prohibit_out,
220 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
221 .rt6i_protocol = RTPROT_KERNEL,
222 .rt6i_metric = ~(u32) 0,
223 .rt6i_ref = ATOMIC_INIT(1),
/* Blackhole: silently discard in both directions via dst_discard. */
226 static struct rt6_info ip6_blk_hole_entry_template = {
228 .__refcnt = ATOMIC_INIT(1),
232 .input = dst_discard,
233 .output = dst_discard,
235 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
236 .rt6i_protocol = RTPROT_KERNEL,
237 .rt6i_metric = ~(u32) 0,
238 .rt6i_ref = ATOMIC_INIT(1),
243 /* allocate dst with ip6_dst_ops */
/*
 * Wrapper around dst_alloc() that zeroes the rt6_info-specific tail of the
 * structure (everything after the embedded dst_entry), starting at
 * rt6i_table.  The NULL-check around the memset is elided in this listing.
 */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245 struct net_device *dev,
248 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
251 memset(&rt->rt6i_table, 0,
252 sizeof(*rt) - sizeof(struct dst_entry));
/*
 * dst_ops->destroy: release resources attached to an rt6_info — generic
 * metrics (only for non-host routes; host routes share peer metrics), the
 * inet6_dev reference, and the inet_peer.  The in6_dev_put()/inet_putpeer()
 * calls themselves are elided in this listing.
 */
257 static void ip6_dst_destroy(struct dst_entry *dst)
259 struct rt6_info *rt = (struct rt6_info *)dst;
260 struct inet6_dev *idev = rt->rt6i_idev;
261 struct inet_peer *peer = rt->rt6i_peer;
263 if (!(rt->dst.flags & DST_HOST))
264 dst_destroy_metrics_generic(dst);
267 rt->rt6i_idev = NULL;
271 rt->rt6i_peer = NULL;
/* Generation counter used to invalidate cached peer bindings. */
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
278 static u32 rt6_peer_genid(void)
280 return atomic_read(&__rt6_peer_genid);
/*
 * Attach an inet_peer for this route's destination.  cmpxchg makes the
 * binding race-free: if another CPU installed a peer first, ours is
 * dropped (the inet_putpeer call on that path is elided in this listing).
 */
283 void rt6_bind_peer(struct rt6_info *rt, int create)
285 struct inet_peer *peer;
287 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
291 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops->ifdown: when the route's device goes away, re-point the idev
 * reference at the namespace loopback device so the dst stays usable until
 * it is garbage-collected.  The in6_dev_put() of the old idev is elided
 * in this listing.
 */
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
297 struct rt6_info *rt = (struct rt6_info *)dst;
298 struct inet6_dev *idev = rt->rt6i_idev;
299 struct net_device *loopback_dev =
300 dev_net(dev)->loopback_dev;
302 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303 struct inet6_dev *loopback_idev =
304 in6_dev_get(loopback_dev);
305 if (loopback_idev != NULL) {
306 rt->rt6i_idev = loopback_idev;
/* True when the route carries RTF_EXPIRES and its deadline has passed. */
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
314 return (rt->rt6i_flags & RTF_EXPIRES) &&
315 time_after(jiffies, rt->rt6i_expires);
/*
 * True when the destination's scope forces a strict interface match
 * (multicast, link-local or loopback addresses).
 */
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
320 return ipv6_addr_type(daddr) &
321 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)
325 * Route lookup. Any table->tb6_lock is implied.
/*
 * Walk the sibling routes of a fib6 leaf and pick the one matching the
 * requested output interface (oif) and/or source address.  Loopback
 * routes are remembered in 'local' as a fallback.  Several continue/return
 * lines of the matching logic are elided in this listing, so the exact
 * fallthrough behaviour must be confirmed against the full source.
 */
328 static inline struct rt6_info *rt6_device_match(struct net *net,
330 const struct in6_addr *saddr,
334 struct rt6_info *local = NULL;
335 struct rt6_info *sprt;
337 if (!oif && ipv6_addr_any(saddr))
340 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 struct net_device *dev = sprt->rt6i_dev;
344 if (dev->ifindex == oif)
346 if (dev->flags & IFF_LOOPBACK) {
347 if (sprt->rt6i_idev == NULL ||
348 sprt->rt6i_idev->dev->ifindex != oif) {
349 if (flags & RT6_LOOKUP_F_IFACE && oif)
351 if (local && (!oif ||
352 local->rt6i_idev->dev->ifindex == oif))
358 if (ipv6_chk_addr(net, saddr, dev,
359 flags & RT6_LOOKUP_F_IFACE))
/* No match: with strict interface semantics, fail to the null route. */
368 if (flags & RT6_LOOKUP_F_IFACE)
369 return net->ipv6.ip6_null_entry;
375 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing (RFC 4191): if the route's neighbour entry
 * is not in a VALID state and the probe interval has elapsed, emit a
 * unicast-solicitation NS to the router.  Rate-limited to one probe per
 * rtr_probe_interval by stamping neigh->updated before sending.
 */
376 static void rt6_probe(struct rt6_info *rt)
378 struct neighbour *neigh;
380 * Okay, this does not seem to be appropriate
381 * for now, however, we need to check if it
382 * is really so; aka Router Reachability Probing.
384 * Router Reachability Probe MUST be rate-limited
385 * to no more than one per minute.
388 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
389 if (!neigh || (neigh->nud_state & NUD_VALID))
391 read_lock_bh(&neigh->lock);
392 if (!(neigh->nud_state & NUD_VALID) &&
393 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 struct in6_addr mcaddr;
395 struct in6_addr *target;
397 neigh->updated = jiffies;
398 read_unlock_bh(&neigh->lock);
400 target = (struct in6_addr *)&neigh->primary_key;
401 addrconf_addr_solict_mult(target, &mcaddr);
402 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
404 read_unlock_bh(&neigh->lock);
/* Stub when CONFIG_IPV6_ROUTER_PREF is disabled (the #else is elided). */
410 static inline void rt6_probe(struct rt6_info *rt)
416 * Default Router Selection (RFC 2461 6.3.6)
/*
 * Device-match component of router scoring: accept when no oif constraint,
 * when the route's device is the requested one, or when it is loopback
 * bound to the requested interface's idev.  The returned score values are
 * elided in this listing.
 */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * Neighbour-reachability component of router scoring: non-gateway and
 * RTF_NONEXTHOP routes need no neighbour; otherwise score by NUD state
 * (the individual return values are elided in this listing).
 */
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
435 neigh = dst_get_neighbour(&rt->dst);
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
449 read_unlock_bh(&neigh->lock)
/*
 * Combine device match, router-preference bits and neighbour reachability
 * into a single comparable score; returns a negative sentinel when a
 * strict requirement (IFACE/REACHABLE) is not met (those return lines are
 * elided in this listing).
 */
456 static int rt6_score_route(struct rt6_info *rt, int oif,
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * Track the best-scoring, non-expired route seen so far; triggers
 * rt6_probe() on REACHABLE-strict lookups (probe call elided here).
 */
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
478 if (rt6_check_expired(rt))
481 m = rt6_score_route(rt, oif, strict);
486 if (strict & RT6_LOOKUP_F_REACHABLE)
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * Scan the round-robin window of a fib6 node: first from rr_head to the
 * end of the equal-metric run, then wrap from fn->leaf back up to rr_head,
 * keeping the best match found by find_match().
 */
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
/*
 * Default router selection for a fib6 node: score the equal-metric run
 * starting at the node's round-robin pointer; when nothing matched under
 * REACHABLE-strict rules, advance rr_ptr (round-robin) so the next lookup
 * tries a different router.  Falls back to the null entry when no route
 * matches.  Several lines (rr_ptr advance, trace argument) are elided.
 */
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
542 RT6_TRACE("%s() => %p\n",
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
549 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option from a Router Advertisement
 * (RFC 4191): validate lengths, decode preference and lifetime, then
 * add/refresh or delete the corresponding route via the rt6_*_route_info
 * helpers.  Error-return lines inside the sanity checks are elided in
 * this listing.
 */
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
557 unsigned long lifetime;
560 if (len < sizeof(struct route_info)) {
564 /* Sanity check for prefix_len and length */
565 if (rinfo->length > 3) {
567 } else if (rinfo->prefix_len > 128) {
569 } else if (rinfo->prefix_len > 64) {
570 if (rinfo->length < 2) {
573 } else if (rinfo->prefix_len > 0) {
574 if (rinfo->length < 1) {
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means a full 16-byte prefix is present in the option. */
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
592 prefix = &prefix_buf;
/* prefix_len 0 designates a default route advertisement. */
595 if (rinfo->prefix_len == 0)
596 rt = rt6_get_dflt_router(gwaddr, dev);
598 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
599 gwaddr, dev->ifindex);
/* Zero lifetime withdraws the route (deletion call elided here). */
601 if (rt && !lifetime) {
607 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
610 rt->rt6i_flags = RTF_ROUTEINFO |
611 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
614 if (!addrconf_finite_timeout(lifetime)) {
615 rt->rt6i_flags &= ~RTF_EXPIRES;
617 rt->rt6i_expires = jiffies + HZ * lifetime;
618 rt->rt6i_flags |= RTF_EXPIRES;
620 dst_release(&rt->dst);
/*
 * Fib-tree backtracking used by the lookup functions: when the current
 * result is the null entry, walk up parent nodes (consulting source-routed
 * subtrees where configured) until a node carrying route info is found.
 * NOTE(review): this is a multi-line macro with backslash continuations;
 * several of its lines (goto targets, parent assignment) are elided in
 * this listing, so no comments are inserted inside the macro body.
 */
626 #define BACKTRACK(__net, saddr) \
628 if (rt == __net->ipv6.ip6_null_entry) { \
629 struct fib6_node *pn; \
631 if (fn->fn_flags & RTN_TL_ROOT) \
634 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
635 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
638 if (fn->fn_flags & RTN_RTINFO) \
/*
 * Simple (non-caching) policy lookup: find the fib6 node for the flow,
 * refine with rt6_device_match(), backtrack on the null entry, then take
 * a dst reference under the table read lock.
 */
644 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
645 struct fib6_table *table,
646 struct flowi6 *fl6, int flags)
648 struct fib6_node *fn;
651 read_lock_bh(&table->tb6_lock);
652 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
655 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
656 BACKTRACK(net, &fl6->saddr);
658 dst_use(&rt->dst, jiffies);
659 read_unlock_bh(&table->tb6_lock);
/*
 * Public lookup entry point: build a flowi6 from daddr/saddr/oif and
 * dispatch through the policy-routing rules to ip6_pol_route_lookup().
 * 'strict' forces an exact interface match.  The NULL-saddr branch around
 * the memcpy is elided in this listing.
 */
664 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
665 const struct in6_addr *saddr, int oif, int strict)
667 struct flowi6 fl6 = {
671 struct dst_entry *dst;
672 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
675 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
676 flags |= RT6_LOOKUP_F_HAS_SADDR;
679 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
681 return (struct rt6_info *) dst;
688 EXPORT_SYMBOL(rt6_lookup);
690 /* ip6_ins_rt is called with FREE table->tb6_lock.
691 It takes new route entry, the addition fails by any reason the
692 route is freed. In any case, if caller does not hold it, it may
/* Insert a route into its fib6 table under the table write lock. */
696 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
699 struct fib6_table *table;
701 table = rt->rt6i_table;
702 write_lock_bh(&table->tb6_lock);
703 err = fib6_add(&table->tb6_root, rt, info);
704 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper supplying netlink info for the route's namespace. */
709 int ip6_ins_rt(struct rt6_info *rt)
711 struct nl_info info = {
712 .nl_net = dev_net(rt->rt6i_dev),
714 return __ip6_ins_rt(rt, &info);
/*
 * Clone a route into a per-destination RTF_CACHE entry with a bound
 * neighbour.  On neighbour-table overflow, temporarily relax the GC
 * sysctls and force a dst GC pass, retrying once when called outside
 * softirq context.  Several lines (retry goto, error path, return) are
 * elided in this listing.
 */
717 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
718 const struct in6_addr *daddr,
719 const struct in6_addr *saddr)
727 rt = ip6_rt_copy(ort, daddr);
730 struct neighbour *neigh;
731 int attempts = !in_softirq();
733 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
734 if (ort->rt6i_dst.plen != 128 &&
735 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
736 rt->rt6i_flags |= RTF_ANYCAST;
737 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
740 rt->rt6i_flags |= RTF_CACHE;
742 #ifdef CONFIG_IPV6_SUBTREES
743 if (rt->rt6i_src.plen && saddr) {
744 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
745 rt->rt6i_src.plen = 128;
750 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
752 struct net *net = dev_net(rt->rt6i_dev);
753 int saved_rt_min_interval =
754 net->ipv6.sysctl.ip6_rt_gc_min_interval;
755 int saved_rt_elasticity =
756 net->ipv6.sysctl.ip6_rt_gc_elasticity;
/* Force an aggressive GC pass, then restore the saved sysctl values. */
758 if (attempts-- > 0) {
759 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
760 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
762 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
764 net->ipv6.sysctl.ip6_rt_gc_elasticity =
766 net->ipv6.sysctl.ip6_rt_gc_min_interval =
767 saved_rt_min_interval;
773 "ipv6: Neighbour table overflow.\n");
777 dst_set_neighbour(&rt->dst, neigh);
/*
 * Lighter-weight clone than rt6_alloc_cow(): copy the route, mark it
 * RTF_CACHE and share (clone) the original's neighbour entry instead of
 * performing a fresh neighbour lookup.
 */
784 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
785 const struct in6_addr *daddr)
787 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
790 rt->rt6i_flags |= RTF_CACHE;
791 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
/*
 * Core policy-routing lookup shared by the input and output paths.
 * Looks up the fib6 node, selects a router, and — unless the result is
 * already a cache entry or the null route — drops the table lock to clone
 * a per-destination cache entry (cow when a neighbour binding is needed,
 * plain clone for non-host routes) and insert it, relooking up on races.
 * Many control-flow lines (labels, gotos, retry logic) are elided in this
 * listing; consult the full source before reasoning about lock coverage.
 */
796 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
797 struct flowi6 *fl6, int flags, bool input)
799 struct fib6_node *fn;
800 struct rt6_info *rt, *nrt;
804 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805 int local = RTF_NONEXTHOP;
807 strict |= flags & RT6_LOOKUP_F_IFACE;
812 read_lock_bh(&table->tb6_lock);
815 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
818 rt = rt6_select(fn, oif, strict | reachable);
820 BACKTRACK(net, &fl6->saddr);
821 if (rt == net->ipv6.ip6_null_entry ||
822 rt->rt6i_flags & RTF_CACHE)
826 read_unlock_bh(&table->tb6_lock);
828 if (!dst_get_neighbour_raw(&rt->dst)
829 && !(rt->rt6i_flags & local))
830 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
831 else if (!(rt->dst.flags & DST_HOST))
832 nrt = rt6_alloc_clone(rt, &fl6->daddr);
836 dst_release(&rt->dst);
837 rt = nrt ? : net->ipv6.ip6_null_entry;
841 err = ip6_ins_rt(nrt);
850 * Race condition! In the gap, when table->tb6_lock was
851 * released someone could insert this route. Relookup.
853 dst_release(&rt->dst);
862 read_unlock_bh(&table->tb6_lock);
864 rt->dst.lastuse = jiffies;
/* Input-path adapter: route on the incoming interface (flowi6_iif). */
870 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
871 struct flowi6 *fl6, int flags)
873 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
/*
 * Route an incoming skb: build a flowi6 from the IPv6 header (addresses,
 * flow label, mark, next header), enforce strict interface matching for
 * scoped destinations (except on PIM register devices), and attach the
 * resulting dst to the skb.
 */
876 void ip6_route_input(struct sk_buff *skb)
878 const struct ipv6hdr *iph = ipv6_hdr(skb);
879 struct net *net = dev_net(skb->dev);
880 int flags = RT6_LOOKUP_F_HAS_SADDR;
881 struct flowi6 fl6 = {
882 .flowi6_iif = skb->dev->ifindex,
885 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
886 .flowi6_mark = skb->mark,
887 .flowi6_proto = iph->nexthdr,
890 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
891 flags |= RT6_LOOKUP_F_IFACE;
893 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
/* Output-path adapter: route on the outgoing interface (flowi6_oif). */
896 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
897 struct flowi6 *fl6, int flags)
899 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
/*
 * Public output-route lookup: strict interface matching when the socket
 * is bound to a device or the destination is scoped; folds the socket's
 * source-address preferences into the lookup flags.
 */
902 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
907 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
908 flags |= RT6_LOOKUP_F_IFACE;
910 if (!ipv6_addr_any(&fl6->saddr))
911 flags |= RT6_LOOKUP_F_HAS_SADDR;
913 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
915 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
918 EXPORT_SYMBOL(ip6_route_output);
/*
 * Create a blackhole copy of an existing route (used e.g. by xfrm): the
 * clone keeps the original's addresses, metrics and idev but discards all
 * traffic via dst_discard.  Always releases dst_orig; returns the clone
 * or ERR_PTR(-ENOMEM).  Some lines (allocation NULL-check, refcount init)
 * are elided in this listing.
 */
920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
922 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
923 struct dst_entry *new = NULL;
925 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
927 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
932 new->input = dst_discard;
933 new->output = dst_discard;
/* Read-only metrics can be shared by pointer; otherwise deep-copy. */
935 if (dst_metrics_read_only(&ort->dst))
936 new->_metrics = ort->dst._metrics;
938 dst_copy_metrics(new, &ort->dst);
939 rt->rt6i_idev = ort->rt6i_idev;
941 in6_dev_hold(rt->rt6i_idev);
942 rt->rt6i_expires = 0;
944 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
945 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
948 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
949 #ifdef CONFIG_IPV6_SUBTREES
950 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
956 dst_release(dst_orig);
957 return new ? new : ERR_PTR(-ENOMEM);
961 * Destination cache support functions
/*
 * dst_ops->check: a cached dst is still valid while its fib6 node's serial
 * number matches the cookie; refresh a stale peer binding when the peer
 * generation counter has moved on.  The return statements are elided in
 * this listing.
 */
964 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
968 rt = (struct rt6_info *) dst;
970 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
971 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
973 rt6_bind_peer(rt, 0);
974 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops->negative_advice: drop expired cache entries (the ip6_del_rt
 * call and return values are elided in this listing).
 */
981 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
983 struct rt6_info *rt = (struct rt6_info *) dst;
986 if (rt->rt6i_flags & RTF_CACHE) {
987 if (rt6_check_expired(rt)) {
/*
 * dst_ops->link_failure: report address-unreachable to the sender, then
 * expire the cached route immediately, or invalidate the fib node's serial
 * number for default routes so subsequent dst checks fail.
 */
999 static void ip6_link_failure(struct sk_buff *skb)
1001 struct rt6_info *rt;
1003 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1005 rt = (struct rt6_info *) skb_dst(skb);
1007 if (rt->rt6i_flags&RTF_CACHE) {
1008 dst_set_expires(&rt->dst, 0);
1009 rt->rt6i_flags |= RTF_EXPIRES;
1010 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1011 rt->rt6i_node->fn_sernum = -1;
/*
 * dst_ops->update_pmtu: lower the cached MTU for host routes; values
 * below IPV6_MIN_MTU appear to be clamped (the clamping assignment is
 * elided in this listing — confirm against full source).
 */
1015 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1017 struct rt6_info *rt6 = (struct rt6_info*)dst;
1019 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1020 rt6->rt6i_flags |= RTF_MODIFIED;
1021 if (mtu < IPV6_MIN_MTU)
1024 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * Default advertised MSS: path MTU minus IPv6+TCP headers, floored at the
 * ip6_rt_min_advmss sysctl and capped per the IPV6_MAXPLEN rule below.
 * The final return lines are elided in this listing.
 */
1028 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1030 struct net_device *dev = dst->dev;
1031 unsigned int mtu = dst_mtu(dst);
1032 struct net *net = dev_net(dev);
1034 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1036 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1037 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1040 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1041 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1042 * IPV6_MAXPLEN is also valid and means: "any MSS,
1043 * rely only on pmtu discovery"
1045 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * dst_ops->mtu: prefer the explicit RTAX_MTU metric, else the interface's
 * mtu6, capped at IP6_MAX_MTU.  RCU locking lines are elided here.
 */
1050 static unsigned int ip6_mtu(const struct dst_entry *dst)
1052 struct inet6_dev *idev;
1053 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1061 idev = __in6_dev_get(dst->dev);
1063 mtu = idev->cnf.mtu6;
1067 return min_t(unsigned int, mtu, IP6_MAX_MTU);
/* Singly-linked list of ICMPv6 dsts awaiting GC, guarded by its spinlock. */
1070 static struct dst_entry *icmp6_dst_gc_list;
1071 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * Allocate an un-tabled host dst for sending an ICMPv6 packet to 'addr':
 * binds a neighbour (looked up when none is supplied), pins hoplimit
 * metric to 0, chains the dst onto the icmp6 GC list and kicks fib6 GC.
 * Error paths and the final return are elided in this listing.
 */
1073 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1074 struct neighbour *neigh,
1075 const struct in6_addr *addr)
1077 struct rt6_info *rt;
1078 struct inet6_dev *idev = in6_dev_get(dev);
1079 struct net *net = dev_net(dev);
1081 if (unlikely(idev == NULL))
1084 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1085 if (unlikely(rt == NULL)) {
1093 neigh = ndisc_get_neigh(dev, addr);
1098 rt->dst.flags |= DST_HOST;
1099 rt->dst.output = ip6_output;
1100 dst_set_neighbour(&rt->dst, neigh);
1101 atomic_set(&rt->dst.__refcnt, 1);
1102 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1103 rt->rt6i_dst.plen = 128;
1104 rt->rt6i_idev = idev;
1105 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1107 spin_lock_bh(&icmp6_dst_lock);
1108 rt->dst.next = icmp6_dst_gc_list;
1109 icmp6_dst_gc_list = &rt->dst;
1110 spin_unlock_bh(&icmp6_dst_lock);
1112 fib6_force_start_gc(net);
/*
 * Garbage-collect the icmp6 dst list: free entries whose refcount has
 * dropped to zero (the unlink/free lines are elided in this listing).
 */
1118 int icmp6_dst_gc(void)
1120 struct dst_entry *dst, **pprev;
1123 spin_lock_bh(&icmp6_dst_lock);
1124 pprev = &icmp6_dst_gc_list;
1126 while ((dst = *pprev) != NULL) {
1127 if (!atomic_read(&dst->__refcnt)) {
1136 spin_unlock_bh(&icmp6_dst_lock);
/*
 * Apply a predicate to every route on the icmp6 dst list, removing the
 * matches (the unlink action inside the if is elided in this listing).
 */
1141 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1144 struct dst_entry *dst, **pprev;
1146 spin_lock_bh(&icmp6_dst_lock);
1147 pprev = &icmp6_dst_gc_list;
1148 while ((dst = *pprev) != NULL) {
1149 struct rt6_info *rt = (struct rt6_info *) dst;
1150 if (func(rt, arg)) {
1157 spin_unlock_bh(&icmp6_dst_lock);
/*
 * dst_ops->gc: skip collection when inside the min interval and under the
 * size cap; otherwise run fib6 GC with an adaptively growing/decaying
 * expire value.  Returns nonzero (collection failed) while the table is
 * still over rt_max_size.
 */
1160 static int ip6_dst_gc(struct dst_ops *ops)
1162 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1163 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1164 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1165 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1166 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1167 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1170 entries = dst_entries_get_fast(ops);
1171 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1172 entries <= rt_max_size)
1175 net->ipv6.ip6_rt_gc_expire++;
1176 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size);
1177 entries = dst_entries_get_slow(ops);
1178 if (entries < ops->gc_thresh)
1179 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1181 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1182 return entries > rt_max_size;
1185 /* Clean host part of a prefix. Not necessary in radix tree,
1186 but results in cleaner routing tables.
1188 Remove it only when all the things will work!
/*
 * Effective hop limit for a dst: the RTAX_HOPLIMIT metric when set,
 * otherwise the interface's hop_limit, otherwise the namespace default.
 * RCU locking lines around the idev access are elided in this listing.
 */
1191 int ip6_dst_hoplimit(struct dst_entry *dst)
1193 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1194 if (hoplimit == 0) {
1195 struct net_device *dev = dst->dev;
1196 struct inet6_dev *idev;
1199 idev = __in6_dev_get(dev);
1201 hoplimit = idev->cnf.hop_limit;
1203 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1208 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * Build and insert a route from a decoded fib6_config (netlink/ioctl):
 * validates prefix lengths, resolves the device and table, allocates the
 * rt6_info, wires input/output handlers by address type, copies metrics,
 * resolves/validates the gateway, binds a neighbour for next-hop routes,
 * and finally inserts via __ip6_ins_rt().  Many error-handling lines
 * (gotos, 'out' label, cleanup releases) are elided in this listing —
 * consult the full source for the failure paths.
 */
1214 int ip6_route_add(struct fib6_config *cfg)
1217 struct net *net = cfg->fc_nlinfo.nl_net;
1218 struct rt6_info *rt = NULL;
1219 struct net_device *dev = NULL;
1220 struct inet6_dev *idev = NULL;
1221 struct fib6_table *table;
1224 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1226 #ifndef CONFIG_IPV6_SUBTREES
1227 if (cfg->fc_src_len)
1230 if (cfg->fc_ifindex) {
1232 dev = dev_get_by_index(net, cfg->fc_ifindex);
1235 idev = in6_dev_get(dev);
1240 if (cfg->fc_metric == 0)
1241 cfg->fc_metric = IP6_RT_PRIO_USER;
1243 table = fib6_new_table(net, cfg->fc_table);
1244 if (table == NULL) {
1249 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1256 rt->dst.obsolete = -1;
1257 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1258 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1261 if (cfg->fc_protocol == RTPROT_UNSPEC)
1262 cfg->fc_protocol = RTPROT_BOOT;
1263 rt->rt6i_protocol = cfg->fc_protocol;
1265 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Input handler depends on the destination class; output is uniform. */
1267 if (addr_type & IPV6_ADDR_MULTICAST)
1268 rt->dst.input = ip6_mc_input;
1269 else if (cfg->fc_flags & RTF_LOCAL)
1270 rt->dst.input = ip6_input;
1272 rt->dst.input = ip6_forward;
1274 rt->dst.output = ip6_output;
1276 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1277 rt->rt6i_dst.plen = cfg->fc_dst_len;
1278 if (rt->rt6i_dst.plen == 128)
1279 rt->dst.flags |= DST_HOST;
/* Non-host routes with explicit metrics get their own metrics array. */
1281 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1282 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1287 dst_init_metrics(&rt->dst, metrics, 0);
1289 #ifdef CONFIG_IPV6_SUBTREES
1290 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1291 rt->rt6i_src.plen = cfg->fc_src_len;
1294 rt->rt6i_metric = cfg->fc_metric;
1296 /* We cannot add true routes via loopback here,
1297 they would result in kernel looping; promote them to reject routes
1299 if ((cfg->fc_flags & RTF_REJECT) ||
1300 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1301 && !(cfg->fc_flags&RTF_LOCAL))) {
1302 /* hold loopback dev/idev if we haven't done so. */
1303 if (dev != net->loopback_dev) {
1308 dev = net->loopback_dev;
1310 idev = in6_dev_get(dev);
1316 rt->dst.output = ip6_pkt_discard_out;
1317 rt->dst.input = ip6_pkt_discard;
1318 rt->dst.error = -ENETUNREACH;
1319 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1323 if (cfg->fc_flags & RTF_GATEWAY) {
1324 const struct in6_addr *gw_addr;
1327 gw_addr = &cfg->fc_gateway;
1328 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1329 gwa_type = ipv6_addr_type(gw_addr);
1331 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1332 struct rt6_info *grt;
1334 /* IPv6 strictly inhibits using not link-local
1335 addresses as nexthop address.
1336 Otherwise, router will not able to send redirects.
1337 It is very good, but in some (rare!) circumstances
1338 (SIT, PtP, NBMA NOARP links) it is handy to allow
1339 some exceptions. --ANK
1342 if (!(gwa_type&IPV6_ADDR_UNICAST))
1345 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1347 err = -EHOSTUNREACH;
1351 if (dev != grt->rt6i_dev) {
1352 dst_release(&grt->dst);
1356 dev = grt->rt6i_dev;
1357 idev = grt->rt6i_idev;
1359 in6_dev_hold(grt->rt6i_idev);
1361 if (!(grt->rt6i_flags&RTF_GATEWAY))
1363 dst_release(&grt->dst);
1369 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1377 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1378 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1382 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1383 rt->rt6i_prefsrc.plen = 128;
1385 rt->rt6i_prefsrc.plen = 0;
1387 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1388 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1393 dst_set_neighbour(&rt->dst, n);
1396 rt->rt6i_flags = cfg->fc_flags;
/* Apply user-supplied RTAX metrics from the netlink attribute blob. */
1403 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1404 int type = nla_type(nla);
1407 if (type > RTAX_MAX) {
1412 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1418 rt->rt6i_idev = idev;
1419 rt->rt6i_table = table;
1421 cfg->fc_nlinfo.nl_net = dev_net(dev);
1423 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * Remove a route from its table under the write lock; refuses to delete
 * the namespace null entry (that error path's body is elided here).
 * Always drops the caller's dst reference before returning.
 */
1435 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1438 struct fib6_table *table;
1439 struct net *net = dev_net(rt->rt6i_dev);
1441 if (rt == net->ipv6.ip6_null_entry) {
1446 table = rt->rt6i_table;
1447 write_lock_bh(&table->tb6_lock);
1448 err = fib6_del(rt, info);
1449 write_unlock_bh(&table->tb6_lock);
1452 dst_release(&rt->dst);
/* Convenience wrapper supplying netlink info for the route's namespace. */
1456 int ip6_del_rt(struct rt6_info *rt)
1458 struct nl_info info = {
1459 .nl_net = dev_net(rt->rt6i_dev),
1461 return __ip6_del_rt(rt, &info);
/*
 * Delete the route matching a fib6_config: locate the exact dst/src
 * prefix node, then scan its leaf chain filtering by ifindex, gateway and
 * metric; the first match is deleted (after taking a dst reference and
 * dropping the read lock — the dst_hold line is elided in this listing).
 */
1464 static int ip6_route_del(struct fib6_config *cfg)
1466 struct fib6_table *table;
1467 struct fib6_node *fn;
1468 struct rt6_info *rt;
1471 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1475 read_lock_bh(&table->tb6_lock);
1477 fn = fib6_locate(&table->tb6_root,
1478 &cfg->fc_dst, cfg->fc_dst_len,
1479 &cfg->fc_src, cfg->fc_src_len);
1482 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1483 if (cfg->fc_ifindex &&
1484 (rt->rt6i_dev == NULL ||
1485 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1487 if (cfg->fc_flags & RTF_GATEWAY &&
1488 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1490 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1493 read_unlock_bh(&table->tb6_lock);
1495 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1498 read_unlock_bh(&table->tb6_lock);
/* Flow key extended with the redirecting router's address. */
1506 struct ip6rd_flowi {
1508 struct in6_addr gateway;
/*
 * Validate an ICMPv6 redirect: find the current route toward the target
 * and accept the redirect only if it came from that route's next hop
 * (matching device and gateway).  Continue/accept lines inside the loop
 * and the final dst_hold are elided in this listing.
 */
1511 static struct rt6_info *__ip6_route_redirect(struct net *net,
1512 struct fib6_table *table,
1516 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1517 struct rt6_info *rt;
1518 struct fib6_node *fn;
1521 * Get the "current" route for this destination and
1522 * check if the redirect has come from approriate router.
1524 * RFC 2461 specifies that redirects should only be
1525 * accepted if they come from the nexthop to the target.
1526 * Due to the way the routes are chosen, this notion
1527 * is a bit fuzzy and one might need to check all possible
1531 read_lock_bh(&table->tb6_lock);
1532 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1534 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1536 * Current route is on-link; redirect is always invalid.
1538 * Seems, previous statement is not true. It could
1539 * be node, which looks for us as on-link (f.e. proxy ndisc)
1540 * But then router serving it might decide, that we should
1541 * know truth 8)8) --ANK (980726).
1543 if (rt6_check_expired(rt))
1545 if (!(rt->rt6i_flags & RTF_GATEWAY))
1547 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1549 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1555 rt = net->ipv6.ip6_null_entry;
1556 BACKTRACK(net, &fl6->saddr);
1560 read_unlock_bh(&table->tb6_lock);
/*
 * Build the extended redirect flow (dest/src plus the announcing gateway)
 * and dispatch through the policy rules to __ip6_route_redirect().
 * The flowi6 daddr/saddr initializers are elided in this listing.
 */
1565 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1566 const struct in6_addr *src,
1567 const struct in6_addr *gateway,
1568 struct net_device *dev)
1570 int flags = RT6_LOOKUP_F_HAS_SADDR;
1571 struct net *net = dev_net(dev);
1572 struct ip6rd_flowi rdfl = {
1574 .flowi6_oif = dev->ifindex,
1580 ipv6_addr_copy(&rdfl.gateway, gateway);
1582 if (rt6_need_strict(dest))
1583 flags |= RT6_LOOKUP_F_IFACE;
1585 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1586 flags, __ip6_route_redirect);
/* Act on a validated ICMPv6 redirect: locate the affected route, update
 * the neighbour entry for the new first hop, clone the route as a
 * RTF_CACHE/RTF_DYNAMIC entry pointing at the redirect target, insert
 * it, notify netevent listeners, and drop the old cached route if any.
 * @on_link: nonzero when the target is directly reachable (clears
 *           RTF_GATEWAY on the clone and the ISROUTER neigh flags). */
1589 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1590 const struct in6_addr *saddr,
1591 struct neighbour *neigh, u8 *lladdr, int on_link)
1593 struct rt6_info *rt, *nrt = NULL;
1594 struct netevent_redirect netevent;
1595 struct net *net = dev_net(neigh->dev);
1597 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1599 if (rt == net->ipv6.ip6_null_entry) {
1600 if (net_ratelimit())
1601 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1602 "for redirect target\n");
1607 * We have finally decided to accept it.
1610 neigh_update(neigh, lladdr, NUD_STALE,
1611 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1612 NEIGH_UPDATE_F_OVERRIDE|
1613 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1614 NEIGH_UPDATE_F_ISROUTER))
1618 * Redirect received -> path was valid.
1619 * Look, redirects are sent only in response to data packets,
1620 * so that this nexthop apparently is reachable. --ANK
1622 dst_confirm(&rt->dst);
1624 /* Duplicate redirect: silently ignore. */
1625 if (neigh == dst_get_neighbour_raw(&rt->dst))
1628 nrt = ip6_rt_copy(rt, dest);
1632 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1634 nrt->rt6i_flags &= ~RTF_GATEWAY;
1636 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1637 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1639 if (ip6_ins_rt(nrt))
1641 /* Tell interested parties (e.g. offload drivers) the path changed. */
1642 netevent.old = &rt->dst;
1643 netevent.new = &nrt->dst;
1644 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1646 if (rt->rt6i_flags&RTF_CACHE) {
1652 dst_release(&rt->dst);
1656 * Handle ICMP "packet too big" messages
1657 * i.e. Path MTU discovery
/* Core of IPv6 Path MTU discovery (RFC 1981): apply a Packet Too Big
 * report to the route for (daddr, saddr) restricted to @ifindex.
 * Clamps pmtu to IPV6_MIN_MTU (setting ALLFRAG), updates a cached host
 * route in place, or clones a new cache entry (COW for connected routes,
 * plain clone for gatewayed/NONEXTHOP ones) carrying the reduced MTU
 * with a 10-minute expiry so PMTU increases can be rediscovered. */
1660 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1661 struct net *net, u32 pmtu, int ifindex)
1663 struct rt6_info *rt, *nrt;
1666 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1670 if (rt6_check_expired(rt)) {
1675 if (pmtu >= dst_mtu(&rt->dst))
1678 if (pmtu < IPV6_MIN_MTU) {
1680 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1681 * MTU (1280) and a fragment header should always be included
1682 * after a node receiving Too Big message reporting PMTU is
1683 * less than the IPv6 Minimum Link MTU.
1685 pmtu = IPV6_MIN_MTU;
1689 /* New mtu received -> path was valid.
1690 They are sent only in response to data packets,
1691 so that this nexthop apparently is reachable. --ANK
1693 dst_confirm(&rt->dst);
1695 /* Host route. If it is static, it would be better
1696 not to override it, but add new one, so that
1697 when cache entry will expire old pmtu
1698 would return automatically.
1700 if (rt->rt6i_flags & RTF_CACHE) {
1701 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1703 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1704 features |= RTAX_FEATURE_ALLFRAG;
1705 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1707 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1708 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1713 Two cases are possible:
1714 1. It is connected route. Action: COW
1715 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1717 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1718 nrt = rt6_alloc_cow(rt, daddr, saddr);
1720 nrt = rt6_alloc_clone(rt, daddr);
1723 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1725 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1726 features |= RTAX_FEATURE_ALLFRAG;
1727 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1730 /* According to RFC 1981, detecting PMTU increase shouldn't be
1731 * happened within 5 mins, the recommended timer is 10 mins.
1732 * Here this route expiration time is set to ip6_rt_mtu_expires
1733 * which is 10 mins. After 10 mins the decreased pmtu is expired
1734 * and detecting PMTU increase will be automatically happened.
1736 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1737 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1742 dst_release(&rt->dst);
/* Public PMTU entry point: apply the Packet Too Big update both without
 * an interface restriction (future traffic on whatever oif is chosen)
 * and on the interface that received the ICMP, covering packets forced
 * out via SO_BINDTODEVICE or similar. */
1745 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1746 struct net_device *dev, u32 pmtu)
1748 struct net *net = dev_net(dev);
1751 * RFC 1981 states that a node "MUST reduce the size of the packets it
1752 * is sending along the path" that caused the Packet Too Big message.
1753 * Since it's not possible in the general case to determine which
1754 * interface was used to send the original packet, we update the MTU
1755 * on the interface that will be used to send future packets. We also
1756 * update the MTU on the interface that received the Packet Too Big in
1757 * case the original packet was forced out that interface with
1758 * SO_BINDTODEVICE or similar. This is the next best thing to the
1759 * correct behaviour, which would be to update the MTU on all
1762 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1763 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1767 * Misc support functions
/* Allocate a new rt6_info as a host-route (/128) copy of @ort with the
 * destination replaced by @dest. Copies dst ops, metrics, idev (with a
 * hold), gateway, flags (minus RTF_EXPIRES), prefsrc and table pointer;
 * metric is reset to 0. Returns NULL on allocation failure (the NULL
 * check on rt is elided in this sampled view — confirm in full file). */
1770 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1771 const struct in6_addr *dest)
1773 struct net *net = dev_net(ort->rt6i_dev);
1774 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1778 rt->dst.input = ort->dst.input;
1779 rt->dst.output = ort->dst.output;
1780 rt->dst.flags |= DST_HOST;
1782 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1783 rt->rt6i_dst.plen = 128;
1784 dst_copy_metrics(&rt->dst, &ort->dst);
1785 rt->dst.error = ort->dst.error;
1786 rt->rt6i_idev = ort->rt6i_idev;
1788 in6_dev_hold(rt->rt6i_idev);
1789 rt->dst.lastuse = jiffies;
1790 rt->rt6i_expires = 0;
1792 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1793 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1794 rt->rt6i_metric = 0;
1796 #ifdef CONFIG_IPV6_SUBTREES
1797 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1799 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1800 rt->rt6i_table = ort->rt6i_table;
1805 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an RA-learned (RTF_ROUTEINFO) route for @prefix/@prefixlen via
 * @gwaddr on interface @ifindex in RT6_TABLE_INFO. Returns the matching
 * entry or NULL; reference handling on the hit is elided in this view. */
1806 static struct rt6_info *rt6_get_route_info(struct net *net,
1807 const struct in6_addr *prefix, int prefixlen,
1808 const struct in6_addr *gwaddr, int ifindex)
1810 struct fib6_node *fn;
1811 struct rt6_info *rt = NULL;
1812 struct fib6_table *table;
1814 table = fib6_get_table(net, RT6_TABLE_INFO);
1818 write_lock_bh(&table->tb6_lock);
1819 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1823 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1824 if (rt->rt6i_dev->ifindex != ifindex)
1826 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1828 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1834 write_unlock_bh(&table->tb6_lock);
/* Install a route learned from an RA Route Information option into
 * RT6_TABLE_INFO and return the freshly-looked-up entry. A zero prefix
 * length is treated as a default route. */
1838 static struct rt6_info *rt6_add_route_info(struct net *net,
1839 const struct in6_addr *prefix, int prefixlen,
1840 const struct in6_addr *gwaddr, int ifindex,
1843 struct fib6_config cfg = {
1844 .fc_table = RT6_TABLE_INFO,
1845 .fc_metric = IP6_RT_PRIO_USER,
1846 .fc_ifindex = ifindex,
1847 .fc_dst_len = prefixlen,
1848 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1849 RTF_UP | RTF_PREF(pref),
1851 .fc_nlinfo.nlh = NULL,
1852 .fc_nlinfo.nl_net = net,
1855 ipv6_addr_copy(&cfg.fc_dst, prefix);
1856 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1858 /* We should treat it as a default route if prefix length is 0. */
1860 cfg.fc_flags |= RTF_DEFAULT;
1862 ip6_route_add(&cfg);
1864 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/* Look up the RA-installed default route via router @addr on @dev in
 * RT6_TABLE_DFLT (must match both RTF_ADDRCONF and RTF_DEFAULT).
 * Returns the entry or NULL; ref-hold on the hit is elided here. */
1868 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1870 struct rt6_info *rt;
1871 struct fib6_table *table;
1873 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1877 write_lock_bh(&table->tb6_lock);
1878 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1879 if (dev == rt->rt6i_dev &&
1880 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1881 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1886 write_unlock_bh(&table->tb6_lock);
/* Install a default router learned from a Router Advertisement (expiring
 * route, preference from the RA) and return the resulting entry. */
1890 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1891 struct net_device *dev,
1894 struct fib6_config cfg = {
1895 .fc_table = RT6_TABLE_DFLT,
1896 .fc_metric = IP6_RT_PRIO_USER,
1897 .fc_ifindex = dev->ifindex,
1898 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1899 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1901 .fc_nlinfo.nlh = NULL,
1902 .fc_nlinfo.nl_net = dev_net(dev),
1905 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1907 ip6_route_add(&cfg);
1909 return rt6_get_dflt_router(gwaddr, dev);
/* Delete all autoconfigured default routes, except on interfaces where
 * accept_ra == 2 (administratively always accept RAs). The lock is
 * dropped before each deletion; the restart label that re-scans the
 * list afterwards is elided in this sampled view. */
1912 void rt6_purge_dflt_routers(struct net *net)
1914 struct rt6_info *rt;
1915 struct fib6_table *table;
1917 /* NOTE: Keep consistent with rt6_get_dflt_router */
1918 table = fib6_get_table(net, RT6_TABLE_DFLT);
1923 read_lock_bh(&table->tb6_lock);
1924 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1925 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1926 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1928 read_unlock_bh(&table->tb6_lock);
1933 read_unlock_bh(&table->tb6_lock);
/* Translate a legacy ioctl in6_rtmsg into the common fib6_config used
 * by ip6_route_add()/ip6_route_del(). Always targets the main table. */
1936 static void rtmsg_to_fib6_config(struct net *net,
1937 struct in6_rtmsg *rtmsg,
1938 struct fib6_config *cfg)
1940 memset(cfg, 0, sizeof(*cfg));
1942 cfg->fc_table = RT6_TABLE_MAIN;
1943 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1944 cfg->fc_metric = rtmsg->rtmsg_metric;
1945 cfg->fc_expires = rtmsg->rtmsg_info;
1946 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1947 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1948 cfg->fc_flags = rtmsg->rtmsg_flags;
1950 cfg->fc_nlinfo.nl_net = net;
1952 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1953 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1954 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/* Legacy SIOCADDRT/SIOCDELRT ioctl handler. Requires CAP_NET_ADMIN,
 * copies the in6_rtmsg from userspace, converts it to a fib6_config and
 * dispatches to add/del. Error paths and rtnl locking around the
 * dispatch are elided in this sampled view. Returns 0 or -errno. */
1957 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1959 struct fib6_config cfg;
1960 struct in6_rtmsg rtmsg;
1964 case SIOCADDRT: /* Add a route */
1965 case SIOCDELRT: /* Delete a route */
1966 if (!capable(CAP_NET_ADMIN))
1968 err = copy_from_user(&rtmsg, arg,
1969 sizeof(struct in6_rtmsg));
1973 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1978 err = ip6_route_add(&cfg);
1981 err = ip6_route_del(&cfg);
1995 * Drop the packet on the floor
/* Shared drop path for the null/prohibit dst entries: bump the proper
 * SNMP counter (INADDRERRORS for an unspecified destination on input,
 * otherwise the supplied no-route counter) and send an ICMPv6
 * Destination Unreachable with @code before freeing the skb. */
1998 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2001 struct dst_entry *dst = skb_dst(skb);
2002 switch (ipstats_mib_noroutes) {
2003 case IPSTATS_MIB_INNOROUTES:
2004 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2005 if (type == IPV6_ADDR_ANY) {
2006 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2007 IPSTATS_MIB_INADDRERRORS);
2011 case IPSTATS_MIB_OUTNOROUTES:
2012 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2013 ipstats_mib_noroutes);
2016 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for ip6_null_entry: no route -> ICMP no-route. */
2021 static int ip6_pkt_discard(struct sk_buff *skb)
2023 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler for ip6_null_entry: set skb->dev from the dst so
 * the ICMP error is attributed to the right device, then drop. */
2026 static int ip6_pkt_discard_out(struct sk_buff *skb)
2028 skb->dev = skb_dst(skb)->dev;
2029 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2032 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* dst input handler for the prohibit entry (policy routing):
 * administratively prohibited. */
2034 static int ip6_pkt_prohibit(struct sk_buff *skb)
2036 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst output handler for the prohibit entry: as above, on egress. */
2039 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2041 skb->dev = skb_dst(skb)->dev;
2042 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2048 * Allocate a dst for local (unicast / anycast) address.
/* Allocate the local-table host route for a unicast or anycast address
 * configured on @idev. The dst is bound to the loopback device, marked
 * RTF_UP|RTF_NONEXTHOP (plus RTF_ANYCAST or RTF_LOCAL depending on the
 * elided anycast flag), wired to ip6_input/ip6_output, given a neighbour
 * entry, and returned with refcount 1. Returns ERR_PTR on failure. */
2051 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2052 const struct in6_addr *addr,
2055 struct net *net = dev_net(idev->dev);
2056 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2057 net->loopback_dev, DST_NOCOUNT);
2058 struct neighbour *neigh;
2061 return ERR_PTR(-ENOMEM);
2065 rt->dst.flags |= DST_HOST;
2066 rt->dst.input = ip6_input;
2067 rt->dst.output = ip6_output;
2068 rt->rt6i_idev = idev;
2069 rt->dst.obsolete = -1;
2071 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2073 rt->rt6i_flags |= RTF_ANYCAST;
2075 rt->rt6i_flags |= RTF_LOCAL;
2076 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2077 if (IS_ERR(neigh)) {
2080 return ERR_CAST(neigh);
2082 dst_set_neighbour(&rt->dst, neigh);
2084 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2085 rt->rt6i_dst.plen = 128;
2086 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2088 atomic_set(&rt->dst.__refcnt, 1);
/* Choose a source address for traffic over @rt: prefer the route's
 * configured prefsrc when set, otherwise fall back to the standard
 * source-address selection on the route's device. Returns 0 or -errno
 * (the prefsrc branch's return is elided in this sampled view). */
2093 int ip6_route_get_saddr(struct net *net,
2094 struct rt6_info *rt,
2095 const struct in6_addr *daddr,
2097 struct in6_addr *saddr)
2099 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2101 if (rt->rt6i_prefsrc.plen)
2102 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2104 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2105 daddr, prefs, saddr);
2109 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc: the device and address being
 * removed (net member elided in this sampled view). */
2110 struct arg_dev_net_ip {
2111 struct net_device *dev;
2113 struct in6_addr *addr;
/* fib6_clean_all callback: clear the preferred-source marking on any
 * route (on the given device, or all devices when dev == NULL) whose
 * prefsrc equals the address being deleted. Always keeps the route. */
2116 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2118 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2119 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2120 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2122 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2123 rt != net->ipv6.ip6_null_entry &&
2124 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2125 /* remove prefsrc entry */
2126 rt->rt6i_prefsrc.plen = 0;
/* Called when address @ifp is deleted: sweep the FIB and drop it from
 * every route's preferred-source field. */
2131 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2133 struct net *net = dev_net(ifp->idev->dev);
2134 struct arg_dev_net_ip adni = {
2135 .dev = ifp->idev->dev,
2139 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Walker argument for fib6_ifdown: the device going down (net member
 * elided in this sampled view). */
2142 struct arg_dev_net {
2143 struct net_device *dev;
/* fib6_clean_all callback: select for deletion every route on the
 * departing device (or every route when dev == NULL), except the
 * permanent null entry. */
2147 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2149 const struct arg_dev_net *adn = arg;
2150 const struct net_device *dev = adn->dev;
2152 if ((rt->rt6i_dev == dev || dev == NULL) &&
2153 rt != adn->net->ipv6.ip6_null_entry) {
2154 RT6_TRACE("deleted by ifdown %p\n", rt);
/* Device teardown: purge all routes on @dev from the FIB and from the
 * ICMP rate-limit dst cache. */
2160 void rt6_ifdown(struct net *net, struct net_device *dev)
2162 struct arg_dev_net adn = {
2167 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2168 icmp6_clean_all(fib6_ifdown, &adn);
/* Walker argument for rt6_mtu_change_route: device whose MTU changed
 * (new mtu member elided in this sampled view). */
2171 struct rt6_mtu_change_arg
2173 struct net_device *dev;
/* fib6_clean_all callback for an administrative device-MTU change:
 * update RTAX_MTU on routes over that device, unless the metric is
 * locked. A decrease always applies; an increase applies only when the
 * route's current PMTU equals the device's old mtu6 (i.e. this link was
 * the path bottleneck, see RFC 1981 discussion below). */
2177 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2179 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2180 struct inet6_dev *idev;
2182 /* In IPv6 pmtu discovery is not optional,
2183 so that RTAX_MTU lock cannot disable it.
2184 We still use this lock to block changes
2185 caused by addrconf/ndisc.
2188 idev = __in6_dev_get(arg->dev);
2192 /* For administrative MTU increase, there is no way to discover
2193 IPv6 PMTU increase, so PMTU increase should be updated here.
2194 Since RFC 1981 doesn't include administrative MTU increase
2195 update PMTU increase is a MUST. (i.e. jumbo frame)
2198 If new MTU is less than route PMTU, this new MTU will be the
2199 lowest MTU in the path, update the route PMTU to reflect PMTU
2200 decreases; if new MTU is greater than route PMTU, and the
2201 old MTU is the lowest MTU in the path, update the route PMTU
2202 to reflect the increase. In this case if the other nodes' MTU
2203 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2206 if (rt->rt6i_dev == arg->dev &&
2207 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2208 (dst_mtu(&rt->dst) >= arg->mtu ||
2209 (dst_mtu(&rt->dst) < arg->mtu &&
2210 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2211 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Entry point for NETDEV MTU changes: sweep the FIB applying
 * rt6_mtu_change_route to every route in @dev's namespace. */
2216 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2218 struct rt6_mtu_change_arg arg = {
2223 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* requests. */
2226 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2227 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2228 [RTA_OIF] = { .type = NLA_U32 },
2229 [RTA_IIF] = { .type = NLA_U32 },
2230 [RTA_PRIORITY] = { .type = NLA_U32 },
2231 [RTA_METRICS] = { .type = NLA_NESTED },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into fib6_config.
 * Validates attributes against rtm_ipv6_policy, maps RTN_UNREACHABLE to
 * RTF_REJECT and RTN_LOCAL to RTF_LOCAL, copies dst/src prefixes at
 * their advertised prefix lengths, and honours RTA_TABLE over the
 * header's rtm_table. Returns 0 or -errno. */
2234 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2235 struct fib6_config *cfg)
2238 struct nlattr *tb[RTA_MAX+1];
2241 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2246 rtm = nlmsg_data(nlh);
2247 memset(cfg, 0, sizeof(*cfg));
2249 cfg->fc_table = rtm->rtm_table;
2250 cfg->fc_dst_len = rtm->rtm_dst_len;
2251 cfg->fc_src_len = rtm->rtm_src_len;
2252 cfg->fc_flags = RTF_UP;
2253 cfg->fc_protocol = rtm->rtm_protocol;
2255 if (rtm->rtm_type == RTN_UNREACHABLE)
2256 cfg->fc_flags |= RTF_REJECT;
2258 if (rtm->rtm_type == RTN_LOCAL)
2259 cfg->fc_flags |= RTF_LOCAL;
2261 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2262 cfg->fc_nlinfo.nlh = nlh;
2263 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2265 if (tb[RTA_GATEWAY]) {
2266 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2267 cfg->fc_flags |= RTF_GATEWAY;
2271 int plen = (rtm->rtm_dst_len + 7) >> 3;
2273 if (nla_len(tb[RTA_DST]) < plen)
2276 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2280 int plen = (rtm->rtm_src_len + 7) >> 3;
2282 if (nla_len(tb[RTA_SRC]) < plen)
2285 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2288 if (tb[RTA_PREFSRC])
2289 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2292 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2294 if (tb[RTA_PRIORITY])
2295 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2297 if (tb[RTA_METRICS]) {
2298 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2299 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2303 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* rtnetlink RTM_DELROUTE handler: parse then delete. */
2310 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2312 struct fib6_config cfg;
2315 err = rtm_to_fib6_config(skb, nlh, &cfg);
2319 return ip6_route_del(&cfg);
/* rtnetlink RTM_NEWROUTE handler: parse then add. */
2322 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2324 struct fib6_config cfg;
2327 err = rtm_to_fib6_config(skb, nlh, &cfg);
2331 return ip6_route_add(&cfg);
/* Worst-case netlink message size for one route, used to size the skb
 * in inet6_rt_notify(); must stay in sync with rt6_fill_node(). */
2334 static inline size_t rt6_nlmsg_size(void)
2336 return NLMSG_ALIGN(sizeof(struct rtmsg))
2337 + nla_total_size(16) /* RTA_SRC */
2338 + nla_total_size(16) /* RTA_DST */
2339 + nla_total_size(16) /* RTA_GATEWAY */
2340 + nla_total_size(16) /* RTA_PREFSRC */
2341 + nla_total_size(4) /* RTA_TABLE */
2342 + nla_total_size(4) /* RTA_IIF */
2343 + nla_total_size(4) /* RTA_OIF */
2344 + nla_total_size(4) /* RTA_PRIORITY */
2345 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2346 + nla_total_size(sizeof(struct rta_cacheinfo));
/* Serialize one rt6_info into a netlink message.
 * @dst/@src: when non-NULL (getroute replies) force /128 addresses;
 *            otherwise the route's own prefixes are emitted.
 * @iif: input interface for getroute replies (multicast routes are
 *       resolved through ip6mr_get_route instead).
 * @prefix: caller only wants RTF_PREFIX_RT routes — others are skipped
 *          successfully. @nowait is passed through to ip6mr.
 * Returns the nlmsg length, or -EMSGSIZE via nla_put_failure. */
2349 static int rt6_fill_node(struct net *net,
2350 struct sk_buff *skb, struct rt6_info *rt,
2351 struct in6_addr *dst, struct in6_addr *src,
2352 int iif, int type, u32 pid, u32 seq,
2353 int prefix, int nowait, unsigned int flags)
2356 struct nlmsghdr *nlh;
2359 struct neighbour *n;
2361 if (prefix) { /* user wants prefix routes only */
2362 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2363 /* success since this is not a prefix route */
2368 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2372 rtm = nlmsg_data(nlh);
2373 rtm->rtm_family = AF_INET6;
2374 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2375 rtm->rtm_src_len = rt->rt6i_src.plen;
2378 table = rt->rt6i_table->tb6_id;
2380 table = RT6_TABLE_UNSPEC;
2381 rtm->rtm_table = table;
2382 NLA_PUT_U32(skb, RTA_TABLE, table);
2383 if (rt->rt6i_flags&RTF_REJECT)
2384 rtm->rtm_type = RTN_UNREACHABLE;
2385 else if (rt->rt6i_flags&RTF_LOCAL)
2386 rtm->rtm_type = RTN_LOCAL;
2387 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2388 rtm->rtm_type = RTN_LOCAL;
2390 rtm->rtm_type = RTN_UNICAST;
2392 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2393 rtm->rtm_protocol = rt->rt6i_protocol;
2394 if (rt->rt6i_flags&RTF_DYNAMIC)
2395 rtm->rtm_protocol = RTPROT_REDIRECT;
2396 else if (rt->rt6i_flags & RTF_ADDRCONF)
2397 rtm->rtm_protocol = RTPROT_KERNEL;
2398 else if (rt->rt6i_flags&RTF_DEFAULT)
2399 rtm->rtm_protocol = RTPROT_RA;
2401 if (rt->rt6i_flags&RTF_CACHE)
2402 rtm->rtm_flags |= RTM_F_CLONED;
2405 NLA_PUT(skb, RTA_DST, 16, dst);
2406 rtm->rtm_dst_len = 128;
2407 } else if (rtm->rtm_dst_len)
2408 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2409 #ifdef CONFIG_IPV6_SUBTREES
2411 NLA_PUT(skb, RTA_SRC, 16, src);
2412 rtm->rtm_src_len = 128;
2413 } else if (rtm->rtm_src_len)
2414 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2417 #ifdef CONFIG_IPV6_MROUTE
2418 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2419 int err = ip6mr_get_route(net, skb, rtm, nowait,
2426 goto nla_put_failure;
2428 if (err == -EMSGSIZE)
2429 goto nla_put_failure;
2434 NLA_PUT_U32(skb, RTA_IIF, iif);
2436 struct in6_addr saddr_buf;
2437 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2438 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2441 if (rt->rt6i_prefsrc.plen) {
2442 struct in6_addr saddr_buf;
2443 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2444 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2447 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2448 goto nla_put_failure;
2451 n = dst_get_neighbour(&rt->dst);
2453 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2455 goto nla_put_failure;
2461 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2463 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2465 if (!(rt->rt6i_flags & RTF_EXPIRES))
2467 else if (rt->rt6i_expires - jiffies < INT_MAX)
2468 expires = rt->rt6i_expires - jiffies;
2472 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2473 expires, rt->dst.error) < 0)
2474 goto nla_put_failure;
2476 return nlmsg_end(skb, nlh);
2479 nlmsg_cancel(skb, nlh);
/* FIB-walk callback for RTM_GETROUTE dumps: forward each route to
 * rt6_fill_node, honouring an RTM_F_PREFIX filter in the request. */
2483 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2485 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2488 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2489 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2490 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2494 return rt6_fill_node(arg->net,
2495 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2496 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2497 prefix, 0, NLM_F_MULTI);
/* RTM_GETROUTE handler: build a flowi6 from RTA_SRC/RTA_DST/RTA_IIF/
 * RTA_OIF, resolve it through ip6_route_output, serialize the result
 * with rt6_fill_node and unicast the reply to the requester. Validation
 * error paths are elided in this sampled view. Returns 0 or -errno. */
2500 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2502 struct net *net = sock_net(in_skb->sk);
2503 struct nlattr *tb[RTA_MAX+1];
2504 struct rt6_info *rt;
2505 struct sk_buff *skb;
2510 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2515 memset(&fl6, 0, sizeof(fl6));
2518 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2521 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2525 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2528 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2532 iif = nla_get_u32(tb[RTA_IIF]);
2535 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2538 struct net_device *dev;
2539 dev = __dev_get_by_index(net, iif);
2546 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2552 /* Reserve room for dummy headers, this skb can pass
2553 through good chunk of routing engine.
2555 skb_reset_mac_header(skb);
2556 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2558 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2559 skb_dst_set(skb, &rt->dst);
2561 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2562 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2563 nlh->nlmsg_seq, 0, 0, 0);
2569 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/* Broadcast a route add/delete @event to RTNLGRP_IPV6_ROUTE listeners.
 * An -EMSGSIZE from rt6_fill_node means rt6_nlmsg_size() is out of
 * sync — hence the WARN_ON. On failure, sets the group socket error. */
2574 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2576 struct sk_buff *skb;
2577 struct net *net = info->nl_net;
2582 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2584 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2588 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2589 event, info->pid, seq, 0, 0, 0);
2591 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2592 WARN_ON(err == -EMSGSIZE);
2596 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2597 info->nlh, gfp_any());
2601 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdev notifier: when a namespace's loopback registers, bind the
 * template null (and, with multiple tables, prohibit/blackhole) dst
 * entries to it and take inet6_dev references. */
2604 static int ip6_route_dev_notify(struct notifier_block *this,
2605 unsigned long event, void *data)
2607 struct net_device *dev = (struct net_device *)data;
2608 struct net *net = dev_net(dev);
2610 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2611 net->ipv6.ip6_null_entry->dst.dev = dev;
2612 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2613 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2614 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2615 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2616 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2617 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2628 #ifdef CONFIG_PROC_FS
/* /proc/net/ipv6_route formatter: one line per route — dst/plen,
 * src/plen (zeros without subtrees), next hop (zeros when no
 * neighbour), metric, refcnt, use count, flags, device name. */
2639 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2641 struct seq_file *m = p_arg;
2642 struct neighbour *n;
2644 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2646 #ifdef CONFIG_IPV6_SUBTREES
2647 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2649 seq_puts(m, "00000000000000000000000000000000 00 ");
2652 n = dst_get_neighbour(&rt->dst);
2654 seq_printf(m, "%pi6", n->primary_key);
2656 seq_puts(m, "00000000000000000000000000000000");
2659 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2660 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2661 rt->dst.__use, rt->rt6i_flags,
2662 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/* seq_file show: walk the whole FIB printing each route. */
2666 static int ipv6_route_show(struct seq_file *m, void *v)
2668 struct net *net = (struct net *)m->private;
2669 fib6_clean_all(net, rt6_info_route, 0, m);
/* open() for /proc/net/ipv6_route. */
2673 static int ipv6_route_open(struct inode *inode, struct file *file)
2675 return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route (.read elided in view). */
2678 static const struct file_operations ipv6_route_proc_fops = {
2679 .owner = THIS_MODULE,
2680 .open = ipv6_route_open,
2682 .llseek = seq_lseek,
2683 .release = single_release_net,
/* /proc/net/rt6_stats: dump the per-namespace fib6 counters plus the
 * live dst-entry count as seven hex fields. */
2686 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2688 struct net *net = (struct net *)seq->private;
2689 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2690 net->ipv6.rt6_stats->fib_nodes,
2691 net->ipv6.rt6_stats->fib_route_nodes,
2692 net->ipv6.rt6_stats->fib_rt_alloc,
2693 net->ipv6.rt6_stats->fib_rt_entries,
2694 net->ipv6.rt6_stats->fib_rt_cache,
2695 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2696 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats. */
2701 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2703 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats (.read elided in view). */
2706 static const struct file_operations rt6_stats_seq_fops = {
2707 .owner = THIS_MODULE,
2708 .open = rt6_stats_seq_open,
2710 .llseek = seq_lseek,
2711 .release = single_release_net,
2713 #endif /* CONFIG_PROC_FS */
2715 #ifdef CONFIG_SYSCTL
/* sysctl handler for net.ipv6.route.flush: writing a value triggers an
 * immediate garbage collection of the routing cache, using the written
 * delay (<= 0 means flush everything now). Note delay is read before
 * proc_dointvec() updates it — i.e. the previously stored value is
 * used; confirm intent against the full file (the write-only guard is
 * elided in this sampled view). */
2718 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2719 void __user *buffer, size_t *lenp, loff_t *ppos)
2726 net = (struct net *)ctl->extra1;
2727 delay = net->ipv6.sysctl.flush_delay;
2728 proc_dointvec(ctl, write, buffer, lenp, ppos);
2729 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-namespace net.ipv6.route.* sysctl table; the
 * .data pointers are rewritten per-net in ipv6_route_sysctl_init()
 * (entries must stay in the order that function indexes: flush,
 * gc_thresh, max_size, gc_min_interval, gc_timeout, gc_interval,
 * gc_elasticity, mtu_expires, min_adv_mss, gc_min_interval_ms). */
2733 ctl_table ipv6_route_table_template[] = {
2735 .procname = "flush",
2736 .data = &init_net.ipv6.sysctl.flush_delay,
2737 .maxlen = sizeof(int),
2739 .proc_handler = ipv6_sysctl_rtcache_flush
2742 .procname = "gc_thresh",
2743 .data = &ip6_dst_ops_template.gc_thresh,
2744 .maxlen = sizeof(int),
2746 .proc_handler = proc_dointvec,
2749 .procname = "max_size",
2750 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2751 .maxlen = sizeof(int),
2753 .proc_handler = proc_dointvec,
2756 .procname = "gc_min_interval",
2757 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2758 .maxlen = sizeof(int),
2760 .proc_handler = proc_dointvec_jiffies,
2763 .procname = "gc_timeout",
2764 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2765 .maxlen = sizeof(int),
2767 .proc_handler = proc_dointvec_jiffies,
2770 .procname = "gc_interval",
2771 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2772 .maxlen = sizeof(int),
2774 .proc_handler = proc_dointvec_jiffies,
2777 .procname = "gc_elasticity",
2778 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2779 .maxlen = sizeof(int),
2781 .proc_handler = proc_dointvec,
2784 .procname = "mtu_expires",
2785 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2786 .maxlen = sizeof(int),
2788 .proc_handler = proc_dointvec_jiffies,
2791 .procname = "min_adv_mss",
2792 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2793 .maxlen = sizeof(int),
2795 .proc_handler = proc_dointvec,
2798 .procname = "gc_min_interval_ms",
2799 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2800 .maxlen = sizeof(int),
2802 .proc_handler = proc_dointvec_ms_jiffies,
/* Clone the sysctl template for a new namespace and point each entry's
 * .data at that namespace's fields (indices match the template order).
 * Returns the table (NULL check on kmemdup elided in this view). */
2807 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2809 struct ctl_table *table;
2811 table = kmemdup(ipv6_route_table_template,
2812 sizeof(ipv6_route_table_template),
2816 table[0].data = &net->ipv6.sysctl.flush_delay;
2817 table[0].extra1 = net;
2818 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2819 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2820 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2821 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2822 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2823 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2824 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2825 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2826 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/* Per-namespace init: copy the dst_ops template, duplicate the
 * null/prohibit/blackhole template routes (self-referencing dst.path,
 * template metrics), and seed the GC/MTU sysctl defaults. Error
 * unwinding frees in reverse order via the goto labels. */
2833 static int __net_init ip6_route_net_init(struct net *net)
2837 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2838 sizeof(net->ipv6.ip6_dst_ops));
2840 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2841 goto out_ip6_dst_ops;
2843 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2844 sizeof(*net->ipv6.ip6_null_entry),
2846 if (!net->ipv6.ip6_null_entry)
2847 goto out_ip6_dst_entries;
2848 net->ipv6.ip6_null_entry->dst.path =
2849 (struct dst_entry *)net->ipv6.ip6_null_entry;
2850 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2851 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2852 ip6_template_metrics, true);
2854 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2855 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2856 sizeof(*net->ipv6.ip6_prohibit_entry),
2858 if (!net->ipv6.ip6_prohibit_entry)
2859 goto out_ip6_null_entry;
2860 net->ipv6.ip6_prohibit_entry->dst.path =
2861 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2862 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2863 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2864 ip6_template_metrics, true);
2866 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2867 sizeof(*net->ipv6.ip6_blk_hole_entry),
2869 if (!net->ipv6.ip6_blk_hole_entry)
2870 goto out_ip6_prohibit_entry;
2871 net->ipv6.ip6_blk_hole_entry->dst.path =
2872 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2873 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2874 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2875 ip6_template_metrics, true);
2878 net->ipv6.sysctl.flush_delay = 0;
2879 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2880 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2881 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2882 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2883 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2884 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2885 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2887 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2893 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2894 out_ip6_prohibit_entry:
2895 kfree(net->ipv6.ip6_prohibit_entry);
2897 kfree(net->ipv6.ip6_null_entry);
2899 out_ip6_dst_entries:
2900 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-namespace teardown: free the template route copies and the dst
 * entry counters (mirror of ip6_route_net_init). */
2905 static void __net_exit ip6_route_net_exit(struct net *net)
2907 kfree(net->ipv6.ip6_null_entry);
2908 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2909 kfree(net->ipv6.ip6_prohibit_entry);
2910 kfree(net->ipv6.ip6_blk_hole_entry);
2912 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-namespace init: register the two /proc/net entries. */
2915 static int __net_init ip6_route_net_init_late(struct net *net)
2917 #ifdef CONFIG_PROC_FS
2918 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2919 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/* Late per-namespace teardown: remove the /proc/net entries. */
2924 static void __net_exit ip6_route_net_exit_late(struct net *net)
2926 #ifdef CONFIG_PROC_FS
2927 proc_net_remove(net, "ipv6_route");
2928 proc_net_remove(net, "rt6_stats");
/* Core per-netns hooks: allocate/free the routing template entries and
 * sysctl defaults when a network namespace is created/destroyed. */
2932 static struct pernet_operations ip6_route_net_ops = {
2933 .init = ip6_route_net_init,
2934 .exit = ip6_route_net_exit,
/* Late per-netns hooks, registered separately so the procfs entries come
 * and go after/before the core routing state. */
2937 static struct pernet_operations ip6_route_net_late_ops = {
2938 .init = ip6_route_net_init_late,
2939 .exit = ip6_route_net_exit_late,
/* Netdevice event notifier; ip6_route_dev_notify is defined elsewhere in
 * this file. */
2942 static struct notifier_block ip6_route_dev_notifier = {
2943 .notifier_call = ip6_route_dev_notify,
/* Module/boot-time initialization of the IPv6 routing subsystem.
 * Sets up the rt6_info slab cache, dst accounting, per-netns state,
 * rtnetlink route handlers and the netdevice notifier. On any failure
 * the goto chain at the bottom unwinds everything registered so far,
 * in reverse order.
 * NOTE(review): several intermediate lines (error checks, fib6_init()
 * call, labels) are elided from this view — the unwind labels below
 * correspond to goto sites partially hidden here. */
2947 int __init ip6_route_init(void)
/* Slab cache for rt6_info objects; shared by all namespaces. */
2952 ip6_dst_ops_template.kmem_cachep =
2953 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2954 SLAB_HWCACHE_ALIGN, NULL);
2955 if (!ip6_dst_ops_template.kmem_cachep)
2958 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2960 goto out_kmem_cache;
2962 ret = register_pernet_subsys(&ip6_route_net_ops);
2964 goto out_dst_entries;
/* The blackhole ops reuse the same slab cache as the normal dst ops. */
2966 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2968 /* Registering of the loopback is done before this portion of code,
2969 * the loopback reference in rt6_info will not be taken, do it
2970 * manually for init_net */
2971 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2972 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2973 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2974 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2975 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2977 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2981 goto out_register_subsys;
2987 ret = fib6_rules_init();
2991 ret = register_pernet_subsys(&ip6_route_net_late_ops);
2993 goto fib6_rules_init;
/* Userspace route management over rtnetlink: add/delete/query routes. */
2996 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2997 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2998 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2999 goto out_register_late_subsys;
3001 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3003 goto out_register_late_subsys;
/* Error unwind: undo registrations in reverse order of setup. */
3008 out_register_late_subsys:
3009 unregister_pernet_subsys(&ip6_route_net_late_ops);
3011 fib6_rules_cleanup();
3016 out_register_subsys:
3017 unregister_pernet_subsys(&ip6_route_net_ops);
3019 dst_entries_destroy(&ip6_dst_blackhole_ops);
3021 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/* Module unload path: tear down everything ip6_route_init() set up,
 * in reverse order of registration. */
3025 void ip6_route_cleanup(void)
3027 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3028 unregister_pernet_subsys(&ip6_route_net_late_ops);
3029 fib6_rules_cleanup();
3032 unregister_pernet_subsys(&ip6_route_net_ops);
3033 dst_entries_destroy(&ip6_dst_blackhole_ops);
3034 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);