2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
65 /* Set to 3 to get tracing. */
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
73 #define RT6_TRACE(x...) do { ; } while (0)
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77 const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void ip6_dst_destroy(struct dst_entry *);
83 static void ip6_dst_ifdown(struct dst_entry *,
84 struct net_device *dev, int how);
85 static int ip6_dst_gc(struct dst_ops *ops);
87 static int ip6_pkt_discard(struct sk_buff *skb);
88 static int ip6_pkt_discard_out(struct sk_buff *skb);
89 static void ip6_link_failure(struct sk_buff *skb);
90 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94 const struct in6_addr *prefix, int prefixlen,
95 const struct in6_addr *gwaddr, int ifindex,
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98 const struct in6_addr *prefix, int prefixlen,
99 const struct in6_addr *gwaddr, int ifindex);
/* NOTE(review): this chunk is an incomplete extraction (original kernel line
 * numbers are fused into each line and many lines are missing).  Comments
 * describe only what the visible lines show. */
/* Copy-on-write the dst's metrics array into per-peer storage so the route
 * can carry modified metrics; binds an inet_peer on demand. */
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
104 struct rt6_info *rt = (struct rt6_info *) dst;
105 struct inet_peer *peer;
108 if (!(rt->dst.flags & DST_HOST))
112 rt6_bind_peer(rt, 1);
114 peer = rt->rt6i_peer;
116 u32 *old_p = __DST_METRICS_PTR(old);
117 unsigned long prev, new;
/* Fresh peer metrics inherit the old values wholesale. */
120 if (inet_metrics_new(peer))
121 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
123 new = (unsigned long) p;
/* cmpxchg publishes the peer-backed pointer; another CPU may have raced us,
 * in which case prev != old (handling not visible here). */
124 prev = cmpxchg(&dst->_metrics, old, new);
127 p = __DST_METRICS_PTR(prev);
128 if (prev & DST_METRICS_READ_ONLY)
/* Neighbour lookup hook shared by both dst_ops tables: resolve daddr via the
 * IPv6 neighbour-discovery table on the dst's device. */
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
137 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
/* dst_ops used for normal IPv6 routes; per-namespace copies are presumably
 * made from this template (instantiation not visible in this chunk). */
140 static struct dst_ops ip6_dst_ops_template = {
142 .protocol = cpu_to_be16(ETH_P_IPV6),
145 .check = ip6_dst_check,
146 .default_advmss = ip6_default_advmss,
148 .cow_metrics = ipv6_cow_metrics,
149 .destroy = ip6_dst_destroy,
150 .ifdown = ip6_dst_ifdown,
151 .negative_advice = ip6_negative_advice,
152 .link_failure = ip6_link_failure,
153 .update_pmtu = ip6_rt_update_pmtu,
154 .local_out = __ip6_local_out,
155 .neigh_lookup = ip6_neigh_lookup,
/* Blackhole routes report the stored MTU, falling back to the device MTU
 * when no metric is set (`?:` keeps the non-zero left operand). */
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
160 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
162 return mtu ? : dst->dev->mtu;
/* Blackhole stubs: PMTU updates and metric COW are intentionally inert
 * (bodies not visible in this chunk, but the ops table below uses them). */
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops for blackhole routes: no ifdown/link_failure handling, inert
 * update_pmtu/cow_metrics. */
175 static struct dst_ops ip6_dst_blackhole_ops = {
177 .protocol = cpu_to_be16(ETH_P_IPV6),
178 .destroy = ip6_dst_destroy,
179 .check = ip6_dst_check,
180 .mtu = ip6_blackhole_mtu,
181 .default_advmss = ip6_default_advmss,
182 .update_pmtu = ip6_rt_blackhole_update_pmtu,
183 .cow_metrics = ip6_rt_blackhole_cow_metrics,
184 .neigh_lookup = ip6_neigh_lookup,
/* Template metrics for the static route templates below (hoplimit 0 means
 * "use the per-device/namespace default"). */
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188 [RTAX_HOPLIMIT - 1] = 0,
/* "Null" route: matches nothing reachable; discards packets and reports
 * -ENETUNREACH.  refcnt starts at 1 so it is never freed. */
191 static struct rt6_info ip6_null_entry_template = {
193 .__refcnt = ATOMIC_INIT(1),
196 .error = -ENETUNREACH,
197 .input = ip6_pkt_discard,
198 .output = ip6_pkt_discard_out,
200 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
201 .rt6i_protocol = RTPROT_KERNEL,
202 .rt6i_metric = ~(u32) 0,
203 .rt6i_ref = ATOMIC_INIT(1),
/* Extra templates only needed when policy routing can return
 * prohibit/blackhole verdicts. */
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
208 static int ip6_pkt_prohibit(struct sk_buff *skb);
209 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
/* Administratively-prohibited route: like the null entry but with the
 * prohibit handlers (presumably sending ICMPv6 adm-prohibited; bodies not
 * visible here). */
211 static struct rt6_info ip6_prohibit_entry_template = {
213 .__refcnt = ATOMIC_INIT(1),
217 .input = ip6_pkt_prohibit,
218 .output = ip6_pkt_prohibit_out,
220 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
221 .rt6i_protocol = RTPROT_KERNEL,
222 .rt6i_metric = ~(u32) 0,
223 .rt6i_ref = ATOMIC_INIT(1),
/* Blackhole route: silently discards in both directions (dst_discard),
 * no ICMP error generated. */
226 static struct rt6_info ip6_blk_hole_entry_template = {
228 .__refcnt = ATOMIC_INIT(1),
232 .input = dst_discard,
233 .output = dst_discard,
235 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
236 .rt6i_protocol = RTPROT_KERNEL,
237 .rt6i_metric = ~(u32) 0,
238 .rt6i_ref = ATOMIC_INIT(1),
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245 struct net_device *dev,
248 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
/* Zero the rt6_info-specific tail (everything past the embedded dst_entry);
 * rt6i_table appears to be the first such member. */
251 memset(&rt->rt6i_table, 0,
252 sizeof(*rt) - sizeof(struct dst_entry));
/* Teardown: release route-private metrics, the inet6_dev reference and the
 * bound inet_peer (release calls not visible in this extraction). */
257 static void ip6_dst_destroy(struct dst_entry *dst)
259 struct rt6_info *rt = (struct rt6_info *)dst;
260 struct inet6_dev *idev = rt->rt6i_idev;
261 struct inet_peer *peer = rt->rt6i_peer;
/* Non-host routes own their metrics array; host routes share peer metrics. */
263 if (!(rt->dst.flags & DST_HOST))
264 dst_destroy_metrics_generic(dst);
267 rt->rt6i_idev = NULL;
271 rt->rt6i_peer = NULL;
/* Generation counter: bumped elsewhere to invalidate cached peer bindings
 * (readers compare against rt->rt6i_peer_genid in ip6_dst_check). */
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
278 static u32 rt6_peer_genid(void)
280 return atomic_read(&__rt6_peer_genid);
/* Bind an inet_peer to the route's destination; cmpxchg guards against a
 * concurrent binder (loser's peer is presumably released — not visible). */
283 void rt6_bind_peer(struct rt6_info *rt, int create)
285 struct inet_peer *peer;
287 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
291 rt->rt6i_peer_genid = rt6_peer_genid();
/* Device is going away: re-point the route's inet6_dev at the namespace
 * loopback device so the rt6_info stays valid (old idev release not
 * visible in this extraction). */
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
297 struct rt6_info *rt = (struct rt6_info *)dst;
298 struct inet6_dev *idev = rt->rt6i_idev;
299 struct net_device *loopback_dev =
300 dev_net(dev)->loopback_dev;
302 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303 struct inet6_dev *loopback_idev =
304 in6_dev_get(loopback_dev);
305 if (loopback_idev != NULL) {
306 rt->rt6i_idev = loopback_idev;
/* True when the route carries RTF_EXPIRES and its deadline has passed. */
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
314 return (rt->rt6i_flags & RTF_EXPIRES) &&
315 time_after(jiffies, rt->rt6i_expires);
/* Destinations whose scope makes the outgoing interface mandatory. */
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
320 return ipv6_addr_type(daddr) &
321 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
325 * Route lookup. Any table->tb6_lock is implied.
/* Walk the sibling routes of a fib6 leaf and pick the one matching the
 * requested outgoing interface (oif) and/or source address; loopback
 * devices are tracked separately as a fallback.  With no oif and no saddr
 * the first route wins (early exit at line 337/338). */
328 static inline struct rt6_info *rt6_device_match(struct net *net,
330 const struct in6_addr *saddr,
334 struct rt6_info *local = NULL;
335 struct rt6_info *sprt;
337 if (!oif && ipv6_addr_any(saddr))
340 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 struct net_device *dev = sprt->rt6i_dev;
344 if (dev->ifindex == oif)
346 if (dev->flags & IFF_LOOPBACK) {
347 if (sprt->rt6i_idev == NULL ||
348 sprt->rt6i_idev->dev->ifindex != oif) {
349 if (flags & RT6_LOOKUP_F_IFACE && oif)
/* Prefer a previously-found local route bound to the same oif. */
351 if (local && (!oif ||
352 local->rt6i_idev->dev->ifindex == oif))
/* Source-address match: saddr must be configured on this device. */
358 if (ipv6_chk_addr(net, saddr, dev,
359 flags & RT6_LOOKUP_F_IFACE))
/* Strict interface matching with no candidate => unreachable. */
368 if (flags & RT6_LOOKUP_F_IFACE)
369 return net->ipv6.ip6_null_entry;
375 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Router Reachability Probing (RFC 4191 context): send a unicast NS to a
 * router whose neighbour entry is not VALID, rate-limited by
 * rtr_probe_interval via neigh->updated. */
376 static void rt6_probe(struct rt6_info *rt)
378 struct neighbour *neigh;
380 * Okay, this does not seem to be appropriate
381 * for now, however, we need to check if it
382 * is really so; aka Router Reachability Probing.
384 * Router Reachability Probe MUST be rate-limited
385 * to no more than one per minute.
388 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
389 if (!neigh || (neigh->nud_state & NUD_VALID))
391 read_lock_bh(&neigh->lock);
/* Re-check state under the lock; only probe once per interval. */
392 if (!(neigh->nud_state & NUD_VALID) &&
393 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 struct in6_addr mcaddr;
395 struct in6_addr *target;
397 neigh->updated = jiffies;
398 read_unlock_bh(&neigh->lock);
/* Solicit via the target's solicited-node multicast address. */
400 target = (struct in6_addr *)&neigh->primary_key;
401 addrconf_addr_solict_mult(target, &mcaddr);
402 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
404 read_unlock_bh(&neigh->lock);
/* No-op stub when router preferences are compiled out. */
410 static inline void rt6_probe(struct rt6_info *rt)
416 * Default Router Selection (RFC 2461 6.3.6)
/* Device score: matches when no oif was requested, the device is the oif,
 * or a loopback bound to the oif's inet6_dev. */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Neighbour-reachability score for a gateway route; NONEXTHOP and
 * non-gateway routes need no neighbour check. */
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
435 neigh = dst_get_neighbour(&rt->dst);
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
449 read_unlock_bh(&neigh->lock);
/* Combine device match, (optional) router preference, and neighbour
 * reachability into a single comparable score. */
456 static int rt6_score_route(struct rt6_info *rt, int oif,
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/* Compare one candidate route against the best found so far (*mpri holds
 * the best score); expired routes are skipped. */
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
478 if (rt6_check_expired(rt))
481 m = rt6_score_route(rt, oif, strict);
486 if (strict & RT6_LOOKUP_F_REACHABLE)
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/* Scan all same-metric siblings of a fib6 leaf, starting at the round-robin
 * head (rr_head) and wrapping around to the list head. */
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
/* Default router selection per RFC 2461 6.3.6: pick the best-scoring route
 * at the node; with no reachable match, advance the round-robin pointer. */
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
542 RT6_TRACE("%s() => %p\n",
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
549 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option from a Router Advertisement
 * (RFC 4191): validate lengths, then add/update/delete the corresponding
 * RTF_ROUTEINFO route with the advertised preference and lifetime. */
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
557 unsigned long lifetime;
560 if (len < sizeof(struct route_info)) {
564 /* Sanity check for prefix_len and length */
/* The option length field is in units of 8 octets: 1..3 are valid, and the
 * minimum length grows with the prefix length. */
565 if (rinfo->length > 3) {
567 } else if (rinfo->prefix_len > 128) {
569 } else if (rinfo->prefix_len > 64) {
570 if (rinfo->length < 2) {
573 } else if (rinfo->prefix_len > 0) {
574 if (rinfo->length < 1) {
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 carries a full 128-bit prefix; otherwise build a truncated
 * copy so we never read past the option. */
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
592 prefix = &prefix_buf;
595 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Zero lifetime withdraws an existing route. */
598 if (rt && !lifetime) {
604 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
607 rt->rt6i_flags = RTF_ROUTEINFO |
608 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* Infinite lifetime clears RTF_EXPIRES; otherwise arm the expiry. */
611 if (!addrconf_finite_timeout(lifetime)) {
612 rt->rt6i_flags &= ~RTF_EXPIRES;
614 rt->rt6i_expires = jiffies + HZ * lifetime;
615 rt->rt6i_flags |= RTF_EXPIRES;
617 dst_release(&rt->dst);
/* Lookup backtracking helper: when the match is the null entry, climb
 * toward the tree root (stopping at RTN_TL_ROOT), descending into source
 * subtrees (FIB6_SUBTREE) on the way, until a node with route info
 * (RTN_RTINFO) is found.  Expects `rt` and `fn` in the caller's scope. */
623 #define BACKTRACK(__net, saddr) \
625 if (rt == __net->ipv6.ip6_null_entry) { \
626 struct fib6_node *pn; \
628 if (fn->fn_flags & RTN_TL_ROOT) \
631 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
632 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
635 if (fn->fn_flags & RTN_RTINFO) \
/* Simple (non-caching) table lookup: fib6_lookup + device match +
 * BACKTRACK, all under the table read lock; bumps the result's use count. */
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642 struct fib6_table *table,
643 struct flowi6 *fl6, int flags)
645 struct fib6_node *fn;
648 read_lock_bh(&table->tb6_lock);
649 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
652 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653 BACKTRACK(net, &fl6->saddr);
655 dst_use(&rt->dst, jiffies);
656 read_unlock_bh(&table->tb6_lock);
/* Public lookup entry point: builds a flowi6 and defers to the policy
 * lookup machinery; `strict` forces interface matching. */
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662 const struct in6_addr *saddr, int oif, int strict)
664 struct flowi6 fl6 = {
668 struct dst_entry *dst;
669 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
672 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673 flags |= RT6_LOOKUP_F_HAS_SADDR;
676 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
678 return (struct rt6_info *) dst;
685 EXPORT_SYMBOL(rt6_lookup);
687 /* ip6_ins_rt is called with FREE table->tb6_lock.
688 It takes new route entry, the addition fails by any reason the
689 route is freed. In any case, if caller does not hold it, it may
/* Insert a route into its fib6 table under the table write lock. */
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
696 struct fib6_table *table;
698 table = rt->rt6i_table;
699 write_lock_bh(&table->tb6_lock);
700 err = fib6_add(&table->tb6_root, rt, info);
701 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper supplying netlink info derived from the route's
 * own namespace. */
706 int ip6_ins_rt(struct rt6_info *rt)
708 struct nl_info info = {
709 .nl_net = dev_net(rt->rt6i_dev),
711 return __ip6_ins_rt(rt, &info);
/* Clone a route into a host (/128) RTF_CACHE entry bound to a resolved
 * neighbour.  On neighbour-table overflow it temporarily relaxes the GC
 * sysctls, forces a GC pass, and retries (once, unless in softirq). */
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715 const struct in6_addr *daddr,
716 const struct in6_addr *saddr)
724 rt = ip6_rt_copy(ort, daddr);
727 struct neighbour *neigh;
/* Retrying a failed neighbour allocation sleeps in GC, so only allow a
 * retry when not in softirq context. */
728 int attempts = !in_softirq();
730 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
/* On-link route: the destination itself becomes the gateway; flag
 * anycast when daddr equals a non-/128 route's own prefix address. */
731 if (ort->rt6i_dst.plen != 128 &&
732 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733 rt->rt6i_flags |= RTF_ANYCAST;
734 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
737 rt->rt6i_flags |= RTF_CACHE;
739 #ifdef CONFIG_IPV6_SUBTREES
740 if (rt->rt6i_src.plen && saddr) {
741 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
742 rt->rt6i_src.plen = 128;
747 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
749 struct net *net = dev_net(rt->rt6i_dev);
750 int saved_rt_min_interval =
751 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752 int saved_rt_elasticity =
753 net->ipv6.sysctl.ip6_rt_gc_elasticity;
755 if (attempts-- > 0) {
/* Make GC maximally aggressive for the forced pass, then restore. */
756 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
759 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
761 net->ipv6.sysctl.ip6_rt_gc_elasticity =
763 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764 saved_rt_min_interval;
770 "ipv6: Neighbour table overflow.\n");
774 dst_set_neighbour(&rt->dst, neigh);
/* Cheaper clone for non-gateway, non-host routes: reuse (clone) the
 * original's neighbour instead of resolving a new one. */
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782 const struct in6_addr *daddr)
784 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
787 rt->rt6i_flags |= RTF_CACHE;
788 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
/* Core policy-routing lookup shared by the input and output paths:
 * select a route (preferring reachable routers unless forwarding is on),
 * and if it is not already a cache entry, create one via cow/clone and
 * insert it — relooking up on the insertion race. */
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794 struct flowi6 *fl6, int flags, bool input)
796 struct fib6_node *fn;
797 struct rt6_info *rt, *nrt;
/* When forwarding, do not restrict selection to reachable routers. */
801 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
802 int local = RTF_NONEXTHOP;
804 strict |= flags & RT6_LOOKUP_F_IFACE;
809 read_lock_bh(&table->tb6_lock);
812 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
815 rt = rt6_select(fn, oif, strict | reachable);
817 BACKTRACK(net, &fl6->saddr);
/* Null entry or an existing cache entry can be returned as-is. */
818 if (rt == net->ipv6.ip6_null_entry ||
819 rt->rt6i_flags & RTF_CACHE)
823 read_unlock_bh(&table->tb6_lock);
/* No neighbour bound and next hop needed => resolve via rt6_alloc_cow;
 * a non-host route without one gets a plain clone instead. */
825 if (!dst_get_neighbour_raw(&rt->dst)
826 && !(rt->rt6i_flags & local))
827 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
828 else if (!(rt->dst.flags & DST_HOST))
829 nrt = rt6_alloc_clone(rt, &fl6->daddr);
833 dst_release(&rt->dst);
834 rt = nrt ? : net->ipv6.ip6_null_entry;
838 err = ip6_ins_rt(nrt);
847 * Race condition! In the gap, when table->tb6_lock was
848 * released someone could insert this route. Relookup.
850 dst_release(&rt->dst);
859 read_unlock_bh(&table->tb6_lock);
861 rt->dst.lastuse = jiffies;
/* Input-path adapter: route on the incoming interface. */
867 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
868 struct flowi6 *fl6, int flags)
870 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
/* Attach a route to a received skb: build the flow key from the IPv6
 * header and set the lookup result as the skb's dst. */
873 void ip6_route_input(struct sk_buff *skb)
875 const struct ipv6hdr *iph = ipv6_hdr(skb);
876 struct net *net = dev_net(skb->dev);
877 int flags = RT6_LOOKUP_F_HAS_SADDR;
878 struct flowi6 fl6 = {
879 .flowi6_iif = skb->dev->ifindex,
882 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
883 .flowi6_mark = skb->mark,
884 .flowi6_proto = iph->nexthdr,
/* Scoped destinations need strict interface matching, except on PIM
 * register pseudo-devices. */
887 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
888 flags |= RT6_LOOKUP_F_IFACE;
890 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
/* Output-path adapter: route on the requested outgoing interface. */
893 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
894 struct flowi6 *fl6, int flags)
896 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
/* Output lookup used by locally generated traffic; honours the socket's
 * bound device and source-address preferences. */
899 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
904 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
905 flags |= RT6_LOOKUP_F_IFACE;
907 if (!ipv6_addr_any(&fl6->saddr))
908 flags |= RT6_LOOKUP_F_HAS_SADDR;
910 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
912 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
915 EXPORT_SYMBOL(ip6_route_output);
/* Build a blackhole copy of an existing route (used e.g. by IPsec/xfrm
 * callers): same addressing state, but input/output discard packets.
 * Always releases dst_orig; returns ERR_PTR(-ENOMEM) on failure. */
917 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
919 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
920 struct dst_entry *new = NULL;
922 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
924 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
929 new->input = dst_discard;
930 new->output = dst_discard;
/* Read-only metrics can be shared by pointer; writable ones are copied. */
932 if (dst_metrics_read_only(&ort->dst))
933 new->_metrics = ort->dst._metrics;
935 dst_copy_metrics(new, &ort->dst);
936 rt->rt6i_idev = ort->rt6i_idev;
938 in6_dev_hold(rt->rt6i_idev);
939 rt->rt6i_expires = 0;
941 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
942 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
945 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
946 #ifdef CONFIG_IPV6_SUBTREES
947 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
953 dst_release(dst_orig);
954 return new ? new : ERR_PTR(-ENOMEM);
958 * Destination cache support functions
/* Validate a cached dst: the fib6 node's serial number must match the
 * cookie; also refresh the peer binding when the genid has moved on. */
961 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
965 rt = (struct rt6_info *) dst;
967 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
968 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
970 rt6_bind_peer(rt, 0);
971 rt->rt6i_peer_genid = rt6_peer_genid();
/* Socket signalled trouble with this dst: expired cache entries are
 * presumably dropped here (the deletion path is not visible). */
978 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
980 struct rt6_info *rt = (struct rt6_info *) dst;
983 if (rt->rt6i_flags & RTF_CACHE) {
984 if (rt6_check_expired(rt)) {
/* Transmission failed: report address-unreachable to the sender, expire a
 * cache entry immediately, or invalidate the default-route node. */
996 static void ip6_link_failure(struct sk_buff *skb)
1000 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1002 rt = (struct rt6_info *) skb_dst(skb);
1004 if (rt->rt6i_flags&RTF_CACHE) {
1005 dst_set_expires(&rt->dst, 0);
1006 rt->rt6i_flags |= RTF_EXPIRES;
1007 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1008 rt->rt6i_node->fn_sernum = -1;
/* Path-MTU update for a host route: only shrink, and below IPV6_MIN_MTU
 * keep the minimum MTU but enable ALLFRAG instead. */
1012 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1014 struct rt6_info *rt6 = (struct rt6_info*)dst;
1016 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1017 rt6->rt6i_flags |= RTF_MODIFIED;
1018 if (mtu < IPV6_MIN_MTU) {
1019 u32 features = dst_metric(dst, RTAX_FEATURES);
1021 features |= RTAX_FEATURE_ALLFRAG;
1022 dst_metric_set(dst, RTAX_FEATURES, features);
1024 dst_metric_set(dst, RTAX_MTU, mtu);
/* Default advertised MSS: path MTU minus IPv6+TCP headers, clamped below
 * by the ip6_rt_min_advmss sysctl and above by the non-jumbo maximum. */
1028 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1030 struct net_device *dev = dst->dev;
1031 unsigned int mtu = dst_mtu(dst);
1032 struct net *net = dev_net(dev);
1034 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1036 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1037 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1040 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1041 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1042 * IPV6_MAXPLEN is also valid and means: "any MSS,
1043 * rely only on pmtu discovery"
1045 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* Effective MTU for the dst: explicit RTAX_MTU metric if set, otherwise
 * the device inet6_dev's mtu6 (fallback path partly not visible). */
1050 static unsigned int ip6_mtu(const struct dst_entry *dst)
1052 struct inet6_dev *idev;
1053 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1061 idev = __in6_dev_get(dst->dev);
1063 mtu = idev->cnf.mtu6;
/* ICMPv6 keeps its throw-away dsts on a private list, garbage-collected
 * separately from the fib (see icmp6_dst_gc below). */
1069 static struct dst_entry *icmp6_dst_gc_list;
1070 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* Build a standalone host dst for sending an ICMPv6 packet to `addr`:
 * not inserted into any fib table, just linked onto icmp6_dst_gc_list. */
1072 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1073 struct neighbour *neigh,
1074 const struct in6_addr *addr)
1076 struct rt6_info *rt;
1077 struct inet6_dev *idev = in6_dev_get(dev);
1078 struct net *net = dev_net(dev);
1080 if (unlikely(idev == NULL))
1083 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1084 if (unlikely(rt == NULL)) {
/* Resolve a neighbour if the caller did not supply one. */
1092 neigh = ndisc_get_neigh(dev, addr);
1097 rt->dst.flags |= DST_HOST;
1098 rt->dst.output = ip6_output;
1099 dst_set_neighbour(&rt->dst, neigh);
1100 atomic_set(&rt->dst.__refcnt, 1);
1101 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1102 rt->rt6i_dst.plen = 128;
1103 rt->rt6i_idev = idev;
/* HOPLIMIT 0 => use the per-device/namespace default. */
1104 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1106 spin_lock_bh(&icmp6_dst_lock);
1107 rt->dst.next = icmp6_dst_gc_list;
1108 icmp6_dst_gc_list = &rt->dst;
1109 spin_unlock_bh(&icmp6_dst_lock);
1111 fib6_force_start_gc(net);
/* Reap unreferenced entries from the ICMPv6 dst list. */
1117 int icmp6_dst_gc(void)
1119 struct dst_entry *dst, **pprev;
1122 spin_lock_bh(&icmp6_dst_lock);
1123 pprev = &icmp6_dst_gc_list;
1125 while ((dst = *pprev) != NULL) {
1126 if (!atomic_read(&dst->__refcnt)) {
1135 spin_unlock_bh(&icmp6_dst_lock);
/* Apply `func` to each ICMPv6 dst; entries for which it returns true are
 * presumably unlinked (removal statements not visible in this chunk). */
1140 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1143 struct dst_entry *dst, **pprev;
1145 spin_lock_bh(&icmp6_dst_lock);
1146 pprev = &icmp6_dst_gc_list;
1147 while ((dst = *pprev) != NULL) {
1148 struct rt6_info *rt = (struct rt6_info *) dst;
1149 if (func(rt, arg)) {
1156 spin_unlock_bh(&icmp6_dst_lock);
/* dst_ops garbage collector: skip if called again within the min interval
 * and the entry count is under rt_max_size; otherwise run fib6 GC with an
 * adaptive expiry that decays by the elasticity sysctl. */
1159 static int ip6_dst_gc(struct dst_ops *ops)
1161 unsigned long now = jiffies;
1162 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1163 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1164 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1165 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1166 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1167 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1170 entries = dst_entries_get_fast(ops);
1171 if (time_after(rt_last_gc + rt_min_interval, now) &&
1172 entries <= rt_max_size)
1175 net->ipv6.ip6_rt_gc_expire++;
1176 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1177 net->ipv6.ip6_rt_last_gc = now;
1178 entries = dst_entries_get_slow(ops);
/* Under threshold: reset the expiry aggressiveness to half the timeout. */
1179 if (entries < ops->gc_thresh)
1180 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1182 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
/* Non-zero return tells dst_alloc the table is still over budget. */
1183 return entries > rt_max_size;
1186 /* Clean host part of a prefix. Not necessary in radix tree,
1187 but results in cleaner routing tables.
1189 Remove it only when all the things will work!
/* Hop limit for a dst: explicit RTAX_HOPLIMIT metric if non-zero,
 * otherwise the device's configured hop_limit, falling back to the
 * namespace-wide default. */
1192 int ip6_dst_hoplimit(struct dst_entry *dst)
1194 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1195 if (hoplimit == 0) {
1196 struct net_device *dev = dst->dev;
1197 struct inet6_dev *idev;
1200 idev = __in6_dev_get(dev);
1202 hoplimit = idev->cnf.hop_limit;
1204 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1209 EXPORT_SYMBOL(ip6_dst_hoplimit);
1215 int ip6_route_add(struct fib6_config *cfg)
1218 struct net *net = cfg->fc_nlinfo.nl_net;
1219 struct rt6_info *rt = NULL;
1220 struct net_device *dev = NULL;
1221 struct inet6_dev *idev = NULL;
1222 struct fib6_table *table;
1225 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1227 #ifndef CONFIG_IPV6_SUBTREES
1228 if (cfg->fc_src_len)
1231 if (cfg->fc_ifindex) {
1233 dev = dev_get_by_index(net, cfg->fc_ifindex);
1236 idev = in6_dev_get(dev);
1241 if (cfg->fc_metric == 0)
1242 cfg->fc_metric = IP6_RT_PRIO_USER;
1244 table = fib6_new_table(net, cfg->fc_table);
1245 if (table == NULL) {
1250 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1257 rt->dst.obsolete = -1;
1258 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1259 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1262 if (cfg->fc_protocol == RTPROT_UNSPEC)
1263 cfg->fc_protocol = RTPROT_BOOT;
1264 rt->rt6i_protocol = cfg->fc_protocol;
1266 addr_type = ipv6_addr_type(&cfg->fc_dst);
1268 if (addr_type & IPV6_ADDR_MULTICAST)
1269 rt->dst.input = ip6_mc_input;
1270 else if (cfg->fc_flags & RTF_LOCAL)
1271 rt->dst.input = ip6_input;
1273 rt->dst.input = ip6_forward;
1275 rt->dst.output = ip6_output;
1277 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1278 rt->rt6i_dst.plen = cfg->fc_dst_len;
1279 if (rt->rt6i_dst.plen == 128)
1280 rt->dst.flags |= DST_HOST;
1282 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1283 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1288 dst_init_metrics(&rt->dst, metrics, 0);
1290 #ifdef CONFIG_IPV6_SUBTREES
1291 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1292 rt->rt6i_src.plen = cfg->fc_src_len;
1295 rt->rt6i_metric = cfg->fc_metric;
1297 /* We cannot add true routes via loopback here,
1298 they would result in kernel looping; promote them to reject routes
1300 if ((cfg->fc_flags & RTF_REJECT) ||
1301 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1302 && !(cfg->fc_flags&RTF_LOCAL))) {
1303 /* hold loopback dev/idev if we haven't done so. */
1304 if (dev != net->loopback_dev) {
1309 dev = net->loopback_dev;
1311 idev = in6_dev_get(dev);
1317 rt->dst.output = ip6_pkt_discard_out;
1318 rt->dst.input = ip6_pkt_discard;
1319 rt->dst.error = -ENETUNREACH;
1320 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1324 if (cfg->fc_flags & RTF_GATEWAY) {
1325 const struct in6_addr *gw_addr;
1328 gw_addr = &cfg->fc_gateway;
1329 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1330 gwa_type = ipv6_addr_type(gw_addr);
1332 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1333 struct rt6_info *grt;
1335 /* IPv6 strictly inhibits using not link-local
1336 addresses as nexthop address.
1337 Otherwise, router will not able to send redirects.
1338 It is very good, but in some (rare!) circumstances
1339 (SIT, PtP, NBMA NOARP links) it is handy to allow
1340 some exceptions. --ANK
1343 if (!(gwa_type&IPV6_ADDR_UNICAST))
1346 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1348 err = -EHOSTUNREACH;
1352 if (dev != grt->rt6i_dev) {
1353 dst_release(&grt->dst);
1357 dev = grt->rt6i_dev;
1358 idev = grt->rt6i_idev;
1360 in6_dev_hold(grt->rt6i_idev);
1362 if (!(grt->rt6i_flags&RTF_GATEWAY))
1364 dst_release(&grt->dst);
1370 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1378 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1379 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1383 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1384 rt->rt6i_prefsrc.plen = 128;
1386 rt->rt6i_prefsrc.plen = 0;
1388 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1389 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1394 dst_set_neighbour(&rt->dst, n);
1397 rt->rt6i_flags = cfg->fc_flags;
1404 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1405 int type = nla_type(nla);
1408 if (type > RTAX_MAX) {
1413 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1419 rt->rt6i_idev = idev;
1420 rt->rt6i_table = table;
1422 cfg->fc_nlinfo.nl_net = dev_net(dev);
1424 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/* Remove a route from its table under the write lock; the null entry must
 * never be deleted.  Always drops the caller's reference. */
1436 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1439 struct fib6_table *table;
1440 struct net *net = dev_net(rt->rt6i_dev);
1442 if (rt == net->ipv6.ip6_null_entry) {
1447 table = rt->rt6i_table;
1448 write_lock_bh(&table->tb6_lock);
1449 err = fib6_del(rt, info);
1450 write_unlock_bh(&table->tb6_lock);
1453 dst_release(&rt->dst);
/* Wrapper supplying netlink info from the route's own namespace. */
1457 int ip6_del_rt(struct rt6_info *rt)
1459 struct nl_info info = {
1460 .nl_net = dev_net(rt->rt6i_dev),
1462 return __ip6_del_rt(rt, &info);
/* Netlink/ioctl delete: locate the exact fib6 node, then scan its leaves
 * for a route matching ifindex, gateway and metric constraints. */
1465 static int ip6_route_del(struct fib6_config *cfg)
1467 struct fib6_table *table;
1468 struct fib6_node *fn;
1469 struct rt6_info *rt;
1472 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1476 read_lock_bh(&table->tb6_lock);
1478 fn = fib6_locate(&table->tb6_root,
1479 &cfg->fc_dst, cfg->fc_dst_len,
1480 &cfg->fc_src, cfg->fc_src_len);
1483 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1484 if (cfg->fc_ifindex &&
1485 (rt->rt6i_dev == NULL ||
1486 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1488 if (cfg->fc_flags & RTF_GATEWAY &&
1489 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1491 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Found: drop the read lock before taking the write lock in __ip6_del_rt
 * (a reference is presumably held across the gap — hold not visible). */
1494 read_unlock_bh(&table->tb6_lock);
1496 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1499 read_unlock_bh(&table->tb6_lock);
/* Flow key extended with the redirecting router's address, so the policy
 * lookup callback can verify the redirect source. */
1507 struct ip6rd_flowi {
1509 struct in6_addr gateway;
/* Find the route a redirect applies to: the current route for the
 * destination whose gateway and interface match the redirecting router
 * (RFC 2461: redirects are only valid from the current next hop). */
1512 static struct rt6_info *__ip6_route_redirect(struct net *net,
1513 struct fib6_table *table,
1517 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1518 struct rt6_info *rt;
1519 struct fib6_node *fn;
1522 * Get the "current" route for this destination and
1523 * check if the redirect has come from approriate router.
1525 * RFC 2461 specifies that redirects should only be
1526 * accepted if they come from the nexthop to the target.
1527 * Due to the way the routes are chosen, this notion
1528 * is a bit fuzzy and one might need to check all possible
1532 read_lock_bh(&table->tb6_lock);
1533 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1535 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1537 * Current route is on-link; redirect is always invalid.
1539 * Seems, previous statement is not true. It could
1540 * be node, which looks for us as on-link (f.e. proxy ndisc)
1541 * But then router serving it might decide, that we should
1542 * know truth 8)8) --ANK (980726).
1544 if (rt6_check_expired(rt))
1546 if (!(rt->rt6i_flags & RTF_GATEWAY))
1548 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1550 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1556 rt = net->ipv6.ip6_null_entry;
1557 BACKTRACK(net, &fl6->saddr);
1561 read_unlock_bh(&table->tb6_lock);
/* Wrapper: build the ip6rd_flowi and run the policy lookup with strict
 * interface matching for scoped destinations. */
1566 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1567 const struct in6_addr *src,
1568 const struct in6_addr *gateway,
1569 struct net_device *dev)
1571 int flags = RT6_LOOKUP_F_HAS_SADDR;
1572 struct net *net = dev_net(dev);
1573 struct ip6rd_flowi rdfl = {
1575 .flowi6_oif = dev->ifindex,
1581 ipv6_addr_copy(&rdfl.gateway, gateway);
1583 if (rt6_need_strict(dest))
1584 flags |= RT6_LOOKUP_F_IFACE;
1586 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1587 flags, __ip6_route_redirect);
1590 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1591 const struct in6_addr *saddr,
1592 struct neighbour *neigh, u8 *lladdr, int on_link)
1594 struct rt6_info *rt, *nrt = NULL;
1595 struct netevent_redirect netevent;
1596 struct net *net = dev_net(neigh->dev);
1598 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1600 if (rt == net->ipv6.ip6_null_entry) {
1601 if (net_ratelimit())
1602 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1603 "for redirect target\n");
1608 * We have finally decided to accept it.
1611 neigh_update(neigh, lladdr, NUD_STALE,
1612 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1613 NEIGH_UPDATE_F_OVERRIDE|
1614 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1615 NEIGH_UPDATE_F_ISROUTER))
1619 * Redirect received -> path was valid.
1620 * Look, redirects are sent only in response to data packets,
1621 * so that this nexthop apparently is reachable. --ANK
1623 dst_confirm(&rt->dst);
1625 /* Duplicate redirect: silently ignore. */
1626 if (neigh == dst_get_neighbour_raw(&rt->dst))
1629 nrt = ip6_rt_copy(rt, dest);
1633 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1635 nrt->rt6i_flags &= ~RTF_GATEWAY;
1637 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1638 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1640 if (ip6_ins_rt(nrt))
1643 netevent.old = &rt->dst;
1644 netevent.new = &nrt->dst;
1645 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1647 if (rt->rt6i_flags&RTF_CACHE) {
1653 dst_release(&rt->dst);
1657 * Handle ICMP "packet too big" messages
1658 * i.e. Path MTU discovery
1661 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1662 struct net *net, u32 pmtu, int ifindex)
1664 struct rt6_info *rt, *nrt;
1667 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1671 if (rt6_check_expired(rt)) {
1676 if (pmtu >= dst_mtu(&rt->dst))
1679 if (pmtu < IPV6_MIN_MTU) {
1681 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1682 * MTU (1280) and a fragment header should always be included
1683 * after a node receiving Too Big message reporting PMTU is
1684 * less than the IPv6 Minimum Link MTU.
1686 pmtu = IPV6_MIN_MTU;
1690 /* New mtu received -> path was valid.
1691 They are sent only in response to data packets,
1692 so that this nexthop apparently is reachable. --ANK
1694 dst_confirm(&rt->dst);
1696 /* Host route. If it is static, it would be better
1697 not to override it, but add new one, so that
1698 when cache entry will expire old pmtu
1699 would return automatically.
1701 if (rt->rt6i_flags & RTF_CACHE) {
1702 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1704 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1705 features |= RTAX_FEATURE_ALLFRAG;
1706 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1708 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1709 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1714 Two cases are possible:
1715 1. It is connected route. Action: COW
1716 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1718 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1719 nrt = rt6_alloc_cow(rt, daddr, saddr);
1721 nrt = rt6_alloc_clone(rt, daddr);
1724 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1726 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1727 features |= RTAX_FEATURE_ALLFRAG;
1728 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1731 /* According to RFC 1981, detecting PMTU increase shouldn't be
1732 * happened within 5 mins, the recommended timer is 10 mins.
1733 * Here this route expiration time is set to ip6_rt_mtu_expires
1734 * which is 10 mins. After 10 mins the decreased pmtu is expired
1735 * and detecting PMTU increase will be automatically happened.
1737 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1738 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1743 dst_release(&rt->dst);
1746 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1747 struct net_device *dev, u32 pmtu)
1749 struct net *net = dev_net(dev);
1752 * RFC 1981 states that a node "MUST reduce the size of the packets it
1753 * is sending along the path" that caused the Packet Too Big message.
1754 * Since it's not possible in the general case to determine which
1755 * interface was used to send the original packet, we update the MTU
1756 * on the interface that will be used to send future packets. We also
1757 * update the MTU on the interface that received the Packet Too Big in
1758 * case the original packet was forced out that interface with
1759 * SO_BINDTODEVICE or similar. This is the next best thing to the
1760 * correct behaviour, which would be to update the MTU on all
1763 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1764 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1768 * Misc support functions
1771 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1772 const struct in6_addr *dest)
1774 struct net *net = dev_net(ort->rt6i_dev);
1775 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1779 rt->dst.input = ort->dst.input;
1780 rt->dst.output = ort->dst.output;
1781 rt->dst.flags |= DST_HOST;
1783 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1784 rt->rt6i_dst.plen = 128;
1785 dst_copy_metrics(&rt->dst, &ort->dst);
1786 rt->dst.error = ort->dst.error;
1787 rt->rt6i_idev = ort->rt6i_idev;
1789 in6_dev_hold(rt->rt6i_idev);
1790 rt->dst.lastuse = jiffies;
1791 rt->rt6i_expires = 0;
1793 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1794 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1795 rt->rt6i_metric = 0;
1797 #ifdef CONFIG_IPV6_SUBTREES
1798 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1800 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1801 rt->rt6i_table = ort->rt6i_table;
1806 #ifdef CONFIG_IPV6_ROUTE_INFO
1807 static struct rt6_info *rt6_get_route_info(struct net *net,
1808 const struct in6_addr *prefix, int prefixlen,
1809 const struct in6_addr *gwaddr, int ifindex)
1811 struct fib6_node *fn;
1812 struct rt6_info *rt = NULL;
1813 struct fib6_table *table;
1815 table = fib6_get_table(net, RT6_TABLE_INFO);
1819 write_lock_bh(&table->tb6_lock);
1820 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1824 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1825 if (rt->rt6i_dev->ifindex != ifindex)
1827 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1829 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1835 write_unlock_bh(&table->tb6_lock);
1839 static struct rt6_info *rt6_add_route_info(struct net *net,
1840 const struct in6_addr *prefix, int prefixlen,
1841 const struct in6_addr *gwaddr, int ifindex,
1844 struct fib6_config cfg = {
1845 .fc_table = RT6_TABLE_INFO,
1846 .fc_metric = IP6_RT_PRIO_USER,
1847 .fc_ifindex = ifindex,
1848 .fc_dst_len = prefixlen,
1849 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1850 RTF_UP | RTF_PREF(pref),
1852 .fc_nlinfo.nlh = NULL,
1853 .fc_nlinfo.nl_net = net,
1856 ipv6_addr_copy(&cfg.fc_dst, prefix);
1857 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1859 /* We should treat it as a default route if prefix length is 0. */
1861 cfg.fc_flags |= RTF_DEFAULT;
1863 ip6_route_add(&cfg);
1865 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1869 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1871 struct rt6_info *rt;
1872 struct fib6_table *table;
1874 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1878 write_lock_bh(&table->tb6_lock);
1879 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1880 if (dev == rt->rt6i_dev &&
1881 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1882 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1887 write_unlock_bh(&table->tb6_lock);
1891 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1892 struct net_device *dev,
1895 struct fib6_config cfg = {
1896 .fc_table = RT6_TABLE_DFLT,
1897 .fc_metric = IP6_RT_PRIO_USER,
1898 .fc_ifindex = dev->ifindex,
1899 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1900 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1902 .fc_nlinfo.nlh = NULL,
1903 .fc_nlinfo.nl_net = dev_net(dev),
1906 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1908 ip6_route_add(&cfg);
1910 return rt6_get_dflt_router(gwaddr, dev);
1913 void rt6_purge_dflt_routers(struct net *net)
1915 struct rt6_info *rt;
1916 struct fib6_table *table;
1918 /* NOTE: Keep consistent with rt6_get_dflt_router */
1919 table = fib6_get_table(net, RT6_TABLE_DFLT);
1924 read_lock_bh(&table->tb6_lock);
1925 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1926 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1927 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1929 read_unlock_bh(&table->tb6_lock);
1934 read_unlock_bh(&table->tb6_lock);
1937 static void rtmsg_to_fib6_config(struct net *net,
1938 struct in6_rtmsg *rtmsg,
1939 struct fib6_config *cfg)
1941 memset(cfg, 0, sizeof(*cfg));
1943 cfg->fc_table = RT6_TABLE_MAIN;
1944 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1945 cfg->fc_metric = rtmsg->rtmsg_metric;
1946 cfg->fc_expires = rtmsg->rtmsg_info;
1947 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1948 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1949 cfg->fc_flags = rtmsg->rtmsg_flags;
1951 cfg->fc_nlinfo.nl_net = net;
1953 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1954 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1955 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1958 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1960 struct fib6_config cfg;
1961 struct in6_rtmsg rtmsg;
1965 case SIOCADDRT: /* Add a route */
1966 case SIOCDELRT: /* Delete a route */
1967 if (!capable(CAP_NET_ADMIN))
1969 err = copy_from_user(&rtmsg, arg,
1970 sizeof(struct in6_rtmsg));
1974 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1979 err = ip6_route_add(&cfg);
1982 err = ip6_route_del(&cfg);
1996 * Drop the packet on the floor
1999 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2002 struct dst_entry *dst = skb_dst(skb);
2003 switch (ipstats_mib_noroutes) {
2004 case IPSTATS_MIB_INNOROUTES:
2005 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2006 if (type == IPV6_ADDR_ANY) {
2007 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2008 IPSTATS_MIB_INADDRERRORS);
2012 case IPSTATS_MIB_OUTNOROUTES:
2013 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2014 ipstats_mib_noroutes);
2017 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2022 static int ip6_pkt_discard(struct sk_buff *skb)
2024 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2027 static int ip6_pkt_discard_out(struct sk_buff *skb)
2029 skb->dev = skb_dst(skb)->dev;
2030 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2033 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2035 static int ip6_pkt_prohibit(struct sk_buff *skb)
2037 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2040 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2042 skb->dev = skb_dst(skb)->dev;
2043 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2049 * Allocate a dst for local (unicast / anycast) address.
2052 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2053 const struct in6_addr *addr,
2056 struct net *net = dev_net(idev->dev);
2057 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2058 net->loopback_dev, 0);
2059 struct neighbour *neigh;
2062 if (net_ratelimit())
2063 pr_warning("IPv6: Maximum number of routes reached,"
2064 " consider increasing route/max_size.\n");
2065 return ERR_PTR(-ENOMEM);
2070 rt->dst.flags |= DST_HOST;
2071 rt->dst.input = ip6_input;
2072 rt->dst.output = ip6_output;
2073 rt->rt6i_idev = idev;
2074 rt->dst.obsolete = -1;
2076 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2078 rt->rt6i_flags |= RTF_ANYCAST;
2080 rt->rt6i_flags |= RTF_LOCAL;
2081 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2082 if (IS_ERR(neigh)) {
2085 return ERR_CAST(neigh);
2087 dst_set_neighbour(&rt->dst, neigh);
2089 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2090 rt->rt6i_dst.plen = 128;
2091 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2093 atomic_set(&rt->dst.__refcnt, 1);
2098 int ip6_route_get_saddr(struct net *net,
2099 struct rt6_info *rt,
2100 const struct in6_addr *daddr,
2102 struct in6_addr *saddr)
2104 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2106 if (rt->rt6i_prefsrc.plen)
2107 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2109 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2110 daddr, prefs, saddr);
2114 /* remove deleted ip from prefsrc entries */
/* Argument bundle for fib6_remove_prefsrc() walks. */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};
2121 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2123 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2124 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2125 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2127 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2128 rt != net->ipv6.ip6_null_entry &&
2129 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2130 /* remove prefsrc entry */
2131 rt->rt6i_prefsrc.plen = 0;
2136 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2138 struct net *net = dev_net(ifp->idev->dev);
2139 struct arg_dev_net_ip adni = {
2140 .dev = ifp->idev->dev,
2144 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Argument bundle for fib6_ifdown() walks. */
struct arg_dev_net {
	struct net_device *dev;
	struct net *net;
};
2152 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2154 const struct arg_dev_net *adn = arg;
2155 const struct net_device *dev = adn->dev;
2157 if ((rt->rt6i_dev == dev || dev == NULL) &&
2158 rt != adn->net->ipv6.ip6_null_entry) {
2159 RT6_TRACE("deleted by ifdown %p\n", rt);
2165 void rt6_ifdown(struct net *net, struct net_device *dev)
2167 struct arg_dev_net adn = {
2172 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2173 icmp6_clean_all(fib6_ifdown, &adn);
/* Argument bundle for rt6_mtu_change_route() walks. */
struct rt6_mtu_change_arg
{
	struct net_device *dev;
	unsigned mtu;
};
2182 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2184 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2185 struct inet6_dev *idev;
2187 /* In IPv6 pmtu discovery is not optional,
2188 so that RTAX_MTU lock cannot disable it.
2189 We still use this lock to block changes
2190 caused by addrconf/ndisc.
2193 idev = __in6_dev_get(arg->dev);
2197 /* For administrative MTU increase, there is no way to discover
2198 IPv6 PMTU increase, so PMTU increase should be updated here.
2199 Since RFC 1981 doesn't include administrative MTU increase
2200 update PMTU increase is a MUST. (i.e. jumbo frame)
2203 If new MTU is less than route PMTU, this new MTU will be the
2204 lowest MTU in the path, update the route PMTU to reflect PMTU
2205 decreases; if new MTU is greater than route PMTU, and the
2206 old MTU is the lowest MTU in the path, update the route PMTU
2207 to reflect the increase. In this case if the other nodes' MTU
2208 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2211 if (rt->rt6i_dev == arg->dev &&
2212 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2213 (dst_mtu(&rt->dst) >= arg->mtu ||
2214 (dst_mtu(&rt->dst) < arg->mtu &&
2215 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2216 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2221 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2223 struct rt6_mtu_change_arg arg = {
2228 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2231 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2232 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2233 [RTA_OIF] = { .type = NLA_U32 },
2234 [RTA_IIF] = { .type = NLA_U32 },
2235 [RTA_PRIORITY] = { .type = NLA_U32 },
2236 [RTA_METRICS] = { .type = NLA_NESTED },
2239 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2240 struct fib6_config *cfg)
2243 struct nlattr *tb[RTA_MAX+1];
2246 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2251 rtm = nlmsg_data(nlh);
2252 memset(cfg, 0, sizeof(*cfg));
2254 cfg->fc_table = rtm->rtm_table;
2255 cfg->fc_dst_len = rtm->rtm_dst_len;
2256 cfg->fc_src_len = rtm->rtm_src_len;
2257 cfg->fc_flags = RTF_UP;
2258 cfg->fc_protocol = rtm->rtm_protocol;
2260 if (rtm->rtm_type == RTN_UNREACHABLE)
2261 cfg->fc_flags |= RTF_REJECT;
2263 if (rtm->rtm_type == RTN_LOCAL)
2264 cfg->fc_flags |= RTF_LOCAL;
2266 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2267 cfg->fc_nlinfo.nlh = nlh;
2268 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2270 if (tb[RTA_GATEWAY]) {
2271 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2272 cfg->fc_flags |= RTF_GATEWAY;
2276 int plen = (rtm->rtm_dst_len + 7) >> 3;
2278 if (nla_len(tb[RTA_DST]) < plen)
2281 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2285 int plen = (rtm->rtm_src_len + 7) >> 3;
2287 if (nla_len(tb[RTA_SRC]) < plen)
2290 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2293 if (tb[RTA_PREFSRC])
2294 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2297 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2299 if (tb[RTA_PRIORITY])
2300 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2302 if (tb[RTA_METRICS]) {
2303 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2304 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2308 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2315 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2317 struct fib6_config cfg;
2320 err = rtm_to_fib6_config(skb, nlh, &cfg);
2324 return ip6_route_del(&cfg);
2327 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2329 struct fib6_config cfg;
2332 err = rtm_to_fib6_config(skb, nlh, &cfg);
2336 return ip6_route_add(&cfg);
2339 static inline size_t rt6_nlmsg_size(void)
2341 return NLMSG_ALIGN(sizeof(struct rtmsg))
2342 + nla_total_size(16) /* RTA_SRC */
2343 + nla_total_size(16) /* RTA_DST */
2344 + nla_total_size(16) /* RTA_GATEWAY */
2345 + nla_total_size(16) /* RTA_PREFSRC */
2346 + nla_total_size(4) /* RTA_TABLE */
2347 + nla_total_size(4) /* RTA_IIF */
2348 + nla_total_size(4) /* RTA_OIF */
2349 + nla_total_size(4) /* RTA_PRIORITY */
2350 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2351 + nla_total_size(sizeof(struct rta_cacheinfo));
2354 static int rt6_fill_node(struct net *net,
2355 struct sk_buff *skb, struct rt6_info *rt,
2356 struct in6_addr *dst, struct in6_addr *src,
2357 int iif, int type, u32 pid, u32 seq,
2358 int prefix, int nowait, unsigned int flags)
2361 struct nlmsghdr *nlh;
2364 struct neighbour *n;
2366 if (prefix) { /* user wants prefix routes only */
2367 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2368 /* success since this is not a prefix route */
2373 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2377 rtm = nlmsg_data(nlh);
2378 rtm->rtm_family = AF_INET6;
2379 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2380 rtm->rtm_src_len = rt->rt6i_src.plen;
2383 table = rt->rt6i_table->tb6_id;
2385 table = RT6_TABLE_UNSPEC;
2386 rtm->rtm_table = table;
2387 NLA_PUT_U32(skb, RTA_TABLE, table);
2388 if (rt->rt6i_flags&RTF_REJECT)
2389 rtm->rtm_type = RTN_UNREACHABLE;
2390 else if (rt->rt6i_flags&RTF_LOCAL)
2391 rtm->rtm_type = RTN_LOCAL;
2392 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2393 rtm->rtm_type = RTN_LOCAL;
2395 rtm->rtm_type = RTN_UNICAST;
2397 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2398 rtm->rtm_protocol = rt->rt6i_protocol;
2399 if (rt->rt6i_flags&RTF_DYNAMIC)
2400 rtm->rtm_protocol = RTPROT_REDIRECT;
2401 else if (rt->rt6i_flags & RTF_ADDRCONF)
2402 rtm->rtm_protocol = RTPROT_KERNEL;
2403 else if (rt->rt6i_flags&RTF_DEFAULT)
2404 rtm->rtm_protocol = RTPROT_RA;
2406 if (rt->rt6i_flags&RTF_CACHE)
2407 rtm->rtm_flags |= RTM_F_CLONED;
2410 NLA_PUT(skb, RTA_DST, 16, dst);
2411 rtm->rtm_dst_len = 128;
2412 } else if (rtm->rtm_dst_len)
2413 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2414 #ifdef CONFIG_IPV6_SUBTREES
2416 NLA_PUT(skb, RTA_SRC, 16, src);
2417 rtm->rtm_src_len = 128;
2418 } else if (rtm->rtm_src_len)
2419 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2422 #ifdef CONFIG_IPV6_MROUTE
2423 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2424 int err = ip6mr_get_route(net, skb, rtm, nowait);
2429 goto nla_put_failure;
2431 if (err == -EMSGSIZE)
2432 goto nla_put_failure;
2437 NLA_PUT_U32(skb, RTA_IIF, iif);
2439 struct in6_addr saddr_buf;
2440 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2441 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2444 if (rt->rt6i_prefsrc.plen) {
2445 struct in6_addr saddr_buf;
2446 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2447 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2450 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2451 goto nla_put_failure;
2454 n = dst_get_neighbour(&rt->dst);
2456 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2458 goto nla_put_failure;
2464 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2466 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2468 if (!(rt->rt6i_flags & RTF_EXPIRES))
2470 else if (rt->rt6i_expires - jiffies < INT_MAX)
2471 expires = rt->rt6i_expires - jiffies;
2475 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2476 expires, rt->dst.error) < 0)
2477 goto nla_put_failure;
2479 return nlmsg_end(skb, nlh);
2482 nlmsg_cancel(skb, nlh);
2486 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2488 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2491 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2492 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2493 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2497 return rt6_fill_node(arg->net,
2498 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2499 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2500 prefix, 0, NLM_F_MULTI);
2503 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2505 struct net *net = sock_net(in_skb->sk);
2506 struct nlattr *tb[RTA_MAX+1];
2507 struct rt6_info *rt;
2508 struct sk_buff *skb;
2513 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2518 memset(&fl6, 0, sizeof(fl6));
2521 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2524 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2528 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2531 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2535 iif = nla_get_u32(tb[RTA_IIF]);
2538 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2541 struct net_device *dev;
2542 dev = __dev_get_by_index(net, iif);
2549 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2555 /* Reserve room for dummy headers, this skb can pass
2556 through good chunk of routing engine.
2558 skb_reset_mac_header(skb);
2559 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2561 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2562 skb_dst_set(skb, &rt->dst);
2564 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2565 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2566 nlh->nlmsg_seq, 0, 0, 0);
2572 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2577 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2579 struct sk_buff *skb;
2580 struct net *net = info->nl_net;
2585 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2587 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2591 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2592 event, info->pid, seq, 0, 0, 0);
2594 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2595 WARN_ON(err == -EMSGSIZE);
2599 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2600 info->nlh, gfp_any());
2604 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2607 static int ip6_route_dev_notify(struct notifier_block *this,
2608 unsigned long event, void *data)
2610 struct net_device *dev = (struct net_device *)data;
2611 struct net *net = dev_net(dev);
2613 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2614 net->ipv6.ip6_null_entry->dst.dev = dev;
2615 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2616 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2617 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2618 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2619 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2620 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2631 #ifdef CONFIG_PROC_FS
2642 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2644 struct seq_file *m = p_arg;
2645 struct neighbour *n;
2647 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2649 #ifdef CONFIG_IPV6_SUBTREES
2650 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2652 seq_puts(m, "00000000000000000000000000000000 00 ");
2655 n = dst_get_neighbour(&rt->dst);
2657 seq_printf(m, "%pi6", n->primary_key);
2659 seq_puts(m, "00000000000000000000000000000000");
2662 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2663 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2664 rt->dst.__use, rt->rt6i_flags,
2665 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2669 static int ipv6_route_show(struct seq_file *m, void *v)
2671 struct net *net = (struct net *)m->private;
2672 fib6_clean_all(net, rt6_info_route, 0, m);
2676 static int ipv6_route_open(struct inode *inode, struct file *file)
2678 return single_open_net(inode, file, ipv6_route_show);
2681 static const struct file_operations ipv6_route_proc_fops = {
2682 .owner = THIS_MODULE,
2683 .open = ipv6_route_open,
2685 .llseek = seq_lseek,
2686 .release = single_release_net,
2689 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2691 struct net *net = (struct net *)seq->private;
2692 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2693 net->ipv6.rt6_stats->fib_nodes,
2694 net->ipv6.rt6_stats->fib_route_nodes,
2695 net->ipv6.rt6_stats->fib_rt_alloc,
2696 net->ipv6.rt6_stats->fib_rt_entries,
2697 net->ipv6.rt6_stats->fib_rt_cache,
2698 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2699 net->ipv6.rt6_stats->fib_discarded_routes);
2704 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2706 return single_open_net(inode, file, rt6_stats_seq_show);
2709 static const struct file_operations rt6_stats_seq_fops = {
2710 .owner = THIS_MODULE,
2711 .open = rt6_stats_seq_open,
2713 .llseek = seq_lseek,
2714 .release = single_release_net,
2716 #endif /* CONFIG_PROC_FS */
2718 #ifdef CONFIG_SYSCTL
2721 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2722 void __user *buffer, size_t *lenp, loff_t *ppos)
2729 net = (struct net *)ctl->extra1;
2730 delay = net->ipv6.sysctl.flush_delay;
2731 proc_dointvec(ctl, write, buffer, lenp, ppos);
2732 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2736 ctl_table ipv6_route_table_template[] = {
2738 .procname = "flush",
2739 .data = &init_net.ipv6.sysctl.flush_delay,
2740 .maxlen = sizeof(int),
2742 .proc_handler = ipv6_sysctl_rtcache_flush
2745 .procname = "gc_thresh",
2746 .data = &ip6_dst_ops_template.gc_thresh,
2747 .maxlen = sizeof(int),
2749 .proc_handler = proc_dointvec,
2752 .procname = "max_size",
2753 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2754 .maxlen = sizeof(int),
2756 .proc_handler = proc_dointvec,
2759 .procname = "gc_min_interval",
2760 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2761 .maxlen = sizeof(int),
2763 .proc_handler = proc_dointvec_jiffies,
2766 .procname = "gc_timeout",
2767 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2768 .maxlen = sizeof(int),
2770 .proc_handler = proc_dointvec_jiffies,
2773 .procname = "gc_interval",
2774 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2775 .maxlen = sizeof(int),
2777 .proc_handler = proc_dointvec_jiffies,
2780 .procname = "gc_elasticity",
2781 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2782 .maxlen = sizeof(int),
2784 .proc_handler = proc_dointvec,
2787 .procname = "mtu_expires",
2788 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2789 .maxlen = sizeof(int),
2791 .proc_handler = proc_dointvec_jiffies,
2794 .procname = "min_adv_mss",
2795 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2796 .maxlen = sizeof(int),
2798 .proc_handler = proc_dointvec,
2801 .procname = "gc_min_interval_ms",
2802 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2803 .maxlen = sizeof(int),
2805 .proc_handler = proc_dointvec_ms_jiffies,
2810 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2812 struct ctl_table *table;
2814 table = kmemdup(ipv6_route_table_template,
2815 sizeof(ipv6_route_table_template),
2819 table[0].data = &net->ipv6.sysctl.flush_delay;
2820 table[0].extra1 = net;
2821 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2822 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2823 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2824 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2825 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2826 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2827 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2828 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2829 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2836 static int __net_init ip6_route_net_init(struct net *net)
2840 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2841 sizeof(net->ipv6.ip6_dst_ops));
2843 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2844 goto out_ip6_dst_ops;
2846 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2847 sizeof(*net->ipv6.ip6_null_entry),
2849 if (!net->ipv6.ip6_null_entry)
2850 goto out_ip6_dst_entries;
2851 net->ipv6.ip6_null_entry->dst.path =
2852 (struct dst_entry *)net->ipv6.ip6_null_entry;
2853 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2854 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2855 ip6_template_metrics, true);
2857 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2858 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2859 sizeof(*net->ipv6.ip6_prohibit_entry),
2861 if (!net->ipv6.ip6_prohibit_entry)
2862 goto out_ip6_null_entry;
2863 net->ipv6.ip6_prohibit_entry->dst.path =
2864 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2865 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2866 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2867 ip6_template_metrics, true);
2869 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2870 sizeof(*net->ipv6.ip6_blk_hole_entry),
2872 if (!net->ipv6.ip6_blk_hole_entry)
2873 goto out_ip6_prohibit_entry;
2874 net->ipv6.ip6_blk_hole_entry->dst.path =
2875 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2876 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2877 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2878 ip6_template_metrics, true);
2881 net->ipv6.sysctl.flush_delay = 0;
2882 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2883 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2884 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2885 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2886 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2887 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2888 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2890 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2896 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2897 out_ip6_prohibit_entry:
2898 kfree(net->ipv6.ip6_prohibit_entry);
2900 kfree(net->ipv6.ip6_null_entry);
2902 out_ip6_dst_entries:
2903 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2908 static void __net_exit ip6_route_net_exit(struct net *net)
2910 kfree(net->ipv6.ip6_null_entry);
2911 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2912 kfree(net->ipv6.ip6_prohibit_entry);
2913 kfree(net->ipv6.ip6_blk_hole_entry);
2915 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2918 static int __net_init ip6_route_net_init_late(struct net *net)
2920 #ifdef CONFIG_PROC_FS
2921 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2922 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2927 static void __net_exit ip6_route_net_exit_late(struct net *net)
2929 #ifdef CONFIG_PROC_FS
2930 proc_net_remove(net, "ipv6_route");
2931 proc_net_remove(net, "rt6_stats");
2935 static struct pernet_operations ip6_route_net_ops = {
2936 .init = ip6_route_net_init,
2937 .exit = ip6_route_net_exit,
2940 static struct pernet_operations ip6_route_net_late_ops = {
2941 .init = ip6_route_net_init_late,
2942 .exit = ip6_route_net_exit_late,
2945 static struct notifier_block ip6_route_dev_notifier = {
2946 .notifier_call = ip6_route_dev_notify,
2950 int __init ip6_route_init(void)
2955 ip6_dst_ops_template.kmem_cachep =
2956 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2957 SLAB_HWCACHE_ALIGN, NULL);
2958 if (!ip6_dst_ops_template.kmem_cachep)
2961 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2963 goto out_kmem_cache;
2965 ret = register_pernet_subsys(&ip6_route_net_ops);
2967 goto out_dst_entries;
2969 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2971 /* Registering of the loopback is done before this portion of code,
2972 * the loopback reference in rt6_info will not be taken, do it
2973 * manually for init_net */
2974 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2975 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2977 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2978 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2979 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2980 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2984 goto out_register_subsys;
2990 ret = fib6_rules_init();
2994 ret = register_pernet_subsys(&ip6_route_net_late_ops);
2996 goto fib6_rules_init;
2999 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3000 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3001 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3002 goto out_register_late_subsys;
3004 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3006 goto out_register_late_subsys;
3011 out_register_late_subsys:
3012 unregister_pernet_subsys(&ip6_route_net_late_ops);
3014 fib6_rules_cleanup();
3019 out_register_subsys:
3020 unregister_pernet_subsys(&ip6_route_net_ops);
3022 dst_entries_destroy(&ip6_dst_blackhole_ops);
3024 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3028 void ip6_route_cleanup(void)
3030 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3031 unregister_pernet_subsys(&ip6_route_net_late_ops);
3032 fib6_rules_cleanup();
3035 unregister_pernet_subsys(&ip6_route_net_ops);
3036 dst_entries_destroy(&ip6_dst_blackhole_ops);
3037 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);