2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
65 /* Set to 3 to get tracing. */
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
73 #define RT6_TRACE(x...) do { ; } while (0)
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77 const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void ip6_dst_destroy(struct dst_entry *);
83 static void ip6_dst_ifdown(struct dst_entry *,
84 struct net_device *dev, int how);
85 static int ip6_dst_gc(struct dst_ops *ops);
87 static int ip6_pkt_discard(struct sk_buff *skb);
88 static int ip6_pkt_discard_out(struct sk_buff *skb);
89 static void ip6_link_failure(struct sk_buff *skb);
90 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94 const struct in6_addr *prefix, int prefixlen,
95 const struct in6_addr *gwaddr, int ifindex,
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98 const struct in6_addr *prefix, int prefixlen,
99 const struct in6_addr *gwaddr, int ifindex);
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
104 struct rt6_info *rt = (struct rt6_info *) dst;
105 struct inet_peer *peer;
108 if (!(rt->dst.flags & DST_HOST))
112 rt6_bind_peer(rt, 1);
114 peer = rt->rt6i_peer;
116 u32 *old_p = __DST_METRICS_PTR(old);
117 unsigned long prev, new;
120 if (inet_metrics_new(peer))
121 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
123 new = (unsigned long) p;
124 prev = cmpxchg(&dst->_metrics, old, new);
127 p = __DST_METRICS_PTR(prev);
128 if (prev & DST_METRICS_READ_ONLY)
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
137 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
140 static struct dst_ops ip6_dst_ops_template = {
142 .protocol = cpu_to_be16(ETH_P_IPV6),
145 .check = ip6_dst_check,
146 .default_advmss = ip6_default_advmss,
148 .cow_metrics = ipv6_cow_metrics,
149 .destroy = ip6_dst_destroy,
150 .ifdown = ip6_dst_ifdown,
151 .negative_advice = ip6_negative_advice,
152 .link_failure = ip6_link_failure,
153 .update_pmtu = ip6_rt_update_pmtu,
154 .local_out = __ip6_local_out,
155 .neigh_lookup = ip6_neigh_lookup,
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
160 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
162 return mtu ? : dst->dev->mtu;
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
175 static struct dst_ops ip6_dst_blackhole_ops = {
177 .protocol = cpu_to_be16(ETH_P_IPV6),
178 .destroy = ip6_dst_destroy,
179 .check = ip6_dst_check,
180 .mtu = ip6_blackhole_mtu,
181 .default_advmss = ip6_default_advmss,
182 .update_pmtu = ip6_rt_blackhole_update_pmtu,
183 .cow_metrics = ip6_rt_blackhole_cow_metrics,
184 .neigh_lookup = ip6_neigh_lookup,
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188 [RTAX_HOPLIMIT - 1] = 255,
191 static struct rt6_info ip6_null_entry_template = {
193 .__refcnt = ATOMIC_INIT(1),
196 .error = -ENETUNREACH,
197 .input = ip6_pkt_discard,
198 .output = ip6_pkt_discard_out,
200 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
201 .rt6i_protocol = RTPROT_KERNEL,
202 .rt6i_metric = ~(u32) 0,
203 .rt6i_ref = ATOMIC_INIT(1),
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
208 static int ip6_pkt_prohibit(struct sk_buff *skb);
209 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
211 static struct rt6_info ip6_prohibit_entry_template = {
213 .__refcnt = ATOMIC_INIT(1),
217 .input = ip6_pkt_prohibit,
218 .output = ip6_pkt_prohibit_out,
220 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
221 .rt6i_protocol = RTPROT_KERNEL,
222 .rt6i_metric = ~(u32) 0,
223 .rt6i_ref = ATOMIC_INIT(1),
226 static struct rt6_info ip6_blk_hole_entry_template = {
228 .__refcnt = ATOMIC_INIT(1),
232 .input = dst_discard,
233 .output = dst_discard,
235 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
236 .rt6i_protocol = RTPROT_KERNEL,
237 .rt6i_metric = ~(u32) 0,
238 .rt6i_ref = ATOMIC_INIT(1),
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245 struct net_device *dev,
248 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
251 memset(&rt->rt6i_table, 0,
252 sizeof(*rt) - sizeof(struct dst_entry));
257 static void ip6_dst_destroy(struct dst_entry *dst)
259 struct rt6_info *rt = (struct rt6_info *)dst;
260 struct inet6_dev *idev = rt->rt6i_idev;
261 struct inet_peer *peer = rt->rt6i_peer;
263 if (!(rt->dst.flags & DST_HOST))
264 dst_destroy_metrics_generic(dst);
267 rt->rt6i_idev = NULL;
271 rt->rt6i_peer = NULL;
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
278 static u32 rt6_peer_genid(void)
280 return atomic_read(&__rt6_peer_genid);
283 void rt6_bind_peer(struct rt6_info *rt, int create)
285 struct inet_peer *peer;
287 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
291 rt->rt6i_peer_genid = rt6_peer_genid();
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
297 struct rt6_info *rt = (struct rt6_info *)dst;
298 struct inet6_dev *idev = rt->rt6i_idev;
299 struct net_device *loopback_dev =
300 dev_net(dev)->loopback_dev;
302 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303 struct inet6_dev *loopback_idev =
304 in6_dev_get(loopback_dev);
305 if (loopback_idev != NULL) {
306 rt->rt6i_idev = loopback_idev;
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
314 return (rt->rt6i_flags & RTF_EXPIRES) &&
315 time_after(jiffies, rt->rt6i_expires);
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
320 return ipv6_addr_type(daddr) &
321 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
325 * Route lookup. Any table->tb6_lock is implied.
328 static inline struct rt6_info *rt6_device_match(struct net *net,
330 const struct in6_addr *saddr,
334 struct rt6_info *local = NULL;
335 struct rt6_info *sprt;
337 if (!oif && ipv6_addr_any(saddr))
340 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 struct net_device *dev = sprt->rt6i_dev;
344 if (dev->ifindex == oif)
346 if (dev->flags & IFF_LOOPBACK) {
347 if (sprt->rt6i_idev == NULL ||
348 sprt->rt6i_idev->dev->ifindex != oif) {
349 if (flags & RT6_LOOKUP_F_IFACE && oif)
351 if (local && (!oif ||
352 local->rt6i_idev->dev->ifindex == oif))
358 if (ipv6_chk_addr(net, saddr, dev,
359 flags & RT6_LOOKUP_F_IFACE))
368 if (flags & RT6_LOOKUP_F_IFACE)
369 return net->ipv6.ip6_null_entry;
375 #ifdef CONFIG_IPV6_ROUTER_PREF
376 static void rt6_probe(struct rt6_info *rt)
378 struct neighbour *neigh;
380 * Okay, this does not seem to be appropriate
381 * for now, however, we need to check if it
382 * is really so; aka Router Reachability Probing.
384 * Router Reachability Probe MUST be rate-limited
385 * to no more than one per minute.
388 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
389 if (!neigh || (neigh->nud_state & NUD_VALID))
391 read_lock_bh(&neigh->lock);
392 if (!(neigh->nud_state & NUD_VALID) &&
393 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 struct in6_addr mcaddr;
395 struct in6_addr *target;
397 neigh->updated = jiffies;
398 read_unlock_bh(&neigh->lock);
400 target = (struct in6_addr *)&neigh->primary_key;
401 addrconf_addr_solict_mult(target, &mcaddr);
402 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
404 read_unlock_bh(&neigh->lock);
410 static inline void rt6_probe(struct rt6_info *rt)
416 * Default Router Selection (RFC 2461 6.3.6)
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
435 neigh = dst_get_neighbour(&rt->dst);
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
449 read_unlock_bh(&neigh->lock);
456 static int rt6_score_route(struct rt6_info *rt, int oif,
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
478 if (rt6_check_expired(rt))
481 m = rt6_score_route(rt, oif, strict);
486 if (strict & RT6_LOOKUP_F_REACHABLE)
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
542 RT6_TRACE("%s() => %p\n",
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
549 #ifdef CONFIG_IPV6_ROUTE_INFO
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
557 unsigned long lifetime;
560 if (len < sizeof(struct route_info)) {
564 /* Sanity check for prefix_len and length */
565 if (rinfo->length > 3) {
567 } else if (rinfo->prefix_len > 128) {
569 } else if (rinfo->prefix_len > 64) {
570 if (rinfo->length < 2) {
573 } else if (rinfo->prefix_len > 0) {
574 if (rinfo->length < 1) {
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
592 prefix = &prefix_buf;
595 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
598 if (rt && !lifetime) {
604 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
607 rt->rt6i_flags = RTF_ROUTEINFO |
608 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
611 if (!addrconf_finite_timeout(lifetime)) {
612 rt->rt6i_flags &= ~RTF_EXPIRES;
614 rt->rt6i_expires = jiffies + HZ * lifetime;
615 rt->rt6i_flags |= RTF_EXPIRES;
617 dst_release(&rt->dst);
623 #define BACKTRACK(__net, saddr) \
625 if (rt == __net->ipv6.ip6_null_entry) { \
626 struct fib6_node *pn; \
628 if (fn->fn_flags & RTN_TL_ROOT) \
631 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
632 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
635 if (fn->fn_flags & RTN_RTINFO) \
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642 struct fib6_table *table,
643 struct flowi6 *fl6, int flags)
645 struct fib6_node *fn;
648 read_lock_bh(&table->tb6_lock);
649 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
652 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653 BACKTRACK(net, &fl6->saddr);
655 dst_use(&rt->dst, jiffies);
656 read_unlock_bh(&table->tb6_lock);
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662 const struct in6_addr *saddr, int oif, int strict)
664 struct flowi6 fl6 = {
668 struct dst_entry *dst;
669 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
672 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673 flags |= RT6_LOOKUP_F_HAS_SADDR;
676 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
678 return (struct rt6_info *) dst;
685 EXPORT_SYMBOL(rt6_lookup);
687 /* ip6_ins_rt is called with FREE table->tb6_lock.
688 It takes new route entry, the addition fails by any reason the
689 route is freed. In any case, if caller does not hold it, it may
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
696 struct fib6_table *table;
698 table = rt->rt6i_table;
699 write_lock_bh(&table->tb6_lock);
700 err = fib6_add(&table->tb6_root, rt, info);
701 write_unlock_bh(&table->tb6_lock);
706 int ip6_ins_rt(struct rt6_info *rt)
708 struct nl_info info = {
709 .nl_net = dev_net(rt->rt6i_dev),
711 return __ip6_ins_rt(rt, &info);
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715 const struct in6_addr *daddr,
716 const struct in6_addr *saddr)
724 rt = ip6_rt_copy(ort, daddr);
727 struct neighbour *neigh;
728 int attempts = !in_softirq();
730 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
731 if (ort->rt6i_dst.plen != 128 &&
732 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733 rt->rt6i_flags |= RTF_ANYCAST;
734 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
737 rt->rt6i_flags |= RTF_CACHE;
739 #ifdef CONFIG_IPV6_SUBTREES
740 if (rt->rt6i_src.plen && saddr) {
741 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
742 rt->rt6i_src.plen = 128;
747 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
749 struct net *net = dev_net(rt->rt6i_dev);
750 int saved_rt_min_interval =
751 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752 int saved_rt_elasticity =
753 net->ipv6.sysctl.ip6_rt_gc_elasticity;
755 if (attempts-- > 0) {
756 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
759 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
761 net->ipv6.sysctl.ip6_rt_gc_elasticity =
763 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764 saved_rt_min_interval;
770 "ipv6: Neighbour table overflow.\n");
774 dst_set_neighbour(&rt->dst, neigh);
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782 const struct in6_addr *daddr)
784 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
787 rt->rt6i_flags |= RTF_CACHE;
788 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794 struct flowi6 *fl6, int flags)
796 struct fib6_node *fn;
797 struct rt6_info *rt, *nrt;
801 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
803 strict |= flags & RT6_LOOKUP_F_IFACE;
806 read_lock_bh(&table->tb6_lock);
809 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
812 rt = rt6_select(fn, oif, strict | reachable);
814 BACKTRACK(net, &fl6->saddr);
815 if (rt == net->ipv6.ip6_null_entry ||
816 rt->rt6i_flags & RTF_CACHE)
820 read_unlock_bh(&table->tb6_lock);
822 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
823 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
824 else if (!(rt->dst.flags & DST_HOST))
825 nrt = rt6_alloc_clone(rt, &fl6->daddr);
829 dst_release(&rt->dst);
830 rt = nrt ? : net->ipv6.ip6_null_entry;
834 err = ip6_ins_rt(nrt);
843 * Race condition! In the gap, when table->tb6_lock was
844 * released someone could insert this route. Relookup.
846 dst_release(&rt->dst);
855 read_unlock_bh(&table->tb6_lock);
857 rt->dst.lastuse = jiffies;
863 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
864 struct flowi6 *fl6, int flags)
866 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
869 void ip6_route_input(struct sk_buff *skb)
871 const struct ipv6hdr *iph = ipv6_hdr(skb);
872 struct net *net = dev_net(skb->dev);
873 int flags = RT6_LOOKUP_F_HAS_SADDR;
874 struct flowi6 fl6 = {
875 .flowi6_iif = skb->dev->ifindex,
878 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
879 .flowi6_mark = skb->mark,
880 .flowi6_proto = iph->nexthdr,
883 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
884 flags |= RT6_LOOKUP_F_IFACE;
886 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
889 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
890 struct flowi6 *fl6, int flags)
892 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
895 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
900 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
901 flags |= RT6_LOOKUP_F_IFACE;
903 if (!ipv6_addr_any(&fl6->saddr))
904 flags |= RT6_LOOKUP_F_HAS_SADDR;
906 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
908 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
911 EXPORT_SYMBOL(ip6_route_output);
913 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
915 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
916 struct dst_entry *new = NULL;
918 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
920 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
925 new->input = dst_discard;
926 new->output = dst_discard;
928 if (dst_metrics_read_only(&ort->dst))
929 new->_metrics = ort->dst._metrics;
931 dst_copy_metrics(new, &ort->dst);
932 rt->rt6i_idev = ort->rt6i_idev;
934 in6_dev_hold(rt->rt6i_idev);
935 rt->rt6i_expires = 0;
937 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
938 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
941 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
942 #ifdef CONFIG_IPV6_SUBTREES
943 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
949 dst_release(dst_orig);
950 return new ? new : ERR_PTR(-ENOMEM);
954 * Destination cache support functions
957 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
961 rt = (struct rt6_info *) dst;
963 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
964 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
966 rt6_bind_peer(rt, 0);
967 rt->rt6i_peer_genid = rt6_peer_genid();
974 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
976 struct rt6_info *rt = (struct rt6_info *) dst;
979 if (rt->rt6i_flags & RTF_CACHE) {
980 if (rt6_check_expired(rt)) {
992 static void ip6_link_failure(struct sk_buff *skb)
996 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
998 rt = (struct rt6_info *) skb_dst(skb);
1000 if (rt->rt6i_flags&RTF_CACHE) {
1001 dst_set_expires(&rt->dst, 0);
1002 rt->rt6i_flags |= RTF_EXPIRES;
1003 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1004 rt->rt6i_node->fn_sernum = -1;
1008 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1010 struct rt6_info *rt6 = (struct rt6_info*)dst;
1012 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1013 rt6->rt6i_flags |= RTF_MODIFIED;
1014 if (mtu < IPV6_MIN_MTU) {
1015 u32 features = dst_metric(dst, RTAX_FEATURES);
1017 features |= RTAX_FEATURE_ALLFRAG;
1018 dst_metric_set(dst, RTAX_FEATURES, features);
1020 dst_metric_set(dst, RTAX_MTU, mtu);
1024 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1026 struct net_device *dev = dst->dev;
1027 unsigned int mtu = dst_mtu(dst);
1028 struct net *net = dev_net(dev);
1030 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1032 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1033 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1036 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1037 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1038 * IPV6_MAXPLEN is also valid and means: "any MSS,
1039 * rely only on pmtu discovery"
1041 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1046 static unsigned int ip6_mtu(const struct dst_entry *dst)
1048 struct inet6_dev *idev;
1049 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1057 idev = __in6_dev_get(dst->dev);
1059 mtu = idev->cnf.mtu6;
1065 static struct dst_entry *icmp6_dst_gc_list;
1066 static DEFINE_SPINLOCK(icmp6_dst_lock);
1068 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1069 struct neighbour *neigh,
1070 const struct in6_addr *addr)
1072 struct rt6_info *rt;
1073 struct inet6_dev *idev = in6_dev_get(dev);
1074 struct net *net = dev_net(dev);
1076 if (unlikely(idev == NULL))
1079 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1080 if (unlikely(rt == NULL)) {
1088 neigh = ndisc_get_neigh(dev, addr);
1093 rt->dst.flags |= DST_HOST;
1094 rt->dst.output = ip6_output;
1095 dst_set_neighbour(&rt->dst, neigh);
1096 atomic_set(&rt->dst.__refcnt, 1);
1097 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1098 rt->rt6i_dst.plen = 128;
1099 rt->rt6i_idev = idev;
1100 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1102 spin_lock_bh(&icmp6_dst_lock);
1103 rt->dst.next = icmp6_dst_gc_list;
1104 icmp6_dst_gc_list = &rt->dst;
1105 spin_unlock_bh(&icmp6_dst_lock);
1107 fib6_force_start_gc(net);
1113 int icmp6_dst_gc(void)
1115 struct dst_entry *dst, **pprev;
1118 spin_lock_bh(&icmp6_dst_lock);
1119 pprev = &icmp6_dst_gc_list;
1121 while ((dst = *pprev) != NULL) {
1122 if (!atomic_read(&dst->__refcnt)) {
1131 spin_unlock_bh(&icmp6_dst_lock);
1136 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1139 struct dst_entry *dst, **pprev;
1141 spin_lock_bh(&icmp6_dst_lock);
1142 pprev = &icmp6_dst_gc_list;
1143 while ((dst = *pprev) != NULL) {
1144 struct rt6_info *rt = (struct rt6_info *) dst;
1145 if (func(rt, arg)) {
1152 spin_unlock_bh(&icmp6_dst_lock);
1155 static int ip6_dst_gc(struct dst_ops *ops)
1157 unsigned long now = jiffies;
1158 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1159 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1160 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1161 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1162 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1163 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1166 entries = dst_entries_get_fast(ops);
1167 if (time_after(rt_last_gc + rt_min_interval, now) &&
1168 entries <= rt_max_size)
1171 net->ipv6.ip6_rt_gc_expire++;
1172 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1173 net->ipv6.ip6_rt_last_gc = now;
1174 entries = dst_entries_get_slow(ops);
1175 if (entries < ops->gc_thresh)
1176 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1178 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1179 return entries > rt_max_size;
1182 /* Clean host part of a prefix. Not necessary in radix tree,
1183 but results in cleaner routing tables.
1185 Remove it only when all the things will work!
1188 int ip6_dst_hoplimit(struct dst_entry *dst)
1190 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1191 if (hoplimit == 0) {
1192 struct net_device *dev = dst->dev;
1193 struct inet6_dev *idev;
1196 idev = __in6_dev_get(dev);
1198 hoplimit = idev->cnf.hop_limit;
1200 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1205 EXPORT_SYMBOL(ip6_dst_hoplimit);
1211 int ip6_route_add(struct fib6_config *cfg)
1214 struct net *net = cfg->fc_nlinfo.nl_net;
1215 struct rt6_info *rt = NULL;
1216 struct net_device *dev = NULL;
1217 struct inet6_dev *idev = NULL;
1218 struct fib6_table *table;
1221 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1223 #ifndef CONFIG_IPV6_SUBTREES
1224 if (cfg->fc_src_len)
1227 if (cfg->fc_ifindex) {
1229 dev = dev_get_by_index(net, cfg->fc_ifindex);
1232 idev = in6_dev_get(dev);
1237 if (cfg->fc_metric == 0)
1238 cfg->fc_metric = IP6_RT_PRIO_USER;
1240 table = fib6_new_table(net, cfg->fc_table);
1241 if (table == NULL) {
1246 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1253 rt->dst.obsolete = -1;
1254 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1255 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1258 if (cfg->fc_protocol == RTPROT_UNSPEC)
1259 cfg->fc_protocol = RTPROT_BOOT;
1260 rt->rt6i_protocol = cfg->fc_protocol;
1262 addr_type = ipv6_addr_type(&cfg->fc_dst);
1264 if (addr_type & IPV6_ADDR_MULTICAST)
1265 rt->dst.input = ip6_mc_input;
1266 else if (cfg->fc_flags & RTF_LOCAL)
1267 rt->dst.input = ip6_input;
1269 rt->dst.input = ip6_forward;
1271 rt->dst.output = ip6_output;
1273 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1274 rt->rt6i_dst.plen = cfg->fc_dst_len;
1275 if (rt->rt6i_dst.plen == 128)
1276 rt->dst.flags |= DST_HOST;
1278 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1279 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1284 dst_init_metrics(&rt->dst, metrics, 0);
1286 #ifdef CONFIG_IPV6_SUBTREES
1287 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1288 rt->rt6i_src.plen = cfg->fc_src_len;
1291 rt->rt6i_metric = cfg->fc_metric;
1293 /* We cannot add true routes via loopback here,
1294 they would result in kernel looping; promote them to reject routes
1296 if ((cfg->fc_flags & RTF_REJECT) ||
1297 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1298 && !(cfg->fc_flags&RTF_LOCAL))) {
1299 /* hold loopback dev/idev if we haven't done so. */
1300 if (dev != net->loopback_dev) {
1305 dev = net->loopback_dev;
1307 idev = in6_dev_get(dev);
1313 rt->dst.output = ip6_pkt_discard_out;
1314 rt->dst.input = ip6_pkt_discard;
1315 rt->dst.error = -ENETUNREACH;
1316 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1320 if (cfg->fc_flags & RTF_GATEWAY) {
1321 const struct in6_addr *gw_addr;
1324 gw_addr = &cfg->fc_gateway;
1325 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1326 gwa_type = ipv6_addr_type(gw_addr);
1328 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1329 struct rt6_info *grt;
1331 /* IPv6 strictly inhibits using not link-local
1332 addresses as nexthop address.
1333 Otherwise, router will not able to send redirects.
1334 It is very good, but in some (rare!) circumstances
1335 (SIT, PtP, NBMA NOARP links) it is handy to allow
1336 some exceptions. --ANK
1339 if (!(gwa_type&IPV6_ADDR_UNICAST))
1342 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1344 err = -EHOSTUNREACH;
1348 if (dev != grt->rt6i_dev) {
1349 dst_release(&grt->dst);
1353 dev = grt->rt6i_dev;
1354 idev = grt->rt6i_idev;
1356 in6_dev_hold(grt->rt6i_idev);
1358 if (!(grt->rt6i_flags&RTF_GATEWAY))
1360 dst_release(&grt->dst);
1366 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1374 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1375 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1379 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1380 rt->rt6i_prefsrc.plen = 128;
1382 rt->rt6i_prefsrc.plen = 0;
1384 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1385 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1390 dst_set_neighbour(&rt->dst, n);
1393 rt->rt6i_flags = cfg->fc_flags;
1400 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1401 int type = nla_type(nla);
1404 if (type > RTAX_MAX) {
1409 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1415 rt->rt6i_idev = idev;
1416 rt->rt6i_table = table;
1418 cfg->fc_nlinfo.nl_net = dev_net(dev);
1420 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1432 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1435 struct fib6_table *table;
1436 struct net *net = dev_net(rt->rt6i_dev);
1438 if (rt == net->ipv6.ip6_null_entry) {
1443 table = rt->rt6i_table;
1444 write_lock_bh(&table->tb6_lock);
1445 err = fib6_del(rt, info);
1446 write_unlock_bh(&table->tb6_lock);
1449 dst_release(&rt->dst);
1453 int ip6_del_rt(struct rt6_info *rt)
1455 struct nl_info info = {
1456 .nl_net = dev_net(rt->rt6i_dev),
1458 return __ip6_del_rt(rt, &info);
1461 static int ip6_route_del(struct fib6_config *cfg)
1463 struct fib6_table *table;
1464 struct fib6_node *fn;
1465 struct rt6_info *rt;
1468 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1472 read_lock_bh(&table->tb6_lock);
1474 fn = fib6_locate(&table->tb6_root,
1475 &cfg->fc_dst, cfg->fc_dst_len,
1476 &cfg->fc_src, cfg->fc_src_len);
1479 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1480 if (cfg->fc_ifindex &&
1481 (rt->rt6i_dev == NULL ||
1482 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1484 if (cfg->fc_flags & RTF_GATEWAY &&
1485 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1487 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1490 read_unlock_bh(&table->tb6_lock);
1492 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1495 read_unlock_bh(&table->tb6_lock);
1503 struct ip6rd_flowi {
1505 struct in6_addr gateway;
1508 static struct rt6_info *__ip6_route_redirect(struct net *net,
1509 struct fib6_table *table,
1513 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1514 struct rt6_info *rt;
1515 struct fib6_node *fn;
1518 * Get the "current" route for this destination and
1519 * check if the redirect has come from approriate router.
1521 * RFC 2461 specifies that redirects should only be
1522 * accepted if they come from the nexthop to the target.
1523 * Due to the way the routes are chosen, this notion
1524 * is a bit fuzzy and one might need to check all possible
1528 read_lock_bh(&table->tb6_lock);
1529 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1531 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1533 * Current route is on-link; redirect is always invalid.
1535 * Seems, previous statement is not true. It could
1536 * be node, which looks for us as on-link (f.e. proxy ndisc)
1537 * But then router serving it might decide, that we should
1538 * know truth 8)8) --ANK (980726).
1540 if (rt6_check_expired(rt))
1542 if (!(rt->rt6i_flags & RTF_GATEWAY))
1544 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1546 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1552 rt = net->ipv6.ip6_null_entry;
1553 BACKTRACK(net, &fl6->saddr);
1557 read_unlock_bh(&table->tb6_lock);
1562 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1563 const struct in6_addr *src,
1564 const struct in6_addr *gateway,
1565 struct net_device *dev)
1567 int flags = RT6_LOOKUP_F_HAS_SADDR;
1568 struct net *net = dev_net(dev);
1569 struct ip6rd_flowi rdfl = {
1571 .flowi6_oif = dev->ifindex,
1577 ipv6_addr_copy(&rdfl.gateway, gateway);
1579 if (rt6_need_strict(dest))
1580 flags |= RT6_LOOKUP_F_IFACE;
1582 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1583 flags, __ip6_route_redirect);
1586 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1587 const struct in6_addr *saddr,
1588 struct neighbour *neigh, u8 *lladdr, int on_link)
1590 struct rt6_info *rt, *nrt = NULL;
1591 struct netevent_redirect netevent;
1592 struct net *net = dev_net(neigh->dev);
1594 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1596 if (rt == net->ipv6.ip6_null_entry) {
1597 if (net_ratelimit())
1598 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1599 "for redirect target\n");
1604 * We have finally decided to accept it.
1607 neigh_update(neigh, lladdr, NUD_STALE,
1608 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1609 NEIGH_UPDATE_F_OVERRIDE|
1610 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1611 NEIGH_UPDATE_F_ISROUTER))
1615 * Redirect received -> path was valid.
1616 * Look, redirects are sent only in response to data packets,
1617 * so that this nexthop apparently is reachable. --ANK
1619 dst_confirm(&rt->dst);
1621 /* Duplicate redirect: silently ignore. */
1622 if (neigh == dst_get_neighbour_raw(&rt->dst))
1625 nrt = ip6_rt_copy(rt, dest);
1629 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1631 nrt->rt6i_flags &= ~RTF_GATEWAY;
1633 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1634 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1636 if (ip6_ins_rt(nrt))
1639 netevent.old = &rt->dst;
1640 netevent.new = &nrt->dst;
1641 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1643 if (rt->rt6i_flags&RTF_CACHE) {
1649 dst_release(&rt->dst);
1653 * Handle ICMP "packet too big" messages
1654 * i.e. Path MTU discovery
1657 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1658 struct net *net, u32 pmtu, int ifindex)
1660 struct rt6_info *rt, *nrt;
1663 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1667 if (rt6_check_expired(rt)) {
1672 if (pmtu >= dst_mtu(&rt->dst))
1675 if (pmtu < IPV6_MIN_MTU) {
1677 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1678 * MTU (1280) and a fragment header should always be included
1679 * after a node receiving Too Big message reporting PMTU is
1680 * less than the IPv6 Minimum Link MTU.
1682 pmtu = IPV6_MIN_MTU;
1686 /* New mtu received -> path was valid.
1687 They are sent only in response to data packets,
1688 so that this nexthop apparently is reachable. --ANK
1690 dst_confirm(&rt->dst);
1692 /* Host route. If it is static, it would be better
1693 not to override it, but add new one, so that
1694 when cache entry will expire old pmtu
1695 would return automatically.
1697 if (rt->rt6i_flags & RTF_CACHE) {
1698 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1700 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1701 features |= RTAX_FEATURE_ALLFRAG;
1702 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1704 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1705 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1710 Two cases are possible:
1711 1. It is connected route. Action: COW
1712 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1714 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1715 nrt = rt6_alloc_cow(rt, daddr, saddr);
1717 nrt = rt6_alloc_clone(rt, daddr);
1720 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1722 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1723 features |= RTAX_FEATURE_ALLFRAG;
1724 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1727 /* According to RFC 1981, detecting PMTU increase shouldn't be
1728 * happened within 5 mins, the recommended timer is 10 mins.
1729 * Here this route expiration time is set to ip6_rt_mtu_expires
1730 * which is 10 mins. After 10 mins the decreased pmtu is expired
1731 * and detecting PMTU increase will be automatically happened.
1733 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1734 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1739 dst_release(&rt->dst);
1742 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1743 struct net_device *dev, u32 pmtu)
1745 struct net *net = dev_net(dev);
1748 * RFC 1981 states that a node "MUST reduce the size of the packets it
1749 * is sending along the path" that caused the Packet Too Big message.
1750 * Since it's not possible in the general case to determine which
1751 * interface was used to send the original packet, we update the MTU
1752 * on the interface that will be used to send future packets. We also
1753 * update the MTU on the interface that received the Packet Too Big in
1754 * case the original packet was forced out that interface with
1755 * SO_BINDTODEVICE or similar. This is the next best thing to the
1756 * correct behaviour, which would be to update the MTU on all
1759 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1760 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1764 * Misc support functions
1767 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1768 const struct in6_addr *dest)
1770 struct net *net = dev_net(ort->rt6i_dev);
1771 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1775 rt->dst.input = ort->dst.input;
1776 rt->dst.output = ort->dst.output;
1777 rt->dst.flags |= DST_HOST;
1779 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1780 rt->rt6i_dst.plen = 128;
1781 dst_copy_metrics(&rt->dst, &ort->dst);
1782 rt->dst.error = ort->dst.error;
1783 rt->rt6i_idev = ort->rt6i_idev;
1785 in6_dev_hold(rt->rt6i_idev);
1786 rt->dst.lastuse = jiffies;
1787 rt->rt6i_expires = 0;
1789 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1790 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1791 rt->rt6i_metric = 0;
1793 #ifdef CONFIG_IPV6_SUBTREES
1794 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1796 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1797 rt->rt6i_table = ort->rt6i_table;
1802 #ifdef CONFIG_IPV6_ROUTE_INFO
1803 static struct rt6_info *rt6_get_route_info(struct net *net,
1804 const struct in6_addr *prefix, int prefixlen,
1805 const struct in6_addr *gwaddr, int ifindex)
1807 struct fib6_node *fn;
1808 struct rt6_info *rt = NULL;
1809 struct fib6_table *table;
1811 table = fib6_get_table(net, RT6_TABLE_INFO);
1815 write_lock_bh(&table->tb6_lock);
1816 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1820 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1821 if (rt->rt6i_dev->ifindex != ifindex)
1823 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1825 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1831 write_unlock_bh(&table->tb6_lock);
1835 static struct rt6_info *rt6_add_route_info(struct net *net,
1836 const struct in6_addr *prefix, int prefixlen,
1837 const struct in6_addr *gwaddr, int ifindex,
1840 struct fib6_config cfg = {
1841 .fc_table = RT6_TABLE_INFO,
1842 .fc_metric = IP6_RT_PRIO_USER,
1843 .fc_ifindex = ifindex,
1844 .fc_dst_len = prefixlen,
1845 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1846 RTF_UP | RTF_PREF(pref),
1848 .fc_nlinfo.nlh = NULL,
1849 .fc_nlinfo.nl_net = net,
1852 ipv6_addr_copy(&cfg.fc_dst, prefix);
1853 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1855 /* We should treat it as a default route if prefix length is 0. */
1857 cfg.fc_flags |= RTF_DEFAULT;
1859 ip6_route_add(&cfg);
1861 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1865 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1867 struct rt6_info *rt;
1868 struct fib6_table *table;
1870 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1874 write_lock_bh(&table->tb6_lock);
1875 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1876 if (dev == rt->rt6i_dev &&
1877 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1878 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1883 write_unlock_bh(&table->tb6_lock);
1887 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1888 struct net_device *dev,
1891 struct fib6_config cfg = {
1892 .fc_table = RT6_TABLE_DFLT,
1893 .fc_metric = IP6_RT_PRIO_USER,
1894 .fc_ifindex = dev->ifindex,
1895 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1896 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1898 .fc_nlinfo.nlh = NULL,
1899 .fc_nlinfo.nl_net = dev_net(dev),
1902 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1904 ip6_route_add(&cfg);
1906 return rt6_get_dflt_router(gwaddr, dev);
1909 void rt6_purge_dflt_routers(struct net *net)
1911 struct rt6_info *rt;
1912 struct fib6_table *table;
1914 /* NOTE: Keep consistent with rt6_get_dflt_router */
1915 table = fib6_get_table(net, RT6_TABLE_DFLT);
1920 read_lock_bh(&table->tb6_lock);
1921 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1922 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1924 read_unlock_bh(&table->tb6_lock);
1929 read_unlock_bh(&table->tb6_lock);
1932 static void rtmsg_to_fib6_config(struct net *net,
1933 struct in6_rtmsg *rtmsg,
1934 struct fib6_config *cfg)
1936 memset(cfg, 0, sizeof(*cfg));
1938 cfg->fc_table = RT6_TABLE_MAIN;
1939 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1940 cfg->fc_metric = rtmsg->rtmsg_metric;
1941 cfg->fc_expires = rtmsg->rtmsg_info;
1942 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1943 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1944 cfg->fc_flags = rtmsg->rtmsg_flags;
1946 cfg->fc_nlinfo.nl_net = net;
1948 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1949 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1950 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1953 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1955 struct fib6_config cfg;
1956 struct in6_rtmsg rtmsg;
1960 case SIOCADDRT: /* Add a route */
1961 case SIOCDELRT: /* Delete a route */
1962 if (!capable(CAP_NET_ADMIN))
1964 err = copy_from_user(&rtmsg, arg,
1965 sizeof(struct in6_rtmsg));
1969 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1974 err = ip6_route_add(&cfg);
1977 err = ip6_route_del(&cfg);
1991 * Drop the packet on the floor
1994 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1997 struct dst_entry *dst = skb_dst(skb);
1998 switch (ipstats_mib_noroutes) {
1999 case IPSTATS_MIB_INNOROUTES:
2000 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2001 if (type == IPV6_ADDR_ANY) {
2002 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2003 IPSTATS_MIB_INADDRERRORS);
2007 case IPSTATS_MIB_OUTNOROUTES:
2008 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2009 ipstats_mib_noroutes);
2012 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2017 static int ip6_pkt_discard(struct sk_buff *skb)
2019 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2022 static int ip6_pkt_discard_out(struct sk_buff *skb)
2024 skb->dev = skb_dst(skb)->dev;
2025 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2028 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2030 static int ip6_pkt_prohibit(struct sk_buff *skb)
2032 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2035 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2037 skb->dev = skb_dst(skb)->dev;
2038 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2044 * Allocate a dst for local (unicast / anycast) address.
2047 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2048 const struct in6_addr *addr,
2051 struct net *net = dev_net(idev->dev);
2052 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2053 net->loopback_dev, 0);
2054 struct neighbour *neigh;
2057 if (net_ratelimit())
2058 pr_warning("IPv6: Maximum number of routes reached,"
2059 " consider increasing route/max_size.\n");
2060 return ERR_PTR(-ENOMEM);
2065 rt->dst.flags |= DST_HOST;
2066 rt->dst.input = ip6_input;
2067 rt->dst.output = ip6_output;
2068 rt->rt6i_idev = idev;
2069 rt->dst.obsolete = -1;
2071 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2073 rt->rt6i_flags |= RTF_ANYCAST;
2075 rt->rt6i_flags |= RTF_LOCAL;
2076 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2077 if (IS_ERR(neigh)) {
2080 return ERR_CAST(neigh);
2082 dst_set_neighbour(&rt->dst, neigh);
2084 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2085 rt->rt6i_dst.plen = 128;
2086 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2088 atomic_set(&rt->dst.__refcnt, 1);
2093 int ip6_route_get_saddr(struct net *net,
2094 struct rt6_info *rt,
2095 const struct in6_addr *daddr,
2097 struct in6_addr *saddr)
2099 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2101 if (rt->rt6i_prefsrc.plen)
2102 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2104 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2105 daddr, prefs, saddr);
2109 /* remove deleted ip from prefsrc entries */
2110 struct arg_dev_net_ip {
2111 struct net_device *dev;
2113 struct in6_addr *addr;
2116 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2118 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2119 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2120 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2122 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2123 rt != net->ipv6.ip6_null_entry &&
2124 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2125 /* remove prefsrc entry */
2126 rt->rt6i_prefsrc.plen = 0;
2131 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2133 struct net *net = dev_net(ifp->idev->dev);
2134 struct arg_dev_net_ip adni = {
2135 .dev = ifp->idev->dev,
2139 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2142 struct arg_dev_net {
2143 struct net_device *dev;
2147 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2149 const struct arg_dev_net *adn = arg;
2150 const struct net_device *dev = adn->dev;
2152 if ((rt->rt6i_dev == dev || dev == NULL) &&
2153 rt != adn->net->ipv6.ip6_null_entry) {
2154 RT6_TRACE("deleted by ifdown %p\n", rt);
2160 void rt6_ifdown(struct net *net, struct net_device *dev)
2162 struct arg_dev_net adn = {
2167 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2168 icmp6_clean_all(fib6_ifdown, &adn);
2171 struct rt6_mtu_change_arg
2173 struct net_device *dev;
2177 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2179 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2180 struct inet6_dev *idev;
2182 /* In IPv6 pmtu discovery is not optional,
2183 so that RTAX_MTU lock cannot disable it.
2184 We still use this lock to block changes
2185 caused by addrconf/ndisc.
2188 idev = __in6_dev_get(arg->dev);
2192 /* For administrative MTU increase, there is no way to discover
2193 IPv6 PMTU increase, so PMTU increase should be updated here.
2194 Since RFC 1981 doesn't include administrative MTU increase
2195 update PMTU increase is a MUST. (i.e. jumbo frame)
2198 If new MTU is less than route PMTU, this new MTU will be the
2199 lowest MTU in the path, update the route PMTU to reflect PMTU
2200 decreases; if new MTU is greater than route PMTU, and the
2201 old MTU is the lowest MTU in the path, update the route PMTU
2202 to reflect the increase. In this case if the other nodes' MTU
2203 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2206 if (rt->rt6i_dev == arg->dev &&
2207 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2208 (dst_mtu(&rt->dst) >= arg->mtu ||
2209 (dst_mtu(&rt->dst) < arg->mtu &&
2210 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2211 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2216 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2218 struct rt6_mtu_change_arg arg = {
2223 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2226 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2227 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2228 [RTA_OIF] = { .type = NLA_U32 },
2229 [RTA_IIF] = { .type = NLA_U32 },
2230 [RTA_PRIORITY] = { .type = NLA_U32 },
2231 [RTA_METRICS] = { .type = NLA_NESTED },
2234 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2235 struct fib6_config *cfg)
2238 struct nlattr *tb[RTA_MAX+1];
2241 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2246 rtm = nlmsg_data(nlh);
2247 memset(cfg, 0, sizeof(*cfg));
2249 cfg->fc_table = rtm->rtm_table;
2250 cfg->fc_dst_len = rtm->rtm_dst_len;
2251 cfg->fc_src_len = rtm->rtm_src_len;
2252 cfg->fc_flags = RTF_UP;
2253 cfg->fc_protocol = rtm->rtm_protocol;
2255 if (rtm->rtm_type == RTN_UNREACHABLE)
2256 cfg->fc_flags |= RTF_REJECT;
2258 if (rtm->rtm_type == RTN_LOCAL)
2259 cfg->fc_flags |= RTF_LOCAL;
2261 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2262 cfg->fc_nlinfo.nlh = nlh;
2263 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2265 if (tb[RTA_GATEWAY]) {
2266 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2267 cfg->fc_flags |= RTF_GATEWAY;
2271 int plen = (rtm->rtm_dst_len + 7) >> 3;
2273 if (nla_len(tb[RTA_DST]) < plen)
2276 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2280 int plen = (rtm->rtm_src_len + 7) >> 3;
2282 if (nla_len(tb[RTA_SRC]) < plen)
2285 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2288 if (tb[RTA_PREFSRC])
2289 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2292 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2294 if (tb[RTA_PRIORITY])
2295 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2297 if (tb[RTA_METRICS]) {
2298 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2299 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2303 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2310 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2312 struct fib6_config cfg;
2315 err = rtm_to_fib6_config(skb, nlh, &cfg);
2319 return ip6_route_del(&cfg);
2322 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2324 struct fib6_config cfg;
2327 err = rtm_to_fib6_config(skb, nlh, &cfg);
2331 return ip6_route_add(&cfg);
2334 static inline size_t rt6_nlmsg_size(void)
2336 return NLMSG_ALIGN(sizeof(struct rtmsg))
2337 + nla_total_size(16) /* RTA_SRC */
2338 + nla_total_size(16) /* RTA_DST */
2339 + nla_total_size(16) /* RTA_GATEWAY */
2340 + nla_total_size(16) /* RTA_PREFSRC */
2341 + nla_total_size(4) /* RTA_TABLE */
2342 + nla_total_size(4) /* RTA_IIF */
2343 + nla_total_size(4) /* RTA_OIF */
2344 + nla_total_size(4) /* RTA_PRIORITY */
2345 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2346 + nla_total_size(sizeof(struct rta_cacheinfo));
2349 static int rt6_fill_node(struct net *net,
2350 struct sk_buff *skb, struct rt6_info *rt,
2351 struct in6_addr *dst, struct in6_addr *src,
2352 int iif, int type, u32 pid, u32 seq,
2353 int prefix, int nowait, unsigned int flags)
2356 struct nlmsghdr *nlh;
2359 struct neighbour *n;
2361 if (prefix) { /* user wants prefix routes only */
2362 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2363 /* success since this is not a prefix route */
2368 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2372 rtm = nlmsg_data(nlh);
2373 rtm->rtm_family = AF_INET6;
2374 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2375 rtm->rtm_src_len = rt->rt6i_src.plen;
2378 table = rt->rt6i_table->tb6_id;
2380 table = RT6_TABLE_UNSPEC;
2381 rtm->rtm_table = table;
2382 NLA_PUT_U32(skb, RTA_TABLE, table);
2383 if (rt->rt6i_flags&RTF_REJECT)
2384 rtm->rtm_type = RTN_UNREACHABLE;
2385 else if (rt->rt6i_flags&RTF_LOCAL)
2386 rtm->rtm_type = RTN_LOCAL;
2387 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2388 rtm->rtm_type = RTN_LOCAL;
2390 rtm->rtm_type = RTN_UNICAST;
2392 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2393 rtm->rtm_protocol = rt->rt6i_protocol;
2394 if (rt->rt6i_flags&RTF_DYNAMIC)
2395 rtm->rtm_protocol = RTPROT_REDIRECT;
2396 else if (rt->rt6i_flags & RTF_ADDRCONF)
2397 rtm->rtm_protocol = RTPROT_KERNEL;
2398 else if (rt->rt6i_flags&RTF_DEFAULT)
2399 rtm->rtm_protocol = RTPROT_RA;
2401 if (rt->rt6i_flags&RTF_CACHE)
2402 rtm->rtm_flags |= RTM_F_CLONED;
2405 NLA_PUT(skb, RTA_DST, 16, dst);
2406 rtm->rtm_dst_len = 128;
2407 } else if (rtm->rtm_dst_len)
2408 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2409 #ifdef CONFIG_IPV6_SUBTREES
2411 NLA_PUT(skb, RTA_SRC, 16, src);
2412 rtm->rtm_src_len = 128;
2413 } else if (rtm->rtm_src_len)
2414 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2417 #ifdef CONFIG_IPV6_MROUTE
2418 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2419 int err = ip6mr_get_route(net, skb, rtm, nowait);
2424 goto nla_put_failure;
2426 if (err == -EMSGSIZE)
2427 goto nla_put_failure;
2432 NLA_PUT_U32(skb, RTA_IIF, iif);
2434 struct in6_addr saddr_buf;
2435 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2436 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2439 if (rt->rt6i_prefsrc.plen) {
2440 struct in6_addr saddr_buf;
2441 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2442 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2445 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2446 goto nla_put_failure;
2449 n = dst_get_neighbour(&rt->dst);
2451 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2453 goto nla_put_failure;
2459 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2461 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2463 if (!(rt->rt6i_flags & RTF_EXPIRES))
2465 else if (rt->rt6i_expires - jiffies < INT_MAX)
2466 expires = rt->rt6i_expires - jiffies;
2470 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2471 expires, rt->dst.error) < 0)
2472 goto nla_put_failure;
2474 return nlmsg_end(skb, nlh);
2477 nlmsg_cancel(skb, nlh);
2481 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2483 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2486 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2487 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2488 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2492 return rt6_fill_node(arg->net,
2493 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2494 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2495 prefix, 0, NLM_F_MULTI);
2498 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2500 struct net *net = sock_net(in_skb->sk);
2501 struct nlattr *tb[RTA_MAX+1];
2502 struct rt6_info *rt;
2503 struct sk_buff *skb;
2508 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2513 memset(&fl6, 0, sizeof(fl6));
2516 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2519 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2523 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2526 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2530 iif = nla_get_u32(tb[RTA_IIF]);
2533 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2536 struct net_device *dev;
2537 dev = __dev_get_by_index(net, iif);
2544 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2550 /* Reserve room for dummy headers, this skb can pass
2551 through good chunk of routing engine.
2553 skb_reset_mac_header(skb);
2554 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2556 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2557 skb_dst_set(skb, &rt->dst);
2559 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2560 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2561 nlh->nlmsg_seq, 0, 0, 0);
2567 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2572 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2574 struct sk_buff *skb;
2575 struct net *net = info->nl_net;
2580 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2582 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2586 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2587 event, info->pid, seq, 0, 0, 0);
2589 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2590 WARN_ON(err == -EMSGSIZE);
2594 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2595 info->nlh, gfp_any());
2599 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2602 static int ip6_route_dev_notify(struct notifier_block *this,
2603 unsigned long event, void *data)
2605 struct net_device *dev = (struct net_device *)data;
2606 struct net *net = dev_net(dev);
2608 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2609 net->ipv6.ip6_null_entry->dst.dev = dev;
2610 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2611 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2612 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2613 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2614 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2615 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2626 #ifdef CONFIG_PROC_FS
2637 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2639 struct seq_file *m = p_arg;
2640 struct neighbour *n;
2642 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2644 #ifdef CONFIG_IPV6_SUBTREES
2645 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2647 seq_puts(m, "00000000000000000000000000000000 00 ");
2650 n = dst_get_neighbour(&rt->dst);
2652 seq_printf(m, "%pi6", n->primary_key);
2654 seq_puts(m, "00000000000000000000000000000000");
2657 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2658 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2659 rt->dst.__use, rt->rt6i_flags,
2660 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2664 static int ipv6_route_show(struct seq_file *m, void *v)
2666 struct net *net = (struct net *)m->private;
2667 fib6_clean_all(net, rt6_info_route, 0, m);
2671 static int ipv6_route_open(struct inode *inode, struct file *file)
2673 return single_open_net(inode, file, ipv6_route_show);
2676 static const struct file_operations ipv6_route_proc_fops = {
2677 .owner = THIS_MODULE,
2678 .open = ipv6_route_open,
2680 .llseek = seq_lseek,
2681 .release = single_release_net,
2684 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2686 struct net *net = (struct net *)seq->private;
2687 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2688 net->ipv6.rt6_stats->fib_nodes,
2689 net->ipv6.rt6_stats->fib_route_nodes,
2690 net->ipv6.rt6_stats->fib_rt_alloc,
2691 net->ipv6.rt6_stats->fib_rt_entries,
2692 net->ipv6.rt6_stats->fib_rt_cache,
2693 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2694 net->ipv6.rt6_stats->fib_discarded_routes);
2699 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2701 return single_open_net(inode, file, rt6_stats_seq_show);
2704 static const struct file_operations rt6_stats_seq_fops = {
2705 .owner = THIS_MODULE,
2706 .open = rt6_stats_seq_open,
2708 .llseek = seq_lseek,
2709 .release = single_release_net,
2711 #endif /* CONFIG_PROC_FS */
2713 #ifdef CONFIG_SYSCTL
2716 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2717 void __user *buffer, size_t *lenp, loff_t *ppos)
2724 net = (struct net *)ctl->extra1;
2725 delay = net->ipv6.sysctl.flush_delay;
2726 proc_dointvec(ctl, write, buffer, lenp, ppos);
2727 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2731 ctl_table ipv6_route_table_template[] = {
2733 .procname = "flush",
2734 .data = &init_net.ipv6.sysctl.flush_delay,
2735 .maxlen = sizeof(int),
2737 .proc_handler = ipv6_sysctl_rtcache_flush
2740 .procname = "gc_thresh",
2741 .data = &ip6_dst_ops_template.gc_thresh,
2742 .maxlen = sizeof(int),
2744 .proc_handler = proc_dointvec,
2747 .procname = "max_size",
2748 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2749 .maxlen = sizeof(int),
2751 .proc_handler = proc_dointvec,
2754 .procname = "gc_min_interval",
2755 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2756 .maxlen = sizeof(int),
2758 .proc_handler = proc_dointvec_jiffies,
2761 .procname = "gc_timeout",
2762 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2763 .maxlen = sizeof(int),
2765 .proc_handler = proc_dointvec_jiffies,
2768 .procname = "gc_interval",
2769 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2770 .maxlen = sizeof(int),
2772 .proc_handler = proc_dointvec_jiffies,
2775 .procname = "gc_elasticity",
2776 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2777 .maxlen = sizeof(int),
2779 .proc_handler = proc_dointvec,
2782 .procname = "mtu_expires",
2783 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2784 .maxlen = sizeof(int),
2786 .proc_handler = proc_dointvec_jiffies,
2789 .procname = "min_adv_mss",
2790 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2791 .maxlen = sizeof(int),
2793 .proc_handler = proc_dointvec,
2796 .procname = "gc_min_interval_ms",
2797 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2798 .maxlen = sizeof(int),
2800 .proc_handler = proc_dointvec_ms_jiffies,
2805 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2807 struct ctl_table *table;
2809 table = kmemdup(ipv6_route_table_template,
2810 sizeof(ipv6_route_table_template),
2814 table[0].data = &net->ipv6.sysctl.flush_delay;
2815 table[0].extra1 = net;
2816 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2817 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2818 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2819 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2820 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2821 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2822 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2823 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2824 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2831 static int __net_init ip6_route_net_init(struct net *net)
2835 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2836 sizeof(net->ipv6.ip6_dst_ops));
2838 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2839 goto out_ip6_dst_ops;
2841 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2842 sizeof(*net->ipv6.ip6_null_entry),
2844 if (!net->ipv6.ip6_null_entry)
2845 goto out_ip6_dst_entries;
2846 net->ipv6.ip6_null_entry->dst.path =
2847 (struct dst_entry *)net->ipv6.ip6_null_entry;
2848 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2849 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2850 ip6_template_metrics, true);
2852 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2853 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2854 sizeof(*net->ipv6.ip6_prohibit_entry),
2856 if (!net->ipv6.ip6_prohibit_entry)
2857 goto out_ip6_null_entry;
2858 net->ipv6.ip6_prohibit_entry->dst.path =
2859 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2860 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2861 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2862 ip6_template_metrics, true);
2864 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2865 sizeof(*net->ipv6.ip6_blk_hole_entry),
2867 if (!net->ipv6.ip6_blk_hole_entry)
2868 goto out_ip6_prohibit_entry;
2869 net->ipv6.ip6_blk_hole_entry->dst.path =
2870 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2871 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2872 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2873 ip6_template_metrics, true);
2876 net->ipv6.sysctl.flush_delay = 0;
2877 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2878 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2879 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2880 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2881 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2882 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2883 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2885 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2891 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2892 out_ip6_prohibit_entry:
2893 kfree(net->ipv6.ip6_prohibit_entry);
2895 kfree(net->ipv6.ip6_null_entry);
2897 out_ip6_dst_entries:
2898 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2903 static void __net_exit ip6_route_net_exit(struct net *net)
2905 kfree(net->ipv6.ip6_null_entry);
2906 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2907 kfree(net->ipv6.ip6_prohibit_entry);
2908 kfree(net->ipv6.ip6_blk_hole_entry);
2910 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2913 static int __net_init ip6_route_net_init_late(struct net *net)
2915 #ifdef CONFIG_PROC_FS
2916 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2917 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2922 static void __net_exit ip6_route_net_exit_late(struct net *net)
2924 #ifdef CONFIG_PROC_FS
2925 proc_net_remove(net, "ipv6_route");
2926 proc_net_remove(net, "rt6_stats");
2930 static struct pernet_operations ip6_route_net_ops = {
2931 .init = ip6_route_net_init,
2932 .exit = ip6_route_net_exit,
2935 static struct pernet_operations ip6_route_net_late_ops = {
2936 .init = ip6_route_net_init_late,
2937 .exit = ip6_route_net_exit_late,
2940 static struct notifier_block ip6_route_dev_notifier = {
2941 .notifier_call = ip6_route_dev_notify,
2945 int __init ip6_route_init(void)
2950 ip6_dst_ops_template.kmem_cachep =
2951 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2952 SLAB_HWCACHE_ALIGN, NULL);
2953 if (!ip6_dst_ops_template.kmem_cachep)
2956 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2958 goto out_kmem_cache;
2960 ret = register_pernet_subsys(&ip6_route_net_ops);
2962 goto out_dst_entries;
2964 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2966 /* Registering of the loopback is done before this portion of code,
2967 * the loopback reference in rt6_info will not be taken, do it
2968 * manually for init_net */
2969 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2970 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2971 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2972 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2973 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2974 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2975 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2979 goto out_register_subsys;
2985 ret = fib6_rules_init();
2989 ret = register_pernet_subsys(&ip6_route_net_late_ops);
2991 goto fib6_rules_init;
2994 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2995 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2996 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2997 goto out_register_late_subsys;
2999 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3001 goto out_register_late_subsys;
3006 out_register_late_subsys:
3007 unregister_pernet_subsys(&ip6_route_net_late_ops);
3009 fib6_rules_cleanup();
3014 out_register_subsys:
3015 unregister_pernet_subsys(&ip6_route_net_ops);
3017 dst_entries_destroy(&ip6_dst_blackhole_ops);
3019 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3023 void ip6_route_cleanup(void)
3025 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3026 unregister_pernet_subsys(&ip6_route_net_late_ops);
3027 fib6_rules_cleanup();
3030 unregister_pernet_subsys(&ip6_route_net_ops);
3031 dst_entries_destroy(&ip6_dst_blackhole_ops);
3032 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);