2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
53 #include <linux/rtnetlink.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
59 #include <asm/uaccess.h>
62 #include <linux/sysctl.h>
65 /* Set to 3 to get tracing. */
/* Debug helpers: RDBG/RT6_TRACE expand to printk when route debugging is
 * enabled.  NOTE(review): the #if/#else lines selecting between the two
 * RT6_TRACE definitions are elided from this view — presumably a
 * debug/no-op conditional pair; confirm against the full file. */
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
73 #define RT6_TRACE(x...) do { ; } while (0)
/* Forward declarations for the dst_ops callbacks and the RFC 4191
 * route-information helpers defined later in this file. */
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77 const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void ip6_dst_destroy(struct dst_entry *);
83 static void ip6_dst_ifdown(struct dst_entry *,
84 struct net_device *dev, int how);
85 static int ip6_dst_gc(struct dst_ops *ops);
87 static int ip6_pkt_discard(struct sk_buff *skb);
88 static int ip6_pkt_discard_out(struct sk_buff *skb);
89 static void ip6_link_failure(struct sk_buff *skb);
90 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94 const struct in6_addr *prefix, int prefixlen,
95 const struct in6_addr *gwaddr, int ifindex,
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98 const struct in6_addr *prefix, int prefixlen,
99 const struct in6_addr *gwaddr, int ifindex);
/* ipv6_cow_metrics() — dst_ops->cow_metrics callback: copy-on-write the
 * route metrics into storage attached to the route's inet_peer, so a
 * cached host route can carry private metrics.  Installs the new block
 * with cmpxchg() on dst->_metrics to survive concurrent writers.
 * NOTE(review): this whole chunk is an elided view — braces, returns and
 * several statements are missing between the visible lines. */
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
104 struct rt6_info *rt = (struct rt6_info *) dst;
105 struct inet_peer *peer;
/* Only host routes (DST_HOST) get peer-backed metrics. */
108 if (!(rt->dst.flags & DST_HOST))
/* Make sure an inet_peer is bound (create = 1). */
112 rt6_bind_peer(rt, 1);
114 peer = rt->rt6i_peer;
116 u32 *old_p = __DST_METRICS_PTR(old);
117 unsigned long prev, new;
/* First user of this peer's metrics: seed them from the old values. */
120 if (inet_metrics_new(peer))
121 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
123 new = (unsigned long) p;
/* Atomically swing dst->_metrics from the old block to the peer's. */
124 prev = cmpxchg(&dst->_metrics, old, new);
/* Lost the race: another CPU installed metrics first — use theirs. */
127 p = __DST_METRICS_PTR(prev);
128 if (prev & DST_METRICS_READ_ONLY)
/* ip6_neigh_lookup() — dst_ops->neigh_lookup: resolve the next hop via
 * the IPv6 neighbour-discovery table. */
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
137 return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
/* Template dst_ops for regular IPv6 routes (copied per namespace). */
140 static struct dst_ops ip6_dst_ops_template = {
142 .protocol = cpu_to_be16(ETH_P_IPV6),
145 .check = ip6_dst_check,
146 .default_advmss = ip6_default_advmss,
148 .cow_metrics = ipv6_cow_metrics,
149 .destroy = ip6_dst_destroy,
150 .ifdown = ip6_dst_ifdown,
151 .negative_advice = ip6_negative_advice,
152 .link_failure = ip6_link_failure,
153 .update_pmtu = ip6_rt_update_pmtu,
154 .local_out = __ip6_local_out,
155 .neigh_lookup = ip6_neigh_lookup,
/* ip6_blackhole_mtu() — raw MTU metric, falling back to the device MTU
 * when no metric is set. */
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
160 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
162 return mtu ? : dst->dev->mtu;
/* The blackhole variants deliberately do nothing for PMTU updates and
 * metric copy-on-write (bodies elided, visibly empty/stub-like). */
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/* dst_ops used by ip6_blackhole_route() below. */
175 static struct dst_ops ip6_dst_blackhole_ops = {
177 .protocol = cpu_to_be16(ETH_P_IPV6),
178 .destroy = ip6_dst_destroy,
179 .check = ip6_dst_check,
180 .mtu = ip6_blackhole_mtu,
181 .default_advmss = ip6_default_advmss,
182 .update_pmtu = ip6_rt_blackhole_update_pmtu,
183 .cow_metrics = ip6_rt_blackhole_cow_metrics,
184 .neigh_lookup = ip6_neigh_lookup,
/* Default metrics shared by the static sentinel routes below. */
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188 [RTAX_HOPLIMIT - 1] = 0,
/* ip6_null_entry: the "no route" sentinel — rejects with -ENETUNREACH. */
191 static struct rt6_info ip6_null_entry_template = {
193 .__refcnt = ATOMIC_INIT(1),
196 .error = -ENETUNREACH,
197 .input = ip6_pkt_discard,
198 .output = ip6_pkt_discard_out,
200 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
201 .rt6i_protocol = RTPROT_KERNEL,
202 .rt6i_metric = ~(u32) 0,
203 .rt6i_ref = ATOMIC_INIT(1),
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
208 static int ip6_pkt_prohibit(struct sk_buff *skb);
209 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
/* Policy-routing "prohibit" sentinel (multiple-tables builds only). */
211 static struct rt6_info ip6_prohibit_entry_template = {
213 .__refcnt = ATOMIC_INIT(1),
217 .input = ip6_pkt_prohibit,
218 .output = ip6_pkt_prohibit_out,
220 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
221 .rt6i_protocol = RTPROT_KERNEL,
222 .rt6i_metric = ~(u32) 0,
223 .rt6i_ref = ATOMIC_INIT(1),
/* Blackhole sentinel: silently discards traffic in both directions. */
226 static struct rt6_info ip6_blk_hole_entry_template = {
228 .__refcnt = ATOMIC_INIT(1),
232 .input = dst_discard,
233 .output = dst_discard,
235 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
236 .rt6i_protocol = RTPROT_KERNEL,
237 .rt6i_metric = ~(u32) 0,
238 .rt6i_ref = ATOMIC_INIT(1),
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245 struct net_device *dev,
248 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
/* Zero everything past the embedded dst_entry (from rt6i_table on). */
251 memset(&rt->rt6i_table, 0,
252 sizeof(*rt) - sizeof(struct dst_entry));
/* ip6_dst_destroy() — dst_ops->destroy: drop the references held by the
 * route (inet6_dev, inet_peer, non-host metrics block). */
257 static void ip6_dst_destroy(struct dst_entry *dst)
259 struct rt6_info *rt = (struct rt6_info *)dst;
260 struct inet6_dev *idev = rt->rt6i_idev;
261 struct inet_peer *peer = rt->rt6i_peer;
/* Non-host routes own their metrics array — free it generically. */
263 if (!(rt->dst.flags & DST_HOST))
264 dst_destroy_metrics_generic(dst);
267 rt->rt6i_idev = NULL;
271 rt->rt6i_peer = NULL;
/* Generation counter: bumped elsewhere when peer bindings must be
 * revalidated; checked in ip6_dst_check(). */
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
278 static u32 rt6_peer_genid(void)
280 return atomic_read(&__rt6_peer_genid);
/* rt6_bind_peer() — attach an inet_peer for this destination address;
 * the cmpxchg() gracefully handles losing the race to another CPU. */
283 void rt6_bind_peer(struct rt6_info *rt, int create)
285 struct inet_peer *peer;
287 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
291 rt->rt6i_peer_genid = rt6_peer_genid();
/* ip6_dst_ifdown() — dst_ops->ifdown: when @dev goes away, re-point the
 * route's inet6_dev at the loopback device of the same namespace. */
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
297 struct rt6_info *rt = (struct rt6_info *)dst;
298 struct inet6_dev *idev = rt->rt6i_idev;
299 struct net_device *loopback_dev =
300 dev_net(dev)->loopback_dev;
302 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303 struct inet6_dev *loopback_idev =
304 in6_dev_get(loopback_dev);
305 if (loopback_idev != NULL) {
306 rt->rt6i_idev = loopback_idev;
/* True when an RTF_EXPIRES route's deadline has passed. */
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
314 return (rt->rt6i_flags & RTF_EXPIRES) &&
315 time_after(jiffies, rt->rt6i_expires);
/* Destinations of multicast, link-local or loopback scope require an
 * interface-bound (strict) lookup. */
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
320 return ipv6_addr_type(daddr) &
321 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
325 * Route lookup. Any table->tb6_lock is implied.
/* rt6_device_match() — walk a fib6 node's sibling list and pick the
 * route that best satisfies the requested output interface (@oif) and,
 * under RT6_LOOKUP_F_IFACE, the source address.
 * NOTE(review): braces/continue/return lines are elided in this view. */
328 static inline struct rt6_info *rt6_device_match(struct net *net,
330 const struct in6_addr *saddr,
334 struct rt6_info *local = NULL;
335 struct rt6_info *sprt;
/* No constraints at all: the head route is good enough. */
337 if (!oif && ipv6_addr_any(saddr))
340 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 struct net_device *dev = sprt->rt6i_dev;
344 if (dev->ifindex == oif)
/* A loopback route may stand in for @oif when its idev matches. */
346 if (dev->flags & IFF_LOOPBACK) {
347 if (sprt->rt6i_idev == NULL ||
348 sprt->rt6i_idev->dev->ifindex != oif) {
349 if (flags & RT6_LOOKUP_F_IFACE && oif)
351 if (local && (!oif ||
352 local->rt6i_idev->dev->ifindex == oif))
/* Fall back to matching on source address when no oif was given. */
358 if (ipv6_chk_addr(net, saddr, dev,
359 flags & RT6_LOOKUP_F_IFACE))
/* Strict interface lookup with no match: fail with the null entry. */
368 if (flags & RT6_LOOKUP_F_IFACE)
369 return net->ipv6.ip6_null_entry;
375 #ifdef CONFIG_IPV6_ROUTER_PREF
/* rt6_probe() — Router Reachability Probing: send a neighbour
 * solicitation towards a router whose neighbour entry is not in a
 * NUD_VALID state, rate-limited by cnf.rtr_probe_interval. */
376 static void rt6_probe(struct rt6_info *rt)
378 struct neighbour *neigh;
380 * Okay, this does not seem to be appropriate
381 * for now, however, we need to check if it
382 * is really so; aka Router Reachability Probing.
384 * Router Reachability Probe MUST be rate-limited
385 * to no more than one per minute.
388 neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
389 if (!neigh || (neigh->nud_state & NUD_VALID))
391 read_lock_bh(&neigh->lock);
392 if (!(neigh->nud_state & NUD_VALID) &&
393 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 struct in6_addr mcaddr;
395 struct in6_addr *target;
/* Stamp 'updated' before dropping the lock so concurrent probers
 * observe the rate limit. */
397 neigh->updated = jiffies;
398 read_unlock_bh(&neigh->lock);
400 target = (struct in6_addr *)&neigh->primary_key;
401 addrconf_addr_solict_mult(target, &mcaddr);
402 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
404 read_unlock_bh(&neigh->lock);
/* No-op stub when router-preference support is compiled out. */
410 static inline void rt6_probe(struct rt6_info *rt)
416 * Default Router Selection (RFC 2461 6.3.6)
/* Score the device constraint: exact oif match, or a loopback route
 * standing in for oif via its idev. */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
420 struct net_device *dev = rt->rt6i_dev;
421 if (!oif || dev->ifindex == oif)
423 if ((dev->flags & IFF_LOOPBACK) &&
424 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Score next-hop reachability from the neighbour cache state. */
429 static inline int rt6_check_neigh(struct rt6_info *rt)
431 struct neighbour *neigh;
435 neigh = dst_get_neighbour(&rt->dst);
/* Routes without a gateway next hop need no neighbour check. */
436 if (rt->rt6i_flags & RTF_NONEXTHOP ||
437 !(rt->rt6i_flags & RTF_GATEWAY))
440 read_lock_bh(&neigh->lock);
441 if (neigh->nud_state & NUD_VALID)
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444 else if (neigh->nud_state & NUD_FAILED)
449 read_unlock_bh(&neigh->lock);
/* Combine device match, RFC 4191 preference bits and neighbour state
 * into one comparable score for route selection. */
456 static int rt6_score_route(struct rt6_info *rt, int oif,
461 m = rt6_check_dev(rt, oif);
462 if (!m && (strict & RT6_LOOKUP_F_IFACE))
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
467 n = rt6_check_neigh(rt);
468 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/* Keep whichever of (current match, rt) scores higher via *mpri;
 * expired routes never win. */
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474 int *mpri, struct rt6_info *match)
478 if (rt6_check_expired(rt))
481 m = rt6_score_route(rt, oif, strict)
486 if (strict & RT6_LOOKUP_F_REACHABLE)
490 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/* Scan the equal-metric run starting at @rr_head, wrapping around from
 * fn->leaf, returning the best-scoring candidate. */
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499 struct rt6_info *rr_head,
500 u32 metric, int oif, int strict)
502 struct rt6_info *rt, *match;
506 for (rt = rr_head; rt && rt->rt6i_metric == metric;
507 rt = rt->dst.rt6_next)
508 match = find_match(rt, oif, strict, &mpri, match);
509 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510 rt = rt->dst.rt6_next)
511 match = find_match(rt, oif, strict, &mpri, match);
/* rt6_select() — default-router selection: prefer a (probably)
 * reachable router; otherwise round-robin the equal-metric candidates
 * via fn->rr_ptr (see file header comment). */
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
518 struct rt6_info *match, *rt0;
521 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522 __func__, fn->leaf, oif);
526 fn->rr_ptr = rt0 = fn->leaf;
528 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
531 (strict & RT6_LOOKUP_F_REACHABLE)) {
532 struct rt6_info *next = rt0->dst.rt6_next;
534 /* no entries matched; do round-robin */
535 if (!next || next->rt6i_metric != rt0->rt6i_metric)
542 RT6_TRACE("%s() => %p\n",
545 net = dev_net(rt0->rt6i_dev);
546 return match ? match : net->ipv6.ip6_null_entry;
549 #ifdef CONFIG_IPV6_ROUTE_INFO
/* rt6_route_rcv() — parse a Route Information Option received in a
 * Router Advertisement (RFC 4191) and add, refresh, or (lifetime == 0)
 * withdraw the corresponding RTF_ROUTEINFO route.
 * NOTE(review): error-return and brace lines are elided in this view. */
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551 const struct in6_addr *gwaddr)
553 struct net *net = dev_net(dev);
554 struct route_info *rinfo = (struct route_info *) opt;
555 struct in6_addr prefix_buf, *prefix;
557 unsigned long lifetime;
/* Reject options shorter than the fixed header. */
560 if (len < sizeof(struct route_info)) {
564 /* Sanity check for prefix_len and length */
565 if (rinfo->length > 3) {
567 } else if (rinfo->prefix_len > 128) {
569 } else if (rinfo->prefix_len > 64) {
570 if (rinfo->length < 2) {
573 } else if (rinfo->prefix_len > 0) {
574 if (rinfo->length < 1) {
579 pref = rinfo->route_pref;
580 if (pref == ICMPV6_ROUTER_PREF_INVALID)
583 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means the full 16-byte prefix is present in the option;
 * otherwise copy only the covered bits into a local buffer. */
585 if (rinfo->length == 3)
586 prefix = (struct in6_addr *)rinfo->prefix;
588 /* this function is safe */
589 ipv6_addr_prefix(&prefix_buf,
590 (struct in6_addr *)rinfo->prefix,
592 prefix = &prefix_buf;
595 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Zero lifetime withdraws an existing route. */
598 if (rt && !lifetime) {
604 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
607 rt->rt6i_flags = RTF_ROUTEINFO |
608 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* Infinite lifetime clears RTF_EXPIRES; finite sets the deadline. */
611 if (!addrconf_finite_timeout(lifetime)) {
612 rt->rt6i_flags &= ~RTF_EXPIRES;
614 rt->rt6i_expires = jiffies + HZ * lifetime;
615 rt->rt6i_flags |= RTF_EXPIRES;
617 dst_release(&rt->dst);
/* BACKTRACK() — shared lookup helper: when the current match is the
 * null entry, walk back up the fib6 tree (honouring source-routing
 * subtrees) until a node carrying routes (RTN_RTINFO) is found.
 * NOTE(review): the goto/continuation lines of the macro are elided
 * here — presumably it jumps back into the caller's lookup loop. */
623 #define BACKTRACK(__net, saddr) \
625 if (rt == __net->ipv6.ip6_null_entry) { \
626 struct fib6_node *pn; \
628 if (fn->fn_flags & RTN_TL_ROOT) \
631 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
632 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
635 if (fn->fn_flags & RTN_RTINFO) \
/* ip6_pol_route_lookup() — simple (non-cloning) table lookup under
 * tb6_lock: find the node, pick the device match, backtrack if needed,
 * and take a reference with dst_use(). */
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642 struct fib6_table *table,
643 struct flowi6 *fl6, int flags)
645 struct fib6_node *fn;
648 read_lock_bh(&table->tb6_lock);
649 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
652 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653 BACKTRACK(net, &fl6->saddr);
655 dst_use(&rt->dst, jiffies);
656 read_unlock_bh(&table->tb6_lock);
/* rt6_lookup() — public wrapper: build a flowi6 from the addresses and
 * run a policy-rule lookup; returns a referenced rt6_info. */
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662 const struct in6_addr *saddr, int oif, int strict)
664 struct flowi6 fl6 = {
668 struct dst_entry *dst;
669 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
672 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673 flags |= RT6_LOOKUP_F_HAS_SADDR;
676 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
678 return (struct rt6_info *) dst;
685 EXPORT_SYMBOL(rt6_lookup);
/* __ip6_ins_rt()/ip6_ins_rt() — insert @rt into its table under the
 * table write lock.  See the original comment below about lock and
 * ownership rules (its closing lines are elided in this view). */
687 /* ip6_ins_rt is called with FREE table->tb6_lock.
688 It takes new route entry, the addition fails by any reason the
689 route is freed. In any case, if caller does not hold it, it may
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
696 struct fib6_table *table;
698 table = rt->rt6i_table;
699 write_lock_bh(&table->tb6_lock);
700 err = fib6_add(&table->tb6_root, rt, info);
701 write_unlock_bh(&table->tb6_lock);
706 int ip6_ins_rt(struct rt6_info *rt)
708 struct nl_info info = {
709 .nl_net = dev_net(rt->rt6i_dev),
711 return __ip6_ins_rt(rt, &info);
/* rt6_alloc_cow() — clone @ort into a per-destination RTF_CACHE route
 * and resolve its neighbour entry; on neighbour-table overflow it
 * temporarily tightens the GC sysctls and retries once outside softirq.
 * NOTE(review): many braces/gotos are elided between visible lines. */
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715 const struct in6_addr *daddr,
716 const struct in6_addr *saddr)
724 rt = ip6_rt_copy(ort, daddr);
727 struct neighbour *neigh;
/* Only retry the neighbour allocation when not in softirq context. */
728 int attempts = !in_softirq();
730 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
/* A host route cloned from a covering prefix whose address equals
 * the prefix itself is an anycast destination. */
731 if (ort->rt6i_dst.plen != 128 &&
732 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733 rt->rt6i_flags |= RTF_ANYCAST;
734 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
737 rt->rt6i_flags |= RTF_CACHE;
739 #ifdef CONFIG_IPV6_SUBTREES
740 if (rt->rt6i_src.plen && saddr) {
741 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
742 rt->rt6i_src.plen = 128;
747 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
749 struct net *net = dev_net(rt->rt6i_dev);
750 int saved_rt_min_interval =
751 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752 int saved_rt_elasticity =
753 net->ipv6.sysctl.ip6_rt_gc_elasticity;
/* Force an aggressive GC pass, then restore the sysctls. */
755 if (attempts-- > 0) {
756 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
759 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
761 net->ipv6.sysctl.ip6_rt_gc_elasticity =
763 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764 saved_rt_min_interval;
770 "ipv6: Neighbour table overflow.\n");
774 dst_set_neighbour(&rt->dst, neigh);
/* rt6_alloc_clone() — cheap clone for non-host routes: copy @ort and
 * share (clone) its existing neighbour entry. */
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782 const struct in6_addr *daddr)
784 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
787 rt->rt6i_flags |= RTF_CACHE;
788 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
/* ip6_pol_route() — the main input/output lookup: select a route,
 * backtrack as needed, and if the result is not already a cached host
 * route, cow/clone it and insert the clone (re-looking-up on races). */
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794 struct flowi6 *fl6, int flags)
796 struct fib6_node *fn;
797 struct rt6_info *rt, *nrt;
/* Routers (forwarding enabled) skip the reachability requirement. */
801 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
803 strict |= flags & RT6_LOOKUP_F_IFACE;
806 read_lock_bh(&table->tb6_lock);
809 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
812 rt = rt6_select(fn, oif, strict | reachable);
814 BACKTRACK(net, &fl6->saddr);
815 if (rt == net->ipv6.ip6_null_entry ||
816 rt->rt6i_flags & RTF_CACHE)
820 read_unlock_bh(&table->tb6_lock);
/* No neighbour bound yet and a real next hop: make a cow clone;
 * a non-host route gets a plain clone instead. */
822 if (!dst_get_neighbour_raw(&rt->dst)
823 && !(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_LOCAL)))
824 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
825 else if (!(rt->dst.flags & DST_HOST))
826 nrt = rt6_alloc_clone(rt, &fl6->daddr);
830 dst_release(&rt->dst);
831 rt = nrt ? : net->ipv6.ip6_null_entry;
835 err = ip6_ins_rt(nrt);
844 * Race condition! In the gap, when table->tb6_lock was
845 * released someone could insert this route. Relookup.
847 dst_release(&rt->dst);
856 read_unlock_bh(&table->tb6_lock);
858 rt->dst.lastuse = jiffies;
/* Input-path wrapper: route on the incoming interface. */
864 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
865 struct flowi6 *fl6, int flags)
867 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* ip6_route_input() — attach a route to an incoming skb, built from the
 * packet's IPv6 header (addresses, flow label, protocol, mark). */
870 void ip6_route_input(struct sk_buff *skb)
872 const struct ipv6hdr *iph = ipv6_hdr(skb);
873 struct net *net = dev_net(skb->dev);
874 int flags = RT6_LOOKUP_F_HAS_SADDR;
875 struct flowi6 fl6 = {
876 .flowi6_iif = skb->dev->ifindex,
879 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
880 .flowi6_mark = skb->mark,
881 .flowi6_proto = iph->nexthdr,
/* Link-local/mcast destinations need a strict lookup — except on
 * PIM register pseudo-devices. */
884 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
885 flags |= RT6_LOOKUP_F_IFACE;
887 skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
/* Output-path wrapper: route on the requested output interface. */
890 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
891 struct flowi6 *fl6, int flags)
893 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* ip6_route_output() — public output lookup used by the stack; applies
 * interface strictness, source-address and srcprefs flags. */
896 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
901 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
902 flags |= RT6_LOOKUP_F_IFACE;
904 if (!ipv6_addr_any(&fl6->saddr))
905 flags |= RT6_LOOKUP_F_HAS_SADDR;
907 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
909 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
912 EXPORT_SYMBOL(ip6_route_output);
/* ip6_blackhole_route() — convert @dst_orig into a blackhole dst that
 * silently discards traffic, copying addresses/metrics/idev across;
 * always releases dst_orig; returns ERR_PTR(-ENOMEM) on failure. */
914 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
916 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
917 struct dst_entry *new = NULL;
919 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
921 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
926 new->input = dst_discard;
927 new->output = dst_discard;
/* Share read-only metrics directly; otherwise deep-copy them. */
929 if (dst_metrics_read_only(&ort->dst))
930 new->_metrics = ort->dst._metrics;
932 dst_copy_metrics(new, &ort->dst);
933 rt->rt6i_idev = ort->rt6i_idev;
935 in6_dev_hold(rt->rt6i_idev);
936 rt->rt6i_expires = 0;
938 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
939 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
942 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
943 #ifdef CONFIG_IPV6_SUBTREES
944 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
950 dst_release(dst_orig);
951 return new ? new : ERR_PTR(-ENOMEM);
955 * Destination cache support functions
/* ip6_dst_check() — dst_ops->check: a cached dst is still valid while
 * its fib6 node's serial number matches the cookie; stale peer-genid
 * entries get their inet_peer rebound. */
958 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
962 rt = (struct rt6_info *) dst;
964 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
965 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
967 rt6_bind_peer(rt, 0);
968 rt->rt6i_peer_genid = rt6_peer_genid();
/* ip6_negative_advice() — dst_ops->negative_advice: drop an expired
 * cached route when the socket reports trouble with it. */
975 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
977 struct rt6_info *rt = (struct rt6_info *) dst;
980 if (rt->rt6i_flags & RTF_CACHE) {
981 if (rt6_check_expired(rt)) {
/* ip6_link_failure() — dst_ops->link_failure: report address
 * unreachable to the sender and invalidate the route (expire a cached
 * clone, or bump the node serial for a default route). */
993 static void ip6_link_failure(struct sk_buff *skb)
997 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
999 rt = (struct rt6_info *) skb_dst(skb);
1001 if (rt->rt6i_flags&RTF_CACHE) {
1002 dst_set_expires(&rt->dst, 0);
1003 rt->rt6i_flags |= RTF_EXPIRES;
1004 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1005 rt->rt6i_node->fn_sernum = -1;
/* ip6_rt_update_pmtu() — dst_ops->update_pmtu: record a smaller path
 * MTU on a host route; below IPV6_MIN_MTU, keep the minimum MTU but
 * turn on ALLFRAG so every packet carries a fragment header. */
1009 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1011 struct rt6_info *rt6 = (struct rt6_info*)dst;
1013 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1014 rt6->rt6i_flags |= RTF_MODIFIED;
1015 if (mtu < IPV6_MIN_MTU) {
1016 u32 features = dst_metric(dst, RTAX_FEATURES);
1018 features |= RTAX_FEATURE_ALLFRAG;
1019 dst_metric_set(dst, RTAX_FEATURES, features);
1021 dst_metric_set(dst, RTAX_MTU, mtu);
/* ip6_default_advmss() — advertised MSS: path MTU minus IPv6+TCP
 * headers, clamped by the ip6_rt_min_advmss sysctl and IPV6_MAXPLEN. */
1025 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1027 struct net_device *dev = dst->dev;
1028 unsigned int mtu = dst_mtu(dst);
1029 struct net *net = dev_net(dev);
1031 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1033 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1034 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1037 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1038 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1039 * IPV6_MAXPLEN is also valid and means: "any MSS,
1040 * rely only on pmtu discovery"
1042 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* ip6_mtu() — dst_ops->mtu: explicit metric if set, else the
 * interface's configured mtu6. */
1047 static unsigned int ip6_mtu(const struct dst_entry *dst)
1049 struct inet6_dev *idev;
1050 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1058 idev = __in6_dev_get(dst->dev);
1060 mtu = idev->cnf.mtu6;
/* ICMPv6 keeps its own list of table-less dst entries, garbage
 * collected separately from the routing tables. */
1066 static struct dst_entry *icmp6_dst_gc_list;
1067 static DEFINE_SPINLOCK(icmp6_dst_lock);
/* icmp6_dst_alloc() — build a one-off host dst for sending an ICMPv6
 * packet to @addr via @dev, and link it onto the ICMPv6 GC list. */
1069 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1070 struct neighbour *neigh,
1071 const struct in6_addr *addr)
1073 struct rt6_info *rt;
1074 struct inet6_dev *idev = in6_dev_get(dev);
1075 struct net *net = dev_net(dev);
1077 if (unlikely(idev == NULL))
1080 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1081 if (unlikely(rt == NULL)) {
/* Resolve a neighbour if the caller did not supply one. */
1089 neigh = ndisc_get_neigh(dev, addr);
1094 rt->dst.flags |= DST_HOST;
1095 rt->dst.output = ip6_output;
1096 dst_set_neighbour(&rt->dst, neigh);
1097 atomic_set(&rt->dst.__refcnt, 1);
1098 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1099 rt->rt6i_dst.plen = 128;
1100 rt->rt6i_idev = idev;
1101 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
/* Chain onto the ICMPv6 GC list under its spinlock. */
1103 spin_lock_bh(&icmp6_dst_lock);
1104 rt->dst.next = icmp6_dst_gc_list;
1105 icmp6_dst_gc_list = &rt->dst;
1106 spin_unlock_bh(&icmp6_dst_lock);
1108 fib6_force_start_gc(net);
/* icmp6_dst_gc() — walk the ICMPv6 dst list and free entries whose
 * refcount has dropped to zero. */
1114 int icmp6_dst_gc(void)
1116 struct dst_entry *dst, **pprev;
1119 spin_lock_bh(&icmp6_dst_lock);
1120 pprev = &icmp6_dst_gc_list;
1122 while ((dst = *pprev) != NULL) {
1123 if (!atomic_read(&dst->__refcnt)) {
1132 spin_unlock_bh(&icmp6_dst_lock);
/* icmp6_clean_all() — unlink every ICMPv6 dst for which @func returns
 * true (used for per-device cleanup). */
1137 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1140 struct dst_entry *dst, **pprev;
1142 spin_lock_bh(&icmp6_dst_lock);
1143 pprev = &icmp6_dst_gc_list;
1144 while ((dst = *pprev) != NULL) {
1145 struct rt6_info *rt = (struct rt6_info *) dst;
1146 if (func(rt, arg)) {
1153 spin_unlock_bh(&icmp6_dst_lock);
/* ip6_dst_gc() — dst_ops->gc: rate-limited garbage collection of the
 * routing cache, driven by the ip6_rt_* sysctls; returns non-zero when
 * the cache is still over rt_max_size. */
1156 static int ip6_dst_gc(struct dst_ops *ops)
1158 unsigned long now = jiffies;
1159 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1160 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1161 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1162 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1163 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1164 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1167 entries = dst_entries_get_fast(ops);
/* Too soon since the last pass and still under the cap: skip. */
1168 if (time_after(rt_last_gc + rt_min_interval, now) &&
1169 entries <= rt_max_size)
1172 net->ipv6.ip6_rt_gc_expire++;
1173 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1174 net->ipv6.ip6_rt_last_gc = now;
1175 entries = dst_entries_get_slow(ops);
1176 if (entries < ops->gc_thresh)
1177 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1179 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1180 return entries > rt_max_size;
1183 /* Clean host part of a prefix. Not necessary in radix tree,
1184 but results in cleaner routing tables.
1186 Remove it only when all the things will work!
/* ip6_dst_hoplimit() — effective hop limit for @dst: explicit metric,
 * else the interface's (or the namespace default) hop_limit. */
1189 int ip6_dst_hoplimit(struct dst_entry *dst)
1191 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1192 if (hoplimit == 0) {
1193 struct net_device *dev = dst->dev;
1194 struct inet6_dev *idev;
1197 idev = __in6_dev_get(dev);
1199 hoplimit = idev->cnf.hop_limit;
1201 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1206 EXPORT_SYMBOL(ip6_dst_hoplimit);
/* ip6_route_add() — create and insert a route described by @cfg
 * (netlink RTM_NEWROUTE / ioctl path): validates prefix lengths,
 * resolves the device and gateway, fills in the rt6_info and metrics,
 * then inserts via __ip6_ins_rt().
 * NOTE(review): many error-path/goto/brace lines are elided here. */
1212 int ip6_route_add(struct fib6_config *cfg)
1215 struct net *net = cfg->fc_nlinfo.nl_net;
1216 struct rt6_info *rt = NULL;
1217 struct net_device *dev = NULL;
1218 struct inet6_dev *idev = NULL;
1219 struct fib6_table *table;
/* Prefix lengths can never exceed 128 bits. */
1222 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1224 #ifndef CONFIG_IPV6_SUBTREES
/* Source-routing prefixes require CONFIG_IPV6_SUBTREES. */
1225 if (cfg->fc_src_len)
1228 if (cfg->fc_ifindex) {
1230 dev = dev_get_by_index(net, cfg->fc_ifindex);
1233 idev = in6_dev_get(dev);
1238 if (cfg->fc_metric == 0)
1239 cfg->fc_metric = IP6_RT_PRIO_USER;
1241 table = fib6_new_table(net, cfg->fc_table);
1242 if (table == NULL) {
1247 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1254 rt->dst.obsolete = -1;
1255 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1256 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1259 if (cfg->fc_protocol == RTPROT_UNSPEC)
1260 cfg->fc_protocol = RTPROT_BOOT;
1261 rt->rt6i_protocol = cfg->fc_protocol;
1263 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Pick the input handler from the destination's address type. */
1265 if (addr_type & IPV6_ADDR_MULTICAST)
1266 rt->dst.input = ip6_mc_input;
1267 else if (cfg->fc_flags & RTF_LOCAL)
1268 rt->dst.input = ip6_input;
1270 rt->dst.input = ip6_forward;
1272 rt->dst.output = ip6_output;
1274 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1275 rt->rt6i_dst.plen = cfg->fc_dst_len;
1276 if (rt->rt6i_dst.plen == 128)
1277 rt->dst.flags |= DST_HOST;
/* Non-host routes with user metrics need their own metrics array. */
1279 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1280 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1285 dst_init_metrics(&rt->dst, metrics, 0);
1287 #ifdef CONFIG_IPV6_SUBTREES
1288 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1289 rt->rt6i_src.plen = cfg->fc_src_len;
1292 rt->rt6i_metric = cfg->fc_metric;
1294 /* We cannot add true routes via loopback here,
1295 they would result in kernel looping; promote them to reject routes
1297 if ((cfg->fc_flags & RTF_REJECT) ||
1298 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1299 && !(cfg->fc_flags&RTF_LOCAL))) {
1300 /* hold loopback dev/idev if we haven't done so. */
1301 if (dev != net->loopback_dev) {
1306 dev = net->loopback_dev;
1308 idev = in6_dev_get(dev);
1314 rt->dst.output = ip6_pkt_discard_out;
1315 rt->dst.input = ip6_pkt_discard;
1316 rt->dst.error = -ENETUNREACH;
1317 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1321 if (cfg->fc_flags & RTF_GATEWAY) {
1322 const struct in6_addr *gw_addr;
1325 gw_addr = &cfg->fc_gateway;
1326 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1327 gwa_type = ipv6_addr_type(gw_addr);
1329 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1330 struct rt6_info *grt;
1332 /* IPv6 strictly inhibits using not link-local
1333 addresses as nexthop address.
1334 Otherwise, router will not able to send redirects.
1335 It is very good, but in some (rare!) circumstances
1336 (SIT, PtP, NBMA NOARP links) it is handy to allow
1337 some exceptions. --ANK
1340 if (!(gwa_type&IPV6_ADDR_UNICAST))
/* Resolve the gateway's own route to find the device to use. */
1343 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1345 err = -EHOSTUNREACH;
1349 if (dev != grt->rt6i_dev) {
1350 dst_release(&grt->dst);
1354 dev = grt->rt6i_dev;
1355 idev = grt->rt6i_idev;
1357 in6_dev_hold(grt->rt6i_idev);
/* A gateway must itself be directly reachable (non-gatewayed). */
1359 if (!(grt->rt6i_flags&RTF_GATEWAY))
1361 dst_release(&grt->dst);
1367 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
/* Optional preferred source address must be valid on the device. */
1375 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1376 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1380 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1381 rt->rt6i_prefsrc.plen = 128;
1383 rt->rt6i_prefsrc.plen = 0;
/* Pre-resolve the neighbour for gateway/pinned-nexthop routes. */
1385 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1386 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1391 dst_set_neighbour(&rt->dst, n);
1394 rt->rt6i_flags = cfg->fc_flags;
/* Apply user-supplied metrics from the netlink attribute blob. */
1401 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1402 int type = nla_type(nla);
1405 if (type > RTAX_MAX) {
1410 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1416 rt->rt6i_idev = idev;
1417 rt->rt6i_table = table;
1419 cfg->fc_nlinfo.nl_net = dev_net(dev);
1421 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/* __ip6_del_rt() — remove @rt from its table under the write lock;
 * deleting the null sentinel is refused.  Always drops the caller's
 * reference on @rt. */
1433 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1436 struct fib6_table *table;
1437 struct net *net = dev_net(rt->rt6i_dev);
1439 if (rt == net->ipv6.ip6_null_entry) {
1444 table = rt->rt6i_table;
1445 write_lock_bh(&table->tb6_lock);
1446 err = fib6_del(rt, info);
1447 write_unlock_bh(&table->tb6_lock);
1450 dst_release(&rt->dst);
/* Public deletion entry point with default netlink info. */
1454 int ip6_del_rt(struct rt6_info *rt)
1456 struct nl_info info = {
1457 .nl_net = dev_net(rt->rt6i_dev),
1459 return __ip6_del_rt(rt, &info);
/* ip6_route_del() — netlink RTM_DELROUTE: locate the route matching
 * @cfg (prefix, ifindex, gateway, metric) and delete it. */
1462 static int ip6_route_del(struct fib6_config *cfg)
1464 struct fib6_table *table;
1465 struct fib6_node *fn;
1466 struct rt6_info *rt;
1469 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1473 read_lock_bh(&table->tb6_lock);
1475 fn = fib6_locate(&table->tb6_root,
1476 &cfg->fc_dst, cfg->fc_dst_len,
1477 &cfg->fc_src, cfg->fc_src_len);
1480 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
/* Skip entries that don't match the requested selectors. */
1481 if (cfg->fc_ifindex &&
1482 (rt->rt6i_dev == NULL ||
1483 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1485 if (cfg->fc_flags & RTF_GATEWAY &&
1486 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1488 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1491 read_unlock_bh(&table->tb6_lock);
1493 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1496 read_unlock_bh(&table->tb6_lock);
/* flowi6 extended with the redirecting router's address, used only by
 * the redirect lookup below. */
1504 struct ip6rd_flowi {
1506 struct in6_addr gateway;
/* __ip6_route_redirect() — find the route a received redirect applies
 * to: it must be unexpired, gatewayed, on the right interface, and its
 * gateway must be the router that sent the redirect. */
1509 static struct rt6_info *__ip6_route_redirect(struct net *net,
1510 struct fib6_table *table,
1514 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1515 struct rt6_info *rt;
1516 struct fib6_node *fn;
1519 * Get the "current" route for this destination and
1520 * check if the redirect has come from approriate router.
1522 * RFC 2461 specifies that redirects should only be
1523 * accepted if they come from the nexthop to the target.
1524 * Due to the way the routes are chosen, this notion
1525 * is a bit fuzzy and one might need to check all possible
1529 read_lock_bh(&table->tb6_lock);
1530 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1532 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1534 * Current route is on-link; redirect is always invalid.
1536 * Seems, previous statement is not true. It could
1537 * be node, which looks for us as on-link (f.e. proxy ndisc)
1538 * But then router serving it might decide, that we should
1539 * know truth 8)8) --ANK (980726).
1541 if (rt6_check_expired(rt))
1543 if (!(rt->rt6i_flags & RTF_GATEWAY))
1545 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1547 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1553 rt = net->ipv6.ip6_null_entry;
1554 BACKTRACK(net, &fl6->saddr);
1558 read_unlock_bh(&table->tb6_lock);
/* ip6_route_redirect() — wrap the redirect parameters in an
 * ip6rd_flowi and run the policy lookup with __ip6_route_redirect. */
1563 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1564 const struct in6_addr *src,
1565 const struct in6_addr *gateway,
1566 struct net_device *dev)
1568 int flags = RT6_LOOKUP_F_HAS_SADDR;
1569 struct net *net = dev_net(dev);
1570 struct ip6rd_flowi rdfl = {
1572 .flowi6_oif = dev->ifindex,
1578 ipv6_addr_copy(&rdfl.gateway, gateway);
1580 if (rt6_need_strict(dest))
1581 flags |= RT6_LOOKUP_F_IFACE;
1583 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1584 flags, __ip6_route_redirect);
/* Process an accepted ICMPv6 redirect: validate that the sender is the
 * current nexthop (via ip6_route_redirect), update the neighbour entry,
 * then clone the route (ip6_rt_copy) into a RTF_DYNAMIC|RTF_CACHE entry
 * pointing at the new gateway, insert it, and fire a NETEVENT_REDIRECT.
 * NOTE(review): error/cleanup paths and some braces are missing from this
 * extract; the final dst_release() balances the lookup reference. */
1587 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1588 const struct in6_addr *saddr,
1589 struct neighbour *neigh, u8 *lladdr, int on_link)
1591 struct rt6_info *rt, *nrt = NULL;
1592 struct netevent_redirect netevent;
1593 struct net *net = dev_net(neigh->dev);
1595 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1597 if (rt == net->ipv6.ip6_null_entry) {
1598 if (net_ratelimit())
1599 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1600 "for redirect target\n");
1605 * We have finally decided to accept it.
1608 neigh_update(neigh, lladdr, NUD_STALE,
1609 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1610 NEIGH_UPDATE_F_OVERRIDE|
1611 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1612 NEIGH_UPDATE_F_ISROUTER))
1616 * Redirect received -> path was valid.
1617 * Look, redirects are sent only in response to data packets,
1618 * so that this nexthop apparently is reachable. --ANK
1620 dst_confirm(&rt->dst);
1622 /* Duplicate redirect: silently ignore. */
1623 if (neigh == dst_get_neighbour_raw(&rt->dst))
1626 nrt = ip6_rt_copy(rt, dest);
1630 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1632 nrt->rt6i_flags &= ~RTF_GATEWAY;
1634 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1635 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1637 if (ip6_ins_rt(nrt))
1640 netevent.old = &rt->dst;
1641 netevent.new = &nrt->dst;
1642 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1644 if (rt->rt6i_flags&RTF_CACHE) {
1650 dst_release(&rt->dst);
1654 * Handle ICMP "packet too big" messages
1655 * i.e. Path MTU discovery
/* Apply a "Packet Too Big" PMTU update for one lookup scope (ifindex 0 =
 * any interface).  Clamps pmtu to IPV6_MIN_MTU (setting ALLFRAG below the
 * minimum), updates an existing RTF_CACHE host route in place, or clones
 * the route (COW for connected routes, plain clone for gatewayed ones)
 * with the reduced MTU and a 10-minute expiry so PMTU increases can be
 * rediscovered.  NOTE(review): several lines (returns, braces, the
 * allfrag/else structure) are missing from this extract. */
1658 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1659 struct net *net, u32 pmtu, int ifindex)
1661 struct rt6_info *rt, *nrt;
1664 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1668 if (rt6_check_expired(rt)) {
1673 if (pmtu >= dst_mtu(&rt->dst))
1676 if (pmtu < IPV6_MIN_MTU) {
1678 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1679 * MTU (1280) and a fragment header should always be included
1680 * after a node receiving Too Big message reporting PMTU is
1681 * less than the IPv6 Minimum Link MTU.
1683 pmtu = IPV6_MIN_MTU;
1687 /* New mtu received -> path was valid.
1688 They are sent only in response to data packets,
1689 so that this nexthop apparently is reachable. --ANK
1691 dst_confirm(&rt->dst);
1693 /* Host route. If it is static, it would be better
1694 not to override it, but add new one, so that
1695 when cache entry will expire old pmtu
1696 would return automatically.
1698 if (rt->rt6i_flags & RTF_CACHE) {
1699 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1701 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1702 features |= RTAX_FEATURE_ALLFRAG;
1703 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1705 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1706 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1711 Two cases are possible:
1712 1. It is connected route. Action: COW
1713 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1715 if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1716 nrt = rt6_alloc_cow(rt, daddr, saddr);
1718 nrt = rt6_alloc_clone(rt, daddr);
1721 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1723 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1724 features |= RTAX_FEATURE_ALLFRAG;
1725 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1728 /* According to RFC 1981, detecting PMTU increase shouldn't be
1729 * happened within 5 mins, the recommended timer is 10 mins.
1730 * Here this route expiration time is set to ip6_rt_mtu_expires
1731 * which is 10 mins. After 10 mins the decreased pmtu is expired
1732 * and detecting PMTU increase will be automatically happened.
1734 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1735 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1740 dst_release(&rt->dst);
/* Public PMTU entry point: apply the update twice — once unscoped
 * (ifindex 0, the interface future packets will use) and once scoped to
 * the interface that received the Packet Too Big, covering the
 * SO_BINDTODEVICE case described in the comment below. */
1743 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1744 struct net_device *dev, u32 pmtu)
1746 struct net *net = dev_net(dev);
1749 * RFC 1981 states that a node "MUST reduce the size of the packets it
1750 * is sending along the path" that caused the Packet Too Big message.
1751 * Since it's not possible in the general case to determine which
1752 * interface was used to send the original packet, we update the MTU
1753 * on the interface that will be used to send future packets. We also
1754 * update the MTU on the interface that received the Packet Too Big in
1755 * case the original packet was forced out that interface with
1756 * SO_BINDTODEVICE or similar. This is the next best thing to the
1757 * correct behaviour, which would be to update the MTU on all
1760 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1761 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1765 * Misc support functions
/* Allocate a new rt6_info cloned from @ort but re-keyed as a /128 host
 * route to @dest: copies handlers, metrics, idev (with a hold), gateway
 * and flags (clearing RTF_EXPIRES), and resets metric/expiry.
 * NOTE(review): the allocation-failure branch and return are missing
 * from this extract. */
1768 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1769 const struct in6_addr *dest)
1771 struct net *net = dev_net(ort->rt6i_dev);
1772 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1776 rt->dst.input = ort->dst.input;
1777 rt->dst.output = ort->dst.output;
1778 rt->dst.flags |= DST_HOST;
1780 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1781 rt->rt6i_dst.plen = 128;
1782 dst_copy_metrics(&rt->dst, &ort->dst);
1783 rt->dst.error = ort->dst.error;
1784 rt->rt6i_idev = ort->rt6i_idev;
1786 in6_dev_hold(rt->rt6i_idev);
1787 rt->dst.lastuse = jiffies;
1788 rt->rt6i_expires = 0;
1790 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1791 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1792 rt->rt6i_metric = 0;
1794 #ifdef CONFIG_IPV6_SUBTREES
1795 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1797 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1798 rt->rt6i_table = ort->rt6i_table;
1803 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an existing RA route-information route (RTF_ROUTEINFO|RTF_GATEWAY)
 * for (prefix/prefixlen, gwaddr, ifindex) in RT6_TABLE_INFO, or NULL.
 * NOTE(review): the break/hold on match and the return are missing from
 * this extract. */
1804 static struct rt6_info *rt6_get_route_info(struct net *net,
1805 const struct in6_addr *prefix, int prefixlen,
1806 const struct in6_addr *gwaddr, int ifindex)
1808 struct fib6_node *fn;
1809 struct rt6_info *rt = NULL;
1810 struct fib6_table *table;
1812 table = fib6_get_table(net, RT6_TABLE_INFO);
1816 write_lock_bh(&table->tb6_lock);
1817 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1821 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1822 if (rt->rt6i_dev->ifindex != ifindex)
1824 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1826 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1832 write_unlock_bh(&table->tb6_lock);
/* Install an RA route-information route into RT6_TABLE_INFO and return it
 * via rt6_get_route_info().  A zero prefix length is treated as a default
 * route (RTF_DEFAULT). */
1836 static struct rt6_info *rt6_add_route_info(struct net *net,
1837 const struct in6_addr *prefix, int prefixlen,
1838 const struct in6_addr *gwaddr, int ifindex,
1841 struct fib6_config cfg = {
1842 .fc_table = RT6_TABLE_INFO,
1843 .fc_metric = IP6_RT_PRIO_USER,
1844 .fc_ifindex = ifindex,
1845 .fc_dst_len = prefixlen,
1846 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1847 RTF_UP | RTF_PREF(pref),
1849 .fc_nlinfo.nlh = NULL,
1850 .fc_nlinfo.nl_net = net,
1853 ipv6_addr_copy(&cfg.fc_dst, prefix);
1854 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1856 /* We should treat it as a default route if prefix length is 0. */
1858 cfg.fc_flags |= RTF_DEFAULT;
1860 ip6_route_add(&cfg);
1862 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/* Look up the RA-learned default route (RTF_ADDRCONF|RTF_DEFAULT) for
 * gateway @addr on @dev in RT6_TABLE_DFLT; returns it or NULL.
 * NOTE(review): the break/hold and return are missing from this extract. */
1866 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1868 struct rt6_info *rt;
1869 struct fib6_table *table;
1871 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1875 write_lock_bh(&table->tb6_lock);
1876 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1877 if (dev == rt->rt6i_dev &&
1878 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1879 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1884 write_unlock_bh(&table->tb6_lock);
/* Install an RA-learned default router entry in RT6_TABLE_DFLT
 * (RTF_ADDRCONF|RTF_DEFAULT|RTF_EXPIRES) and return it via
 * rt6_get_dflt_router(). */
1888 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1889 struct net_device *dev,
1892 struct fib6_config cfg = {
1893 .fc_table = RT6_TABLE_DFLT,
1894 .fc_metric = IP6_RT_PRIO_USER,
1895 .fc_ifindex = dev->ifindex,
1896 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1897 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1899 .fc_nlinfo.nlh = NULL,
1900 .fc_nlinfo.nl_net = dev_net(dev),
1903 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1905 ip6_route_add(&cfg);
1907 return rt6_get_dflt_router(gwaddr, dev);
/* Delete every RA-learned default route in RT6_TABLE_DFLT unless the
 * interface has accept_ra == 2 (always accept).  The lock is dropped
 * before each deletion (lines missing here) and the scan restarts. */
1910 void rt6_purge_dflt_routers(struct net *net)
1912 struct rt6_info *rt;
1913 struct fib6_table *table;
1915 /* NOTE: Keep consistent with rt6_get_dflt_router */
1916 table = fib6_get_table(net, RT6_TABLE_DFLT);
1921 read_lock_bh(&table->tb6_lock);
1922 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1923 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1924 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1926 read_unlock_bh(&table->tb6_lock);
1931 read_unlock_bh(&table->tb6_lock);
/* Translate a legacy ioctl in6_rtmsg into a fib6_config (main table,
 * addresses and lengths copied field by field). */
1934 static void rtmsg_to_fib6_config(struct net *net,
1935 struct in6_rtmsg *rtmsg,
1936 struct fib6_config *cfg)
1938 memset(cfg, 0, sizeof(*cfg));
1940 cfg->fc_table = RT6_TABLE_MAIN;
1941 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1942 cfg->fc_metric = rtmsg->rtmsg_metric;
1943 cfg->fc_expires = rtmsg->rtmsg_info;
1944 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1945 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1946 cfg->fc_flags = rtmsg->rtmsg_flags;
1948 cfg->fc_nlinfo.nl_net = net;
1950 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1951 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1952 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/* SIOCADDRT/SIOCDELRT ioctl handler: requires CAP_NET_ADMIN, copies the
 * in6_rtmsg from userspace, converts it and dispatches to
 * ip6_route_add()/ip6_route_del().  NOTE(review): the tail of this
 * function (rtnl locking, default case, return) is missing from this
 * extract. */
1955 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1957 struct fib6_config cfg;
1958 struct in6_rtmsg rtmsg;
1962 case SIOCADDRT: /* Add a route */
1963 case SIOCDELRT: /* Delete a route */
1964 if (!capable(CAP_NET_ADMIN))
1966 err = copy_from_user(&rtmsg, arg,
1967 sizeof(struct in6_rtmsg));
1971 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1976 err = ip6_route_add(&cfg);
1979 err = ip6_route_del(&cfg);
1993 * Drop the packet on the floor
/* Common drop path for no-route packets: bump the appropriate SNMP
 * counter (INADDRERRORS for unspecified-destination input, otherwise the
 * caller-supplied no-route counter) and send an ICMPv6 destination
 * unreachable with @code.  NOTE(review): the fallthrough/break structure
 * and kfree_skb tail are missing from this extract. */
1996 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1999 struct dst_entry *dst = skb_dst(skb);
2000 switch (ipstats_mib_noroutes) {
2001 case IPSTATS_MIB_INNOROUTES:
2002 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2003 if (type == IPV6_ADDR_ANY) {
2004 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2005 IPSTATS_MIB_INADDRERRORS);
2009 case IPSTATS_MIB_OUTNOROUTES:
2010 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2011 ipstats_mib_noroutes);
2014 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for the null route: drop with "no route" ICMP. */
2019 static int ip6_pkt_discard(struct sk_buff *skb)
2021 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler for the null route: set skb->dev from the dst so
 * the ICMP error is attributed correctly, then drop. */
2024 static int ip6_pkt_discard_out(struct sk_buff *skb)
2026 skb->dev = skb_dst(skb)->dev;
2027 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2030 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* dst input handler for the prohibit route: drop with "administratively
 * prohibited" ICMP. */
2032 static int ip6_pkt_prohibit(struct sk_buff *skb)
2034 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst output handler for the prohibit route: as above, on the output
 * path. */
2037 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2039 skb->dev = skb_dst(skb)->dev;
2040 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2046 * Allocate a dst for local (unicast / anycast) address.
/* Allocate a host (/128) route on the loopback device for a local
 * unicast or anycast address: sets RTF_UP|RTF_NONEXTHOP plus RTF_ANYCAST
 * or RTF_LOCAL (the selecting condition line is missing from this
 * extract), binds a neighbour entry, and inserts into RT6_TABLE_LOCAL.
 * Returns the route with one reference, or an ERR_PTR on allocation /
 * neighbour failure. */
2049 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2050 const struct in6_addr *addr,
2053 struct net *net = dev_net(idev->dev);
2054 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2055 net->loopback_dev, 0);
2056 struct neighbour *neigh;
2059 if (net_ratelimit())
2060 pr_warning("IPv6: Maximum number of routes reached,"
2061 " consider increasing route/max_size.\n");
2062 return ERR_PTR(-ENOMEM);
2067 rt->dst.flags |= DST_HOST;
2068 rt->dst.input = ip6_input;
2069 rt->dst.output = ip6_output;
2070 rt->rt6i_idev = idev;
2071 rt->dst.obsolete = -1;
2073 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2075 rt->rt6i_flags |= RTF_ANYCAST;
2077 rt->rt6i_flags |= RTF_LOCAL;
2078 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2079 if (IS_ERR(neigh)) {
2082 return ERR_CAST(neigh);
2084 dst_set_neighbour(&rt->dst, neigh);
2086 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2087 rt->rt6i_dst.plen = 128;
2088 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2090 atomic_set(&rt->dst.__refcnt, 1);
/* Pick a source address for @daddr: use the route's preferred source
 * (rt6i_prefsrc) when set, otherwise fall back to the standard
 * ipv6_dev_get_saddr() device selection.  NOTE(review): the else/return
 * structure between the two branches is missing from this extract. */
2095 int ip6_route_get_saddr(struct net *net,
2096 struct rt6_info *rt,
2097 const struct in6_addr *daddr,
2099 struct in6_addr *saddr)
2101 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2103 if (rt->rt6i_prefsrc.plen)
2104 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2106 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2107 daddr, prefs, saddr);
2111 /* remove deleted ip from prefsrc entries */
2112 struct arg_dev_net_ip {
2113 struct net_device *dev;
2115 struct in6_addr *addr;
/* fib6_clean_all() callback: clear the preferred-source field of any
 * route (except the null entry) that still references the removed
 * address on the matching device. */
2118 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2120 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2121 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2122 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2124 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2125 rt != net->ipv6.ip6_null_entry &&
2126 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2127 /* remove prefsrc entry */
2128 rt->rt6i_prefsrc.plen = 0;
/* Called when address @ifp is deleted: sweep the whole FIB and drop any
 * prefsrc reference to it (the .addr initializer line is missing from
 * this extract). */
2135 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2137 struct net *net = dev_net(ifp->idev->dev);
2138 struct arg_dev_net_ip adni = {
2139 .dev = ifp->idev->dev,
2141 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Walker argument for fib6_ifdown(): the device going down (NULL = all)
 * plus the owning net (member line missing from this extract). */
2144 struct arg_dev_net {
2145 struct net_device *dev;
/* fib6_clean_all() callback: select routes on the downed device (except
 * the null entry) for deletion.  NOTE(review): the return statements are
 * missing from this extract. */
2149 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2151 const struct arg_dev_net *adn = arg;
2152 const struct net_device *dev = adn->dev;
2154 if ((rt->rt6i_dev == dev || dev == NULL) &&
2155 rt != adn->net->ipv6.ip6_null_entry) {
2156 RT6_TRACE("deleted by ifdown %p\n", rt);
/* Purge routes referencing @dev from both the FIB and the ICMP route
 * cache when the device goes down (struct initializer lines missing from
 * this extract). */
2164 void rt6_ifdown(struct net *net, struct net_device *dev)
2166 struct arg_dev_net adn = {
2171 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2172 icmp6_clean_all(fib6_ifdown, &adn);
/* Walker argument for rt6_mtu_change_route(): the device whose MTU
 * changed (the mtu member line is missing from this extract). */
2173 struct rt6_mtu_change_arg
2175 struct net_device *dev;
/* fib6_clean_all() callback for a device MTU change: update a route's
 * RTAX_MTU metric when it is on the device, not locked, and either had a
 * larger MTU (decrease) or its MTU equalled the old device MTU
 * (increase, per the rationale in the long comment below). */
2181 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2183 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2184 struct inet6_dev *idev;
2186 /* In IPv6 pmtu discovery is not optional,
2187 so that RTAX_MTU lock cannot disable it.
2188 We still use this lock to block changes
2189 caused by addrconf/ndisc.
2192 idev = __in6_dev_get(arg->dev);
2196 /* For administrative MTU increase, there is no way to discover
2197 IPv6 PMTU increase, so PMTU increase should be updated here.
2198 Since RFC 1981 doesn't include administrative MTU increase
2199 update PMTU increase is a MUST. (i.e. jumbo frame)
2202 If new MTU is less than route PMTU, this new MTU will be the
2203 lowest MTU in the path, update the route PMTU to reflect PMTU
2204 decreases; if new MTU is greater than route PMTU, and the
2205 old MTU is the lowest MTU in the path, update the route PMTU
2206 to reflect the increase. In this case if the other nodes' MTU
2207 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2210 if (rt->rt6i_dev == arg->dev &&
2211 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2212 (dst_mtu(&rt->dst) >= arg->mtu ||
2213 (dst_mtu(&rt->dst) < arg->mtu &&
2214 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2215 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Propagate a device MTU change to every affected route in the FIB
 * (struct initializer lines are missing from this extract). */
2218 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2220 struct rt6_mtu_change_arg arg = {
2225 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* requests; attrs not
 * listed (e.g. RTA_DST/RTA_SRC) are length-checked by the handlers. */
2228 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2229 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2230 [RTA_OIF] = { .type = NLA_U32 },
2231 [RTA_IIF] = { .type = NLA_U32 },
2232 [RTA_PRIORITY] = { .type = NLA_U32 },
2233 [RTA_METRICS] = { .type = NLA_NESTED },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config:
 * validate attributes against rtm_ipv6_policy, then copy table, prefix
 * lengths, flags (RTF_REJECT for RTN_UNREACHABLE, RTF_LOCAL for
 * RTN_LOCAL), addresses and metrics.  NOTE(review): error returns and
 * some if-guards (e.g. the tb[RTA_DST]/tb[RTA_TABLE] checks) are missing
 * from this extract. */
2236 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2237 struct fib6_config *cfg)
2240 struct nlattr *tb[RTA_MAX+1];
2243 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2248 rtm = nlmsg_data(nlh);
2249 memset(cfg, 0, sizeof(*cfg));
2251 cfg->fc_table = rtm->rtm_table;
2252 cfg->fc_dst_len = rtm->rtm_dst_len;
2253 cfg->fc_src_len = rtm->rtm_src_len;
2254 cfg->fc_flags = RTF_UP;
2255 cfg->fc_protocol = rtm->rtm_protocol;
2257 if (rtm->rtm_type == RTN_UNREACHABLE)
2258 cfg->fc_flags |= RTF_REJECT;
2260 if (rtm->rtm_type == RTN_LOCAL)
2261 cfg->fc_flags |= RTF_LOCAL;
2263 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2264 cfg->fc_nlinfo.nlh = nlh;
2265 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2267 if (tb[RTA_GATEWAY]) {
2268 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2269 cfg->fc_flags |= RTF_GATEWAY;
2273 int plen = (rtm->rtm_dst_len + 7) >> 3;
2275 if (nla_len(tb[RTA_DST]) < plen)
2278 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2282 int plen = (rtm->rtm_src_len + 7) >> 3;
2284 if (nla_len(tb[RTA_SRC]) < plen)
2287 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2290 if (tb[RTA_PREFSRC])
2291 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2294 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2296 if (tb[RTA_PRIORITY])
2297 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2299 if (tb[RTA_METRICS]) {
2300 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2301 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2305 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse the message and delete the route. */
2312 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2314 struct fib6_config cfg;
2317 err = rtm_to_fib6_config(skb, nlh, &cfg);
2321 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the message and add the route. */
2324 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2326 struct fib6_config cfg;
2329 err = rtm_to_fib6_config(skb, nlh, &cfg);
2333 return ip6_route_add(&cfg);
/* Worst-case netlink message size for one route, used to size the skb in
 * inet6_rt_notify(); rt6_fill_node() must never emit more than this
 * (enforced by the WARN_ON(-EMSGSIZE) there). */
2336 static inline size_t rt6_nlmsg_size(void)
2338 return NLMSG_ALIGN(sizeof(struct rtmsg))
2339 + nla_total_size(16) /* RTA_SRC */
2340 + nla_total_size(16) /* RTA_DST */
2341 + nla_total_size(16) /* RTA_GATEWAY */
2342 + nla_total_size(16) /* RTA_PREFSRC */
2343 + nla_total_size(4) /* RTA_TABLE */
2344 + nla_total_size(4) /* RTA_IIF */
2345 + nla_total_size(4) /* RTA_OIF */
2346 + nla_total_size(4) /* RTA_PRIORITY */
2347 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2348 + nla_total_size(sizeof(struct rta_cacheinfo));
/* Serialize one rt6_info into a netlink RTM message: rtmsg header (type,
 * protocol, table), RTA_DST/SRC (either exact /128 addresses from the
 * query or the route's own prefixes), multicast route resolution via
 * ip6mr_get_route(), RTA_IIF/PREFSRC, metrics, gateway (from the dst's
 * neighbour), OIF, priority and cacheinfo/expiry.  @prefix limits dumps
 * to RTF_PREFIX_RT routes; @nowait is forwarded to ip6mr.  Returns the
 * nlmsg_end() result or -EMSGSIZE-style failure via nla_put_failure.
 * NOTE(review): numerous lines (gotos, else branches, the expires/error
 * bookkeeping) are missing from this extract; treat the visible control
 * flow as incomplete. */
2351 static int rt6_fill_node(struct net *net,
2352 struct sk_buff *skb, struct rt6_info *rt,
2353 struct in6_addr *dst, struct in6_addr *src,
2354 int iif, int type, u32 pid, u32 seq,
2355 int prefix, int nowait, unsigned int flags)
2358 struct nlmsghdr *nlh;
2361 struct neighbour *n;
2363 if (prefix) { /* user wants prefix routes only */
2364 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2365 /* success since this is not a prefix route */
2370 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2374 rtm = nlmsg_data(nlh);
2375 rtm->rtm_family = AF_INET6;
2376 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2377 rtm->rtm_src_len = rt->rt6i_src.plen;
2380 table = rt->rt6i_table->tb6_id;
2382 table = RT6_TABLE_UNSPEC;
2383 rtm->rtm_table = table;
2384 NLA_PUT_U32(skb, RTA_TABLE, table);
2385 if (rt->rt6i_flags&RTF_REJECT)
2386 rtm->rtm_type = RTN_UNREACHABLE;
2387 else if (rt->rt6i_flags&RTF_LOCAL)
2388 rtm->rtm_type = RTN_LOCAL;
2389 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2390 rtm->rtm_type = RTN_LOCAL;
2392 rtm->rtm_type = RTN_UNICAST;
2394 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2395 rtm->rtm_protocol = rt->rt6i_protocol;
2396 if (rt->rt6i_flags&RTF_DYNAMIC)
2397 rtm->rtm_protocol = RTPROT_REDIRECT;
2398 else if (rt->rt6i_flags & RTF_ADDRCONF)
2399 rtm->rtm_protocol = RTPROT_KERNEL;
2400 else if (rt->rt6i_flags&RTF_DEFAULT)
2401 rtm->rtm_protocol = RTPROT_RA;
2403 if (rt->rt6i_flags&RTF_CACHE)
2404 rtm->rtm_flags |= RTM_F_CLONED;
2407 NLA_PUT(skb, RTA_DST, 16, dst);
2408 rtm->rtm_dst_len = 128;
2409 } else if (rtm->rtm_dst_len)
2410 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2411 #ifdef CONFIG_IPV6_SUBTREES
2413 NLA_PUT(skb, RTA_SRC, 16, src);
2414 rtm->rtm_src_len = 128;
2415 } else if (rtm->rtm_src_len)
2416 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2419 #ifdef CONFIG_IPV6_MROUTE
2420 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2421 int err = ip6mr_get_route(net, skb, rtm, nowait);
2426 goto nla_put_failure;
2428 if (err == -EMSGSIZE)
2429 goto nla_put_failure;
2434 NLA_PUT_U32(skb, RTA_IIF, iif);
2436 struct in6_addr saddr_buf;
2437 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2438 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2441 if (rt->rt6i_prefsrc.plen) {
2442 struct in6_addr saddr_buf;
2443 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2444 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2447 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2448 goto nla_put_failure;
2451 n = dst_get_neighbour(&rt->dst);
2453 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2455 goto nla_put_failure;
2461 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2463 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2465 if (!(rt->rt6i_flags & RTF_EXPIRES))
2467 else if (rt->rt6i_expires - jiffies < INT_MAX)
2468 expires = rt->rt6i_expires - jiffies;
2472 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2473 expires, rt->dst.error) < 0)
2474 goto nla_put_failure;
2476 return nlmsg_end(skb, nlh);
2479 nlmsg_cancel(skb, nlh);
/* Per-route callback for RTM_GETROUTE dumps: honour the RTM_F_PREFIX
 * filter from the request header and emit one RTM_NEWROUTE record. */
2483 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2485 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2488 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2489 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2490 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2494 return rt6_fill_node(arg->net,
2495 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2496 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2497 prefix, 0, NLM_F_MULTI);
/* RTM_GETROUTE handler: build a flowi6 from RTA_SRC/DST/IIF/OIF, resolve
 * the route via ip6_route_output(), serialize it with rt6_fill_node()
 * and unicast the reply to the requester.  NOTE(review): error paths,
 * the iif device check body and several returns are missing from this
 * extract. */
2500 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2502 struct net *net = sock_net(in_skb->sk);
2503 struct nlattr *tb[RTA_MAX+1];
2504 struct rt6_info *rt;
2505 struct sk_buff *skb;
2510 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2515 memset(&fl6, 0, sizeof(fl6));
2518 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2521 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2525 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2528 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2532 iif = nla_get_u32(tb[RTA_IIF]);
2535 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2538 struct net_device *dev;
2539 dev = __dev_get_by_index(net, iif);
2546 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2552 /* Reserve room for dummy headers, this skb can pass
2553 through good chunk of routing engine.
2555 skb_reset_mac_header(skb);
2556 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2558 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2559 skb_dst_set(skb, &rt->dst);
2561 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2562 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2563 nlh->nlmsg_seq, 0, 0, 0);
2569 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/* Broadcast a route add/delete event to RTNLGRP_IPV6_ROUTE listeners.
 * The skb is sized by rt6_nlmsg_size(); an -EMSGSIZE from rt6_fill_node()
 * indicates that size calculation is wrong, hence the WARN_ON.  On any
 * failure the group is told via rtnl_set_sk_err(). */
2574 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2576 struct sk_buff *skb;
2577 struct net *net = info->nl_net;
2582 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2584 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2588 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2589 event, info->pid, seq, 0, 0, 0);
2591 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2592 WARN_ON(err == -EMSGSIZE);
2596 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2597 info->nlh, gfp_any());
2601 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier: when the per-net loopback device registers, bind
 * the template null/prohibit/blackhole routes to it (they are allocated
 * in ip6_route_net_init() before loopback exists). */
2604 static int ip6_route_dev_notify(struct notifier_block *this,
2605 unsigned long event, void *data)
2607 struct net_device *dev = (struct net_device *)data;
2608 struct net *net = dev_net(dev);
2610 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2611 net->ipv6.ip6_null_entry->dst.dev = dev;
2612 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2613 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2614 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2615 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2616 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2617 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2628 #ifdef CONFIG_PROC_FS
/* /proc/net/ipv6_route formatter for one route: dst prefix, src prefix
 * (zeros when subtrees are compiled out), gateway from the neighbour
 * entry (zeros when none), then metric/refcnt/use/flags/devname. */
2639 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2641 struct seq_file *m = p_arg;
2642 struct neighbour *n;
2644 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2646 #ifdef CONFIG_IPV6_SUBTREES
2647 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2649 seq_puts(m, "00000000000000000000000000000000 00 ");
2652 n = dst_get_neighbour(&rt->dst);
2654 seq_printf(m, "%pi6", n->primary_key);
2656 seq_puts(m, "00000000000000000000000000000000");
2659 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2660 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2661 rt->dst.__use, rt->rt6i_flags,
2662 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/* seq_file show: walk the whole FIB, printing each route via
 * rt6_info_route(). */
2666 static int ipv6_route_show(struct seq_file *m, void *v)
2668 struct net *net = (struct net *)m->private;
2669 fib6_clean_all(net, rt6_info_route, 0, m);
/* open() for /proc/net/ipv6_route: per-net single-shot seq file. */
2673 static int ipv6_route_open(struct inode *inode, struct file *file)
2675 return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route (the .read line is missing
 * from this extract). */
2678 static const struct file_operations ipv6_route_proc_fops = {
2679 .owner = THIS_MODULE,
2680 .open = ipv6_route_open,
2682 .llseek = seq_lseek,
2683 .release = single_release_net,
/* /proc/net/rt6_stats: one line of hex counters — fib nodes, route
 * nodes, rt allocs, rt entries, cached entries, live dst entries,
 * discarded routes. */
2688 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2690 struct net *net = (struct net *)seq->private;
2691 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2692 net->ipv6.rt6_stats->fib_nodes,
2693 net->ipv6.rt6_stats->fib_route_nodes,
2694 net->ipv6.rt6_stats->fib_rt_alloc,
2695 net->ipv6.rt6_stats->fib_rt_entries,
2696 net->ipv6.rt6_stats->fib_rt_cache,
2697 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2698 net->ipv6.rt6_stats->fib_discarded_routes);
/* open() for /proc/net/rt6_stats. */
2703 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2705 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats (the .read line is missing
 * from this extract). */
2706 static const struct file_operations rt6_stats_seq_fops = {
2707 .owner = THIS_MODULE,
2708 .open = rt6_stats_seq_open,
2710 .llseek = seq_lseek,
2711 .release = single_release_net,
2713 #endif /* CONFIG_PROC_FS */
2715 #ifdef CONFIG_SYSCTL
/* sysctl handler for net.ipv6.route.flush: read the configured delay and
 * kick a garbage-collection run (delay <= 0 forces an immediate full
 * flush via ~0UL).  NOTE(review): the write-only guard and return lines
 * are missing from this extract. */
2718 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2719 void __user *buffer, size_t *lenp, loff_t *ppos)
2726 net = (struct net *)ctl->extra1;
2727 delay = net->ipv6.sysctl.flush_delay;
2728 proc_dointvec(ctl, write, buffer, lenp, ppos);
2729 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/* Template for the per-net net.ipv6.route.* sysctl table; the .data
 * pointers reference init_net and are re-pointed at each net's own
 * fields in ipv6_route_sysctl_init() (indices there must match the
 * entry order here).  Mode fields are missing from this extract. */
2733 ctl_table ipv6_route_table_template[] = {
2735 .procname = "flush",
2736 .data = &init_net.ipv6.sysctl.flush_delay,
2737 .maxlen = sizeof(int),
2739 .proc_handler = ipv6_sysctl_rtcache_flush
2742 .procname = "gc_thresh",
2743 .data = &ip6_dst_ops_template.gc_thresh,
2744 .maxlen = sizeof(int),
2746 .proc_handler = proc_dointvec,
2749 .procname = "max_size",
2750 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2751 .maxlen = sizeof(int),
2753 .proc_handler = proc_dointvec,
2756 .procname = "gc_min_interval",
2757 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2758 .maxlen = sizeof(int),
2760 .proc_handler = proc_dointvec_jiffies,
2763 .procname = "gc_timeout",
2764 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2765 .maxlen = sizeof(int),
2767 .proc_handler = proc_dointvec_jiffies,
2770 .procname = "gc_interval",
2771 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2772 .maxlen = sizeof(int),
2774 .proc_handler = proc_dointvec_jiffies,
2777 .procname = "gc_elasticity",
2778 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2779 .maxlen = sizeof(int),
2781 .proc_handler = proc_dointvec,
2784 .procname = "mtu_expires",
2785 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2786 .maxlen = sizeof(int),
2788 .proc_handler = proc_dointvec_jiffies,
2791 .procname = "min_adv_mss",
2792 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2793 .maxlen = sizeof(int),
2795 .proc_handler = proc_dointvec,
2798 .procname = "gc_min_interval_ms",
2799 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2800 .maxlen = sizeof(int),
2802 .proc_handler = proc_dointvec_ms_jiffies,
/* Duplicate the sysctl template for a new net namespace and re-point
 * each .data at the namespace's own fields.  The numeric indices must
 * stay in sync with the entry order of ipv6_route_table_template.
 * NOTE(review): the NULL-check after kmemdup and the return are missing
 * from this extract. */
2807 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2809 struct ctl_table *table;
2811 table = kmemdup(ipv6_route_table_template,
2812 sizeof(ipv6_route_table_template),
2816 table[0].data = &net->ipv6.sysctl.flush_delay;
2817 table[0].extra1 = net;
2818 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2819 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2820 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2821 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2822 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2823 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2824 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2825 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2826 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/* Per-net init: clone dst_ops and the null (and, with multiple tables,
 * prohibit/blackhole) route templates, wire their dst.path/ops/metrics,
 * and seed the sysctl defaults.  Unwinds allocations in reverse via the
 * goto labels at the bottom.  NOTE(review): the out_ip6_null_entry label
 * and both return statements are missing from this extract. */
2833 static int __net_init ip6_route_net_init(struct net *net)
2837 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2838 sizeof(net->ipv6.ip6_dst_ops));
2840 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2841 goto out_ip6_dst_ops;
2843 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2844 sizeof(*net->ipv6.ip6_null_entry),
2846 if (!net->ipv6.ip6_null_entry)
2847 goto out_ip6_dst_entries;
2848 net->ipv6.ip6_null_entry->dst.path =
2849 (struct dst_entry *)net->ipv6.ip6_null_entry;
2850 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2851 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2852 ip6_template_metrics, true);
2854 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2855 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2856 sizeof(*net->ipv6.ip6_prohibit_entry),
2858 if (!net->ipv6.ip6_prohibit_entry)
2859 goto out_ip6_null_entry;
2860 net->ipv6.ip6_prohibit_entry->dst.path =
2861 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2862 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2863 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2864 ip6_template_metrics, true);
2866 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2867 sizeof(*net->ipv6.ip6_blk_hole_entry),
2869 if (!net->ipv6.ip6_blk_hole_entry)
2870 goto out_ip6_prohibit_entry;
2871 net->ipv6.ip6_blk_hole_entry->dst.path =
2872 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2873 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2874 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2875 ip6_template_metrics, true);
2878 net->ipv6.sysctl.flush_delay = 0;
2879 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2880 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2881 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2882 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2883 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2884 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2885 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2887 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2893 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2894 out_ip6_prohibit_entry:
2895 kfree(net->ipv6.ip6_prohibit_entry);
2897 kfree(net->ipv6.ip6_null_entry);
2899 out_ip6_dst_entries:
2900 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-net teardown: free the template routes allocated in
 * ip6_route_net_init() and destroy the dst entry counter. */
2905 static void __net_exit ip6_route_net_exit(struct net *net)
2907 kfree(net->ipv6.ip6_null_entry);
2908 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2909 kfree(net->ipv6.ip6_prohibit_entry);
2910 kfree(net->ipv6.ip6_blk_hole_entry);
2912 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-net init: register the /proc entries (after everything they
 * report on exists). */
2915 static int __net_init ip6_route_net_init_late(struct net *net)
2917 #ifdef CONFIG_PROC_FS
2918 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2919 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
/* Late per-net teardown: remove the /proc entries. */
2924 static void __net_exit ip6_route_net_exit_late(struct net *net)
2926 #ifdef CONFIG_PROC_FS
2927 proc_net_remove(net, "ipv6_route");
2928 proc_net_remove(net, "rt6_stats");
/* Main per-netns hooks: allocate/free the template routes and sysctl
 * defaults when a network namespace is created/destroyed.
 * NOTE(review): sampled dump — the closing '};' is not visible here. */
2932 static struct pernet_operations ip6_route_net_ops = {
2933 .init = ip6_route_net_init,
2934 .exit = ip6_route_net_exit,
/* Late per-netns hooks: proc entries must be created after the rest of
 * the per-netns routing state is in place, hence a separate ops struct.
 * NOTE(review): sampled dump — the closing '};' is not visible here. */
2937 static struct pernet_operations ip6_route_net_late_ops = {
2938 .init = ip6_route_net_init_late,
2939 .exit = ip6_route_net_exit_late,
/* Netdevice event notifier; the callback is defined elsewhere in this
 * file (not visible in this chunk).
 * NOTE(review): sampled dump — the closing '};' is not visible here. */
2942 static struct notifier_block ip6_route_dev_notifier = {
2943 .notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - module-wide initialisation of the IPv6 routing layer.
 *
 * Order matters: the dst slab cache and blackhole dst-entry counter come
 * first, then the per-netns subsystem (which allocates init_net's template
 * routes), then fib6/rules, the late per-netns subsystem (proc files),
 * the rtnetlink route handlers, and finally the netdevice notifier.
 * Error paths unwind in reverse via the goto labels at the bottom.
 *
 * NOTE(review): this chunk is a sampled dump — several 'if (ret)' guard
 * lines, 'return' statements and braces of the original are not visible,
 * so control flow reads as broken here; the code lines themselves are
 * untouched.
 */
2947 int __init ip6_route_init(void)
/* Slab cache for struct rt6_info; shared with the blackhole ops below. */
2952 ip6_dst_ops_template.kmem_cachep =
2953 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2954 SLAB_HWCACHE_ALIGN, NULL);
2955 if (!ip6_dst_ops_template.kmem_cachep)
2958 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2960 goto out_kmem_cache;
2962 ret = register_pernet_subsys(&ip6_route_net_ops);
2964 goto out_dst_entries;
2966 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2968 /* Registering of the loopback is done before this portion of code,
2969 * the loopback reference in rt6_info will not be taken, do it
2970 * manually for init_net */
2971 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2972 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2973 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2974 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2975 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2977 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2981 goto out_register_subsys;
2987 ret = fib6_rules_init();
/* Proc entries are registered only once everything above succeeded. */
2991 ret = register_pernet_subsys(&ip6_route_net_late_ops);
2993 goto fib6_rules_init;
/* rtnetlink handlers for RTM_{NEW,DEL,GET}ROUTE; any failure unwinds. */
2996 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2997 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2998 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2999 goto out_register_late_subsys;
3001 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3003 goto out_register_late_subsys;
/* Error unwinding, strictly in reverse order of the setup above. */
3008 out_register_late_subsys:
3009 unregister_pernet_subsys(&ip6_route_net_late_ops);
3011 fib6_rules_cleanup();
3016 out_register_subsys:
3017 unregister_pernet_subsys(&ip6_route_net_ops);
3019 dst_entries_destroy(&ip6_dst_blackhole_ops);
3021 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3025 void ip6_route_cleanup(void)
3027 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3028 unregister_pernet_subsys(&ip6_route_net_late_ops);
3029 fib6_rules_cleanup();
3032 unregister_pernet_subsys(&ip6_route_net_ops);
3033 dst_entries_destroy(&ip6_dst_blackhole_ops);
3034 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);