2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
61 #include <asm/uaccess.h>
64 #include <linux/sysctl.h>
/*
 * Forward declarations for the dst_ops callbacks and the
 * CONFIG_IPV6_ROUTE_INFO (RFC 4191 Route Information option) helpers
 * that are defined further down in this file.
 */
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68 const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void ip6_dst_destroy(struct dst_entry *);
74 static void ip6_dst_ifdown(struct dst_entry *,
75 struct net_device *dev, int how);
76 static int ip6_dst_gc(struct dst_ops *ops);
78 static int ip6_pkt_discard(struct sk_buff *skb);
79 static int ip6_pkt_discard_out(struct sk_buff *skb);
80 static void ip6_link_failure(struct sk_buff *skb);
81 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85 const struct in6_addr *prefix, int prefixlen,
86 const struct in6_addr *gwaddr, int ifindex,
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89 const struct in6_addr *prefix, int prefixlen,
90 const struct in6_addr *gwaddr, int ifindex);
/*
 * ipv6_cow_metrics - dst_ops->cow_metrics callback.
 *
 * Gives a host route its own writable metrics array before a metric is
 * modified: binds an inet_peer to the route, copies the current (shared,
 * read-only) metrics into the peer's storage, and installs the new pointer
 * with cmpxchg() so a concurrent writer that got there first wins.
 * NOTE(review): only DST_HOST routes are handled here (see the flags test
 * below); the non-host path is in elided lines — confirm against full source.
 */
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
95 struct rt6_info *rt = (struct rt6_info *) dst;
96 struct inet_peer *peer;
99 if (!(rt->dst.flags & DST_HOST))
103 rt6_bind_peer(rt, 1);
105 peer = rt->rt6i_peer;
107 u32 *old_p = __DST_METRICS_PTR(old);
108 unsigned long prev, new;
/* First user of this peer's metrics: seed them from the old array. */
111 if (inet_metrics_new(peer))
112 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
114 new = (unsigned long) p;
/* Atomically swing dst->_metrics; if we lost the race, use the winner's. */
115 prev = cmpxchg(&dst->_metrics, old, new);
118 p = __DST_METRICS_PTR(prev);
119 if (prev & DST_METRICS_READ_ONLY)
/*
 * choose_neigh_daddr - pick the neighbour-cache key for a route: use the
 * route's gateway when one is set, otherwise (elided path) the packet's
 * destination address.
 */
126 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
128 struct in6_addr *p = &rt->rt6i_gateway;
130 if (!ipv6_addr_any(p))
131 return (const void *) p;
/*
 * ip6_neigh_lookup - dst_ops->neigh_lookup callback: find the ndisc
 * neighbour entry for this dst, creating one if the lookup misses.
 */
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
137 struct rt6_info *rt = (struct rt6_info *) dst;
140 daddr = choose_neigh_daddr(rt, daddr);
141 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
144 return neigh_create(&nd_tbl, daddr, dst->dev);
/*
 * rt6_bind_neighbour - attach a neighbour entry (looked up or newly
 * created for the route's gateway) to the route's dst.
 */
147 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
149 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
151 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
155 dst_set_neighbour(&rt->dst, n);
/*
 * Template dst_ops for normal IPv6 routes; copied per-netns at init.
 * Wires the callbacks declared/defined in this file into the generic
 * dst cache machinery.
 */
160 static struct dst_ops ip6_dst_ops_template = {
162 .protocol = cpu_to_be16(ETH_P_IPV6),
165 .check = ip6_dst_check,
166 .default_advmss = ip6_default_advmss,
168 .cow_metrics = ipv6_cow_metrics,
169 .destroy = ip6_dst_destroy,
170 .ifdown = ip6_dst_ifdown,
171 .negative_advice = ip6_negative_advice,
172 .link_failure = ip6_link_failure,
173 .update_pmtu = ip6_rt_update_pmtu,
174 .local_out = __ip6_local_out,
175 .neigh_lookup = ip6_neigh_lookup,
/*
 * ip6_blackhole_mtu - MTU for blackhole dsts: the raw metric if set,
 * otherwise fall back to the device MTU.
 */
178 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
180 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
182 return mtu ? : dst->dev->mtu;
/* Blackhole routes deliberately ignore PMTU updates (no-op stub). */
185 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
/* Blackhole routes never get writable metrics (stub). */
189 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
/*
 * dst_ops used for "blackhole" clones handed to xfrm when the real
 * route must not be modified (see ip6_blackhole_route()).
 */
195 static struct dst_ops ip6_dst_blackhole_ops = {
197 .protocol = cpu_to_be16(ETH_P_IPV6),
198 .destroy = ip6_dst_destroy,
199 .check = ip6_dst_check,
200 .mtu = ip6_blackhole_mtu,
201 .default_advmss = ip6_default_advmss,
202 .update_pmtu = ip6_rt_blackhole_update_pmtu,
203 .cow_metrics = ip6_rt_blackhole_cow_metrics,
204 .neigh_lookup = ip6_neigh_lookup,
/* Default metrics for the template routes below: hop limit 255 only. */
207 static const u32 ip6_template_metrics[RTAX_MAX] = {
208 [RTAX_HOPLIMIT - 1] = 255,
/*
 * Template for the per-netns "null" route: matches when nothing else
 * does, rejects with -ENETUNREACH and discards the packet.  The maximal
 * metric keeps it at the bottom of any lookup.
 */
211 static struct rt6_info ip6_null_entry_template = {
213 .__refcnt = ATOMIC_INIT(1),
216 .error = -ENETUNREACH,
217 .input = ip6_pkt_discard,
218 .output = ip6_pkt_discard_out,
220 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
221 .rt6i_protocol = RTPROT_KERNEL,
222 .rt6i_metric = ~(u32) 0,
223 .rt6i_ref = ATOMIC_INIT(1),
/* With policy routing, rules may also yield "prohibit" and "blackhole". */
226 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
228 static int ip6_pkt_prohibit(struct sk_buff *skb);
229 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
/* Template for the "prohibit" route (administratively prohibited). */
231 static struct rt6_info ip6_prohibit_entry_template = {
233 .__refcnt = ATOMIC_INIT(1),
237 .input = ip6_pkt_prohibit,
238 .output = ip6_pkt_prohibit_out,
240 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
241 .rt6i_protocol = RTPROT_KERNEL,
242 .rt6i_metric = ~(u32) 0,
243 .rt6i_ref = ATOMIC_INIT(1),
/* Template for the "blackhole" route: silently discard, no error. */
246 static struct rt6_info ip6_blk_hole_entry_template = {
248 .__refcnt = ATOMIC_INIT(1),
252 .input = dst_discard,
253 .output = dst_discard,
255 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
256 .rt6i_protocol = RTPROT_KERNEL,
257 .rt6i_metric = ~(u32) 0,
258 .rt6i_ref = ATOMIC_INIT(1),
263 /* allocate dst with ip6_dst_ops */
264 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
265 struct net_device *dev,
/* Zero the rt6_info-specific tail that follows the embedded dst_entry. */
268 struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
271 memset(&rt->rt6i_table, 0,
272 sizeof(*rt) - sizeof(struct dst_entry));
/*
 * ip6_dst_destroy - dst_ops->destroy callback: release everything the
 * route holds (non-host metrics, inet6_dev ref, parent "from" dst when
 * expiry is inherited, and the bound inet_peer).
 */
277 static void ip6_dst_destroy(struct dst_entry *dst)
279 struct rt6_info *rt = (struct rt6_info *)dst;
280 struct inet6_dev *idev = rt->rt6i_idev;
281 struct inet_peer *peer = rt->rt6i_peer;
283 if (!(rt->dst.flags & DST_HOST))
284 dst_destroy_metrics_generic(dst);
287 rt->rt6i_idev = NULL;
/* Non-RTF_EXPIRES routes may borrow expiry from a parent via dst->from. */
291 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
292 dst_release(dst->from);
295 rt->rt6i_peer = NULL;
/* Bumped whenever peer state changes; lets cached dsts re-bind lazily. */
300 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
302 static u32 rt6_peer_genid(void)
304 return atomic_read(&__rt6_peer_genid);
/*
 * rt6_bind_peer - attach the inet_peer for this destination; cmpxchg
 * guards against a concurrent binder, and the generation id is recorded
 * so ip6_dst_check() can detect staleness.
 */
307 void rt6_bind_peer(struct rt6_info *rt, int create)
309 struct net *net = dev_net(rt->dst.dev);
310 struct inet_peer *peer;
312 peer = inet_getpeer_v6(net, &rt->rt6i_dst.addr, create);
313 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
316 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * ip6_dst_ifdown - dst_ops->ifdown callback: when the route's device
 * goes away, re-point rt6i_idev at the netns loopback device so the
 * route stays usable until it is garbage collected.
 */
319 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
322 struct rt6_info *rt = (struct rt6_info *)dst;
323 struct inet6_dev *idev = rt->rt6i_idev;
324 struct net_device *loopback_dev =
325 dev_net(dev)->loopback_dev;
327 if (dev != loopback_dev && idev && idev->dev == dev) {
328 struct inet6_dev *loopback_idev =
329 in6_dev_get(loopback_dev);
331 rt->rt6i_idev = loopback_idev;
/*
 * rt6_check_expired - true if the route has timed out, either via its
 * own RTF_EXPIRES deadline or via the parent route it was cloned from
 * (dst.from).
 */
337 static bool rt6_check_expired(const struct rt6_info *rt)
339 struct rt6_info *ort = NULL;
341 if (rt->rt6i_flags & RTF_EXPIRES) {
342 if (time_after(jiffies, rt->dst.expires))
344 } else if (rt->dst.from) {
345 ort = (struct rt6_info *) rt->dst.from;
346 return (ort->rt6i_flags & RTF_EXPIRES) &&
347 time_after(jiffies, ort->dst.expires);
/*
 * rt6_need_strict - destinations with link-local scope semantics
 * (multicast, link-local, loopback) require a strict interface match.
 */
352 static bool rt6_need_strict(const struct in6_addr *daddr)
354 return ipv6_addr_type(daddr) &
355 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
359 * Route lookup. Any table->tb6_lock is implied.
/*
 * rt6_device_match - among routes with the same prefix, pick the one
 * whose device (or, for loopback routes, whose idev) matches the
 * requested oif, or whose device owns the source address.  Falls back
 * to the null entry when RT6_LOOKUP_F_IFACE demands a match and none
 * was found.
 */
362 static inline struct rt6_info *rt6_device_match(struct net *net,
364 const struct in6_addr *saddr,
368 struct rt6_info *local = NULL;
369 struct rt6_info *sprt;
/* No constraints at all: the head route is fine (elided fast return). */
371 if (!oif && ipv6_addr_any(saddr))
374 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
375 struct net_device *dev = sprt->dst.dev;
378 if (dev->ifindex == oif)
380 if (dev->flags & IFF_LOOPBACK) {
381 if (!sprt->rt6i_idev ||
382 sprt->rt6i_idev->dev->ifindex != oif) {
383 if (flags & RT6_LOOKUP_F_IFACE && oif)
385 if (local && (!oif ||
386 local->rt6i_idev->dev->ifindex == oif))
/* saddr constraint: device must own the source address. */
392 if (ipv6_chk_addr(net, saddr, dev,
393 flags & RT6_LOOKUP_F_IFACE))
402 if (flags & RT6_LOOKUP_F_IFACE)
403 return net->ipv6.ip6_null_entry;
409 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - router reachability probing: if the router's neighbour
 * entry is not NUD_VALID and the per-idev probe interval has elapsed,
 * send a unicast-solicit NS to it.  Rate limiting comes from bumping
 * neigh->updated before sending.
 */
410 static void rt6_probe(struct rt6_info *rt)
412 struct neighbour *neigh;
414 * Okay, this does not seem to be appropriate
415 * for now, however, we need to check if it
416 * is really so; aka Router Reachability Probing.
418 * Router Reachability Probe MUST be rate-limited
419 * to no more than one per minute.
422 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
423 if (!neigh || (neigh->nud_state & NUD_VALID))
425 read_lock_bh(&neigh->lock);
426 if (!(neigh->nud_state & NUD_VALID) &&
427 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
428 struct in6_addr mcaddr;
429 struct in6_addr *target;
/* Stamp first so concurrent callers see the probe as already sent. */
431 neigh->updated = jiffies;
432 read_unlock_bh(&neigh->lock);
434 target = (struct in6_addr *)&neigh->primary_key;
435 addrconf_addr_solict_mult(target, &mcaddr);
436 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
438 read_unlock_bh(&neigh->lock);
/* !CONFIG_IPV6_ROUTER_PREF: probing compiled out (no-op). */
444 static inline void rt6_probe(struct rt6_info *rt)
450 * Default Router Selection (RFC 2461 6.3.6)
/*
 * rt6_check_dev - device-match component of the route score: matches
 * when no oif is requested, the device matches, or (for loopback
 * routes) the idev's device matches.
 */
452 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
454 struct net_device *dev = rt->dst.dev;
455 if (!oif || dev->ifindex == oif)
457 if ((dev->flags & IFF_LOOPBACK) &&
458 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * rt6_check_neigh - reachability component of the score, derived from
 * the neighbour's NUD state (NONEXTHOP / non-gateway routes are treated
 * as trivially reachable; the exact return values are in elided lines).
 */
463 static inline int rt6_check_neigh(struct rt6_info *rt)
465 struct neighbour *neigh;
469 neigh = dst_get_neighbour_noref(&rt->dst);
470 if (rt->rt6i_flags & RTF_NONEXTHOP ||
471 !(rt->rt6i_flags & RTF_GATEWAY))
474 read_lock_bh(&neigh->lock);
475 if (neigh->nud_state & NUD_VALID)
477 #ifdef CONFIG_IPV6_ROUTER_PREF
478 else if (neigh->nud_state & NUD_FAILED)
483 read_unlock_bh(&neigh->lock);
/*
 * rt6_score_route - combine device match, RFC 4191 router preference
 * (when configured) and neighbour reachability into a single metric;
 * strict-mode failures disqualify the route.
 */
490 static int rt6_score_route(struct rt6_info *rt, int oif,
495 m = rt6_check_dev(rt, oif);
496 if (!m && (strict & RT6_LOOKUP_F_IFACE))
498 #ifdef CONFIG_IPV6_ROUTER_PREF
499 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
501 n = rt6_check_neigh(rt);
502 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * find_match - fold one candidate route into the running best (match,
 * *mpri); skips expired routes and may trigger rt6_probe() on the
 * reachable path (elided).
 */
507 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
508 int *mpri, struct rt6_info *match)
512 if (rt6_check_expired(rt))
515 m = rt6_score_route(rt, oif, strict)
520 if (strict & RT6_LOOKUP_F_REACHABLE)
524 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * find_rr_leaf - score every route at the same metric, walking from the
 * round-robin head to the end and then from the leaf back up to the
 * head, so the scan covers the whole equal-metric group exactly once.
 */
532 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
533 struct rt6_info *rr_head,
534 u32 metric, int oif, int strict)
536 struct rt6_info *rt, *match;
540 for (rt = rr_head; rt && rt->rt6i_metric == metric;
541 rt = rt->dst.rt6_next)
542 match = find_match(rt, oif, strict, &mpri, match);
543 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
544 rt = rt->dst.rt6_next)
545 match = find_match(rt, oif, strict, &mpri, match);
/*
 * rt6_select - default router selection: pick the best route in the
 * node; when none is reachable, advance the round-robin pointer so the
 * next lookup tries a different router.  Falls back to the null entry.
 */
550 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
552 struct rt6_info *match, *rt0;
557 fn->rr_ptr = rt0 = fn->leaf;
559 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
562 (strict & RT6_LOOKUP_F_REACHABLE)) {
563 struct rt6_info *next = rt0->dst.rt6_next;
565 /* no entries matched; do round-robin */
566 if (!next || next->rt6i_metric != rt0->rt6i_metric)
573 net = dev_net(rt0->dst.dev);
574 return match ? match : net->ipv6.ip6_null_entry;
577 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - handle a Route Information option from a Router
 * Advertisement (RFC 4191): validate the option length against the
 * prefix length, then add, refresh, or (lifetime 0) remove the
 * corresponding RTF_ROUTEINFO route.
 */
578 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
579 const struct in6_addr *gwaddr)
581 struct net *net = dev_net(dev);
582 struct route_info *rinfo = (struct route_info *) opt;
583 struct in6_addr prefix_buf, *prefix;
585 unsigned long lifetime;
588 if (len < sizeof(struct route_info)) {
592 /* Sanity check for prefix_len and length */
593 if (rinfo->length > 3) {
595 } else if (rinfo->prefix_len > 128) {
597 } else if (rinfo->prefix_len > 64) {
598 if (rinfo->length < 2) {
601 } else if (rinfo->prefix_len > 0) {
602 if (rinfo->length < 1) {
607 pref = rinfo->route_pref;
608 if (pref == ICMPV6_ROUTER_PREF_INVALID)
611 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means the full 128-bit prefix is present in the option. */
613 if (rinfo->length == 3)
614 prefix = (struct in6_addr *)rinfo->prefix;
616 /* this function is safe */
617 ipv6_addr_prefix(&prefix_buf,
618 (struct in6_addr *)rinfo->prefix,
620 prefix = &prefix_buf;
623 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Zero lifetime: withdraw the route (deletion path elided). */
626 if (rt && !lifetime) {
632 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
635 rt->rt6i_flags = RTF_ROUTEINFO |
636 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
639 if (!addrconf_finite_timeout(lifetime))
640 rt6_clean_expires(rt);
642 rt6_set_expires(rt, jiffies + HZ * lifetime);
644 dst_release(&rt->dst);
/*
 * BACKTRACK - shared lookup-retry macro: when the match was the null
 * entry, walk up the fib tree (consulting source-routed subtrees where
 * configured) and jump back to the caller's restart label on the first
 * ancestor that carries route info.  Requires locals `rt` and `fn` and
 * is used with tb6_lock held.
 */
650 #define BACKTRACK(__net, saddr) \
652 if (rt == __net->ipv6.ip6_null_entry) { \
653 struct fib6_node *pn; \
655 if (fn->fn_flags & RTN_TL_ROOT) \
658 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
659 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
662 if (fn->fn_flags & RTN_RTINFO) \
/*
 * ip6_pol_route_lookup - simple (no cloning) table lookup under
 * tb6_lock: fib6_lookup, device match, BACKTRACK on null, then take a
 * reference with dst_use().
 */
668 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
669 struct fib6_table *table,
670 struct flowi6 *fl6, int flags)
672 struct fib6_node *fn;
675 read_lock_bh(&table->tb6_lock);
676 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
679 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
680 BACKTRACK(net, &fl6->saddr);
682 dst_use(&rt->dst, jiffies);
683 read_unlock_bh(&table->tb6_lock);
/* Policy-rule front end for the simple lookup above. */
688 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
691 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
693 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/*
 * rt6_lookup - convenience wrapper: build a flowi6 from daddr/saddr/oif
 * and run the simple lookup; `strict` maps to RT6_LOOKUP_F_IFACE.
 */
695 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
696 const struct in6_addr *saddr, int oif, int strict)
698 struct flowi6 fl6 = {
702 struct dst_entry *dst;
703 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
706 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
707 flags |= RT6_LOOKUP_F_HAS_SADDR;
710 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
712 return (struct rt6_info *) dst;
719 EXPORT_SYMBOL(rt6_lookup);
721 /* ip6_ins_rt is called with FREE table->tb6_lock.
722 It takes new route entry, the addition fails by any reason the
723 route is freed. In any case, if caller does not hold it, it may
/* Insert a route into its table under the table write lock. */
727 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
730 struct fib6_table *table;
732 table = rt->rt6i_table;
733 write_lock_bh(&table->tb6_lock);
734 err = fib6_add(&table->tb6_root, rt, info);
735 write_unlock_bh(&table->tb6_lock);
/* Public wrapper: insert with a default nl_info for the route's netns. */
740 int ip6_ins_rt(struct rt6_info *rt)
742 struct nl_info info = {
743 .nl_net = dev_net(rt->dst.dev),
745 return __ip6_ins_rt(rt, &info);
/*
 * rt6_alloc_cow - clone a (gateway) route into a host RTF_CACHE entry
 * for one destination and bind its neighbour.  On neighbour-table
 * overflow it temporarily forces aggressive GC (elasticity 1, zero min
 * interval), runs ip6_dst_gc(), restores the sysctls and retries a
 * bounded number of times (no retries when called from softirq).
 */
748 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
749 const struct in6_addr *daddr,
750 const struct in6_addr *saddr)
758 rt = ip6_rt_copy(ort, daddr);
761 int attempts = !in_softirq();
763 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
/* Cloning a non /128 on-link route to its own address => anycast. */
764 if (ort->rt6i_dst.plen != 128 &&
765 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
766 rt->rt6i_flags |= RTF_ANYCAST;
767 rt->rt6i_gateway = *daddr;
770 rt->rt6i_flags |= RTF_CACHE;
772 #ifdef CONFIG_IPV6_SUBTREES
773 if (rt->rt6i_src.plen && saddr) {
774 rt->rt6i_src.addr = *saddr;
775 rt->rt6i_src.plen = 128;
780 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
781 struct net *net = dev_net(rt->dst.dev);
782 int saved_rt_min_interval =
783 net->ipv6.sysctl.ip6_rt_gc_min_interval;
784 int saved_rt_elasticity =
785 net->ipv6.sysctl.ip6_rt_gc_elasticity;
787 if (attempts-- > 0) {
788 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
789 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
791 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
793 net->ipv6.sysctl.ip6_rt_gc_elasticity =
795 net->ipv6.sysctl.ip6_rt_gc_min_interval =
796 saved_rt_min_interval;
800 net_warn_ratelimited("Neighbour table overflow\n");
/*
 * rt6_alloc_clone - lighter clone for non-gateway routes: copy the
 * route, mark it RTF_CACHE and share the parent's neighbour entry.
 */
809 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
810 const struct in6_addr *daddr)
812 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
815 rt->rt6i_flags |= RTF_CACHE;
816 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
/*
 * ip6_pol_route - main routing lookup: select the best route (with
 * reachability scoring unless forwarding is enabled), and if it is not
 * already a cached host route, create a cow/clone copy and insert it.
 * If insertion races with another CPU the new entry is dropped and the
 * lookup restarts (elided relookup path).
 */
821 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
822 struct flowi6 *fl6, int flags)
824 struct fib6_node *fn;
825 struct rt6_info *rt, *nrt;
/* Routers don't require reachable next hops; hosts do (RFC 4861). */
829 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
831 strict |= flags & RT6_LOOKUP_F_IFACE;
834 read_lock_bh(&table->tb6_lock);
837 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
840 rt = rt6_select(fn, oif, strict | reachable);
842 BACKTRACK(net, &fl6->saddr);
843 if (rt == net->ipv6.ip6_null_entry ||
844 rt->rt6i_flags & RTF_CACHE)
848 read_unlock_bh(&table->tb6_lock);
/* No neighbour bound yet and a next hop exists: make a cow clone. */
850 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
851 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
852 else if (!(rt->dst.flags & DST_HOST))
853 nrt = rt6_alloc_clone(rt, &fl6->daddr);
857 dst_release(&rt->dst);
858 rt = nrt ? : net->ipv6.ip6_null_entry;
862 err = ip6_ins_rt(nrt);
871 * Race condition! In the gap, when table->tb6_lock was
872 * released someone could insert this route. Relookup.
874 dst_release(&rt->dst);
883 read_unlock_bh(&table->tb6_lock);
885 rt->dst.lastuse = jiffies;
/* Input-path adapter: route on the incoming interface (flowi6_iif). */
891 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
892 struct flowi6 *fl6, int flags)
894 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/*
 * ip6_route_input_lookup - input lookup through the policy rules;
 * strict-scope destinations force an interface match except on PIM
 * register devices.
 */
897 static struct dst_entry *ip6_route_input_lookup(struct net *net,
898 struct net_device *dev,
899 struct flowi6 *fl6, int flags)
901 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
902 flags |= RT6_LOOKUP_F_IFACE;
904 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input)
/*
 * ip6_route_input - attach a route to an incoming skb: build the flow
 * key from the IPv6 header (addresses, flow label, nexthdr) and the
 * skb mark, then set skb->dst from the lookup result.
 */
907 void ip6_route_input(struct sk_buff *skb)
909 const struct ipv6hdr *iph = ipv6_hdr(skb);
910 struct net *net = dev_net(skb->dev);
911 int flags = RT6_LOOKUP_F_HAS_SADDR;
912 struct flowi6 fl6 = {
913 .flowi6_iif = skb->dev->ifindex,
916 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
917 .flowi6_mark = skb->mark,
918 .flowi6_proto = iph->nexthdr,
921 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path adapter: route on the outgoing interface (flowi6_oif). */
924 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
925 struct flowi6 *fl6, int flags)
927 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/*
 * ip6_route_output - public output lookup: socket device binding or a
 * strict-scope destination forces an interface match; a non-any source
 * address and socket srcprefs feed the lookup flags.
 */
930 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
935 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
936 flags |= RT6_LOOKUP_F_IFACE;
938 if (!ipv6_addr_any(&fl6->saddr))
939 flags |= RT6_LOOKUP_F_HAS_SADDR;
941 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
943 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
946 EXPORT_SYMBOL(ip6_route_output);
/*
 * ip6_blackhole_route - duplicate a route as an immutable "blackhole"
 * dst (discarding input/output) for xfrm: copy the identifying fields
 * and metrics from the original, then release the original.  Returns
 * ERR_PTR(-ENOMEM) when the clone could not be allocated.
 */
948 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
950 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
951 struct dst_entry *new = NULL;
953 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
955 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
960 new->input = dst_discard;
961 new->output = dst_discard;
/* Shared (read-only) metrics can be aliased; writable ones are copied. */
963 if (dst_metrics_read_only(&ort->dst))
964 new->_metrics = ort->dst._metrics;
966 dst_copy_metrics(new, &ort->dst);
967 rt->rt6i_idev = ort->rt6i_idev;
969 in6_dev_hold(rt->rt6i_idev);
971 rt->rt6i_gateway = ort->rt6i_gateway;
972 rt->rt6i_flags = ort->rt6i_flags;
973 rt6_clean_expires(rt);
976 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
977 #ifdef CONFIG_IPV6_SUBTREES
978 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
984 dst_release(dst_orig);
985 return new ? new : ERR_PTR(-ENOMEM);
989 * Destination cache support functions
/*
 * ip6_dst_check - dst_ops->check: a cached dst is still valid while its
 * fib node's serial number matches the cookie; re-bind the inet_peer
 * when the peer generation has moved on.
 */
992 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
996 rt = (struct rt6_info *) dst;
998 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
999 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1001 rt6_bind_peer(rt, 0);
1002 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * ip6_negative_advice - dst_ops->negative_advice: drop expired cached
 * routes when the stack reports trouble with this dst (deletion path
 * elided).
 */
1009 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1011 struct rt6_info *rt = (struct rt6_info *) dst;
1014 if (rt->rt6i_flags & RTF_CACHE) {
1015 if (rt6_check_expired(rt)) {
/*
 * ip6_link_failure - dst_ops->link_failure: tell the sender the
 * address is unreachable; expire the cached route immediately, or
 * invalidate the default route's fib node serial so it is re-looked up.
 */
1027 static void ip6_link_failure(struct sk_buff *skb)
1029 struct rt6_info *rt;
1031 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1033 rt = (struct rt6_info *) skb_dst(skb);
1035 if (rt->rt6i_flags & RTF_CACHE)
1036 rt6_update_expires(rt, 0);
1037 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1038 rt->rt6i_node->fn_sernum = -1;
/*
 * ip6_rt_update_pmtu - dst_ops->update_pmtu: lower the MTU metric on a
 * host route; below IPV6_MIN_MTU the ALLFRAG feature is set instead of
 * going under the protocol minimum (clamping detail elided).
 */
1042 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1044 struct rt6_info *rt6 = (struct rt6_info*)dst;
1046 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1047 rt6->rt6i_flags |= RTF_MODIFIED;
1048 if (mtu < IPV6_MIN_MTU) {
1049 u32 features = dst_metric(dst, RTAX_FEATURES);
1051 features |= RTAX_FEATURE_ALLFRAG;
1052 dst_metric_set(dst, RTAX_FEATURES, features);
1054 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * ip6_default_advmss - advertised MSS for this dst: path MTU minus the
 * IPv6+TCP headers, floored at the ip6_rt_min_advmss sysctl and capped
 * per the IPV6_MAXPLEN rule explained below.
 */
1058 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1060 struct net_device *dev = dst->dev;
1061 unsigned int mtu = dst_mtu(dst);
1062 struct net *net = dev_net(dev);
1064 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1066 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1067 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1070 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1071 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1072 * IPV6_MAXPLEN is also valid and means: "any MSS,
1073 * rely only on pmtu discovery"
1075 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * ip6_mtu - dst_ops->mtu: the raw MTU metric when set, otherwise the
 * device's inet6_dev mtu6 (further fallback elided).
 */
1080 static unsigned int ip6_mtu(const struct dst_entry *dst)
1082 struct inet6_dev *idev;
1083 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1091 idev = __in6_dev_get(dst->dev);
1093 mtu = idev->cnf.mtu6;
/* ICMPv6 dsts are kept on their own list, reaped by icmp6_dst_gc(). */
1099 static struct dst_entry *icmp6_dst_gc_list;
1100 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * icmp6_dst_alloc - build a throwaway host dst for sending an ICMPv6
 * packet: allocate, bind the neighbour for the destination, set hop
 * limit 255, chain it onto icmp6_dst_gc_list, kick the fib GC and run
 * the result through xfrm.  Returns ERR_PTR on failure.
 */
1102 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1103 struct neighbour *neigh,
1106 struct dst_entry *dst;
1107 struct rt6_info *rt;
1108 struct inet6_dev *idev = in6_dev_get(dev);
1109 struct net *net = dev_net(dev);
1111 if (unlikely(!idev))
1112 return ERR_PTR(-ENODEV);
1114 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1115 if (unlikely(!rt)) {
1117 dst = ERR_PTR(-ENOMEM);
1124 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1125 if (IS_ERR(neigh)) {
1128 return ERR_CAST(neigh);
1132 rt->dst.flags |= DST_HOST;
1133 rt->dst.output = ip6_output;
1134 dst_set_neighbour(&rt->dst, neigh);
1135 atomic_set(&rt->dst.__refcnt, 1);
1136 rt->rt6i_dst.addr = fl6->daddr;
1137 rt->rt6i_dst.plen = 128;
1138 rt->rt6i_idev = idev;
1139 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1141 spin_lock_bh(&icmp6_dst_lock);
1142 rt->dst.next = icmp6_dst_gc_list;
1143 icmp6_dst_gc_list = &rt->dst;
1144 spin_unlock_bh(&icmp6_dst_lock);
1146 fib6_force_start_gc(net);
1148 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/*
 * icmp6_dst_gc - walk the ICMPv6 dst list and free entries whose
 * refcount has dropped to zero (unlink/free details elided).
 */
1154 int icmp6_dst_gc(void)
1156 struct dst_entry *dst, **pprev;
1159 spin_lock_bh(&icmp6_dst_lock);
1160 pprev = &icmp6_dst_gc_list;
1162 while ((dst = *pprev) != NULL) {
1163 if (!atomic_read(&dst->__refcnt)) {
1172 spin_unlock_bh(&icmp6_dst_lock);
/*
 * icmp6_clean_all - remove from the ICMPv6 dst list every entry the
 * callback selects (used e.g. on device teardown).
 */
1177 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1180 struct dst_entry *dst, **pprev;
1182 spin_lock_bh(&icmp6_dst_lock);
1183 pprev = &icmp6_dst_gc_list;
1184 while ((dst = *pprev) != NULL) {
1185 struct rt6_info *rt = (struct rt6_info *) dst;
1186 if (func(rt, arg)) {
1193 spin_unlock_bh(&icmp6_dst_lock);
/*
 * ip6_dst_gc - dst_ops->gc callback: skip GC when the min interval has
 * not elapsed and the table is under rt_max_size; otherwise run the fib
 * GC with an "expire" pressure value that ramps up on repeated calls
 * and decays (by >>elasticity) once the entry count is back under the
 * threshold.  Returns nonzero while still over rt_max_size.
 */
1196 static int ip6_dst_gc(struct dst_ops *ops)
1198 unsigned long now = jiffies;
1199 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1200 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1201 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1202 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1203 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1204 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1207 entries = dst_entries_get_fast(ops);
1208 if (time_after(rt_last_gc + rt_min_interval, now) &&
1209 entries <= rt_max_size)
1212 net->ipv6.ip6_rt_gc_expire++;
1213 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1214 net->ipv6.ip6_rt_last_gc = now;
1215 entries = dst_entries_get_slow(ops);
1216 if (entries < ops->gc_thresh)
1217 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1219 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1220 return entries > rt_max_size;
1223 /* Clean host part of a prefix. Not necessary in radix tree,
1224 but results in cleaner routing tables.
1226 Remove it only when all the things will work!
/*
 * ip6_dst_hoplimit - hop limit for packets on this dst: the route's
 * HOPLIMIT metric when set, else the egress device's configured hop
 * limit, else the netns-wide devconf default.
 */
1229 int ip6_dst_hoplimit(struct dst_entry *dst)
1231 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1232 if (hoplimit == 0) {
1233 struct net_device *dev = dst->dev;
1234 struct inet6_dev *idev;
1237 idev = __in6_dev_get(dev);
1239 hoplimit = idev->cnf.hop_limit;
1241 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1246 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * ip6_route_add - create a route from a netlink/ioctl fib6_config.
 *
 * Validates prefix lengths, resolves the device/idev, picks or creates
 * the fib table, allocates the rt6_info and fills it in (expiry,
 * protocol, input/output handlers by address type, dst/src prefixes,
 * metric), handles the loopback-promotion and gateway-resolution rules
 * commented inline, applies explicit metrics, binds a neighbour for
 * gateway/nonexthop routes and inserts via __ip6_ins_rt().
 * NOTE(review): many error-unwind lines are elided in this dump;
 * comments below describe only the visible logic.
 */
1252 int ip6_route_add(struct fib6_config *cfg)
1255 struct net *net = cfg->fc_nlinfo.nl_net;
1256 struct rt6_info *rt = NULL;
1257 struct net_device *dev = NULL;
1258 struct inet6_dev *idev = NULL;
1259 struct fib6_table *table;
1262 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
/* Source-routing prefixes need CONFIG_IPV6_SUBTREES. */
1264 #ifndef CONFIG_IPV6_SUBTREES
1265 if (cfg->fc_src_len)
1268 if (cfg->fc_ifindex) {
1270 dev = dev_get_by_index(net, cfg->fc_ifindex);
1273 idev = in6_dev_get(dev);
1278 if (cfg->fc_metric == 0)
1279 cfg->fc_metric = IP6_RT_PRIO_USER;
/* Without NLM_F_CREATE, only add into an already-existing table. */
1282 if (cfg->fc_nlinfo.nlh &&
1283 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1284 table = fib6_get_table(net, cfg->fc_table);
1286 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1287 table = fib6_new_table(net, cfg->fc_table);
1290 table = fib6_new_table(net, cfg->fc_table);
1296 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1303 rt->dst.obsolete = -1;
1305 if (cfg->fc_flags & RTF_EXPIRES)
1306 rt6_set_expires(rt, jiffies +
1307 clock_t_to_jiffies(cfg->fc_expires));
1309 rt6_clean_expires(rt);
1311 if (cfg->fc_protocol == RTPROT_UNSPEC)
1312 cfg->fc_protocol = RTPROT_BOOT;
1313 rt->rt6i_protocol = cfg->fc_protocol;
1315 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Pick the input handler by destination type: mcast / local / forward. */
1317 if (addr_type & IPV6_ADDR_MULTICAST)
1318 rt->dst.input = ip6_mc_input;
1319 else if (cfg->fc_flags & RTF_LOCAL)
1320 rt->dst.input = ip6_input;
1322 rt->dst.input = ip6_forward;
1324 rt->dst.output = ip6_output;
1326 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1327 rt->rt6i_dst.plen = cfg->fc_dst_len;
1328 if (rt->rt6i_dst.plen == 128)
1329 rt->dst.flags |= DST_HOST;
/* Non-host routes with explicit metrics need their own metrics array. */
1331 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1332 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1337 dst_init_metrics(&rt->dst, metrics, 0);
1339 #ifdef CONFIG_IPV6_SUBTREES
1340 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1341 rt->rt6i_src.plen = cfg->fc_src_len;
1344 rt->rt6i_metric = cfg->fc_metric;
1346 /* We cannot add true routes via loopback here,
1347 they would result in kernel looping; promote them to reject routes
1349 if ((cfg->fc_flags & RTF_REJECT) ||
1350 (dev && (dev->flags & IFF_LOOPBACK) &&
1351 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1352 !(cfg->fc_flags & RTF_LOCAL))) {
1353 /* hold loopback dev/idev if we haven't done so. */
1354 if (dev != net->loopback_dev) {
1359 dev = net->loopback_dev;
1361 idev = in6_dev_get(dev);
1367 rt->dst.output = ip6_pkt_discard_out;
1368 rt->dst.input = ip6_pkt_discard;
1369 rt->dst.error = -ENETUNREACH;
1370 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1374 if (cfg->fc_flags & RTF_GATEWAY) {
1375 const struct in6_addr *gw_addr;
1378 gw_addr = &cfg->fc_gateway;
1379 rt->rt6i_gateway = *gw_addr;
1380 gwa_type = ipv6_addr_type(gw_addr);
1382 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1383 struct rt6_info *grt;
1385 /* IPv6 strictly inhibits using not link-local
1386 addresses as nexthop address.
1387 Otherwise, router will not able to send redirects.
1388 It is very good, but in some (rare!) circumstances
1389 (SIT, PtP, NBMA NOARP links) it is handy to allow
1390 some exceptions. --ANK
1393 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* Resolve the non-link-local gateway via a recursive lookup. */
1396 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1398 err = -EHOSTUNREACH;
1402 if (dev != grt->dst.dev) {
1403 dst_release(&grt->dst);
1408 idev = grt->rt6i_idev;
1410 in6_dev_hold(grt->rt6i_idev);
/* The resolved route must itself be on-link (not via a gateway). */
1412 if (!(grt->rt6i_flags & RTF_GATEWAY))
1414 dst_release(&grt->dst);
1420 if (!dev || (dev->flags & IFF_LOOPBACK))
/* Preferred source address must actually live on the device. */
1428 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1429 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1433 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1434 rt->rt6i_prefsrc.plen = 128;
1436 rt->rt6i_prefsrc.plen = 0;
1438 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1439 err = rt6_bind_neighbour(rt, dev);
1444 rt->rt6i_flags = cfg->fc_flags;
/* Apply any RTA_METRICS attributes supplied by userspace. */
1451 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1452 int type = nla_type(nla);
1455 if (type > RTAX_MAX) {
1460 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1466 rt->rt6i_idev = idev;
1467 rt->rt6i_table = table;
1469 cfg->fc_nlinfo.nl_net = dev_net(dev);
1471 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * __ip6_del_rt - delete a route from its table under the write lock;
 * refuses to delete the netns null entry.  Consumes the caller's
 * reference via dst_release().
 */
1483 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1486 struct fib6_table *table;
1487 struct net *net = dev_net(rt->dst.dev);
1489 if (rt == net->ipv6.ip6_null_entry)
1492 table = rt->rt6i_table;
1493 write_lock_bh(&table->tb6_lock);
1495 err = fib6_del(rt, info);
1496 dst_release(&rt->dst);
1498 write_unlock_bh(&table->tb6_lock);
/* Public wrapper with a default nl_info for the route's netns. */
1503 int ip6_del_rt(struct rt6_info *rt)
1505 struct nl_info info = {
1506 .nl_net = dev_net(rt->dst.dev),
1508 return __ip6_del_rt(rt, &info);
/*
 * ip6_route_del - delete the route described by a fib6_config: locate
 * the node for the dst/src prefixes, then scan its leaf chain for an
 * entry matching the requested ifindex, gateway and metric.
 */
1511 static int ip6_route_del(struct fib6_config *cfg)
1513 struct fib6_table *table;
1514 struct fib6_node *fn;
1515 struct rt6_info *rt;
1518 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1522 read_lock_bh(&table->tb6_lock);
1524 fn = fib6_locate(&table->tb6_root,
1525 &cfg->fc_dst, cfg->fc_dst_len,
1526 &cfg->fc_src, cfg->fc_src_len);
1529 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1530 if (cfg->fc_ifindex &&
1532 rt->dst.dev->ifindex != cfg->fc_ifindex))
1534 if (cfg->fc_flags & RTF_GATEWAY &&
1535 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1537 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Found it: drop the read lock, take a ref (elided) and delete. */
1540 read_unlock_bh(&table->tb6_lock);
1542 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1545 read_unlock_bh(&table->tb6_lock);
1553 struct ip6rd_flowi {
1555 struct in6_addr gateway;
/*
 * Table-lookup callback used when processing an ICMPv6 redirect.  Finds
 * the current route toward fl6->daddr and checks that the redirect came
 * from the next hop actually in use (RFC 4861 / 2461 requirement);
 * falls back to the null entry when no acceptable route exists.
 * NOTE(review): truncated -- the restart label, dst_hold() and the
 * 'continue'/'break' statements in the filter loop are not visible.
 */
1558 static struct rt6_info *__ip6_route_redirect(struct net *net,
1559 struct fib6_table *table,
/* The flowi6 is really the embedding ip6rd_flowi carrying the gateway. */
1563 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1564 struct rt6_info *rt;
1565 struct fib6_node *fn;
1568 * Get the "current" route for this destination and
1569 * check if the redirect has come from approriate router.
1571 * RFC 2461 specifies that redirects should only be
1572 * accepted if they come from the nexthop to the target.
1573 * Due to the way the routes are chosen, this notion
1574 * is a bit fuzzy and one might need to check all possible
1578 read_lock_bh(&table->tb6_lock);
1579 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1581 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1583 * Current route is on-link; redirect is always invalid.
1585 * Seems, previous statement is not true. It could
1586 * be node, which looks for us as on-link (f.e. proxy ndisc)
1587 * But then router serving it might decide, that we should
1588 * know truth 8)8) --ANK (980726).
1590 if (rt6_check_expired(rt))
1592 if (!(rt->rt6i_flags & RTF_GATEWAY))
1594 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
/* The redirect must originate from the gateway we currently use. */
1596 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1602 rt = net->ipv6.ip6_null_entry;
/* BACKTRACK walks up the subtree when the lookup hit a dead end. */
1603 BACKTRACK(net, &fl6->saddr);
1607 read_unlock_bh(&table->tb6_lock);
/*
 * Build an ip6rd_flowi for a received redirect (dest/src/gateway plus
 * the arrival device) and resolve it via the policy-routing dispatcher,
 * which calls __ip6_route_redirect() on the selected table.
 */
1612 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1613 const struct in6_addr *src,
1614 const struct in6_addr *gateway,
1615 struct net_device *dev)
1617 int flags = RT6_LOOKUP_F_HAS_SADDR;
1618 struct net *net = dev_net(dev);
1619 struct ip6rd_flowi rdfl = {
1621 .flowi6_oif = dev->ifindex,
1627 rdfl.gateway = *gateway;
/* Link-local / multicast destinations require an exact-interface match. */
1629 if (rt6_need_strict(dest))
1630 flags |= RT6_LOOKUP_F_IFACE;
1632 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1633 flags, __ip6_route_redirect);
/*
 * Process an accepted ICMPv6 redirect: validate the sender is our
 * current next hop, update the neighbour cache, clone the route with
 * the new gateway (RTF_DYNAMIC|RTF_CACHE), insert it, fire a netevent,
 * and delete the old cached entry if there was one.
 * NOTE(review): truncated -- goto labels ('out'), the !nrt allocation
 * failure check, and ip6_del_rt() on the old RTF_CACHE route are not
 * visible in this chunk.
 */
1636 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1637 const struct in6_addr *saddr,
1638 struct neighbour *neigh, u8 *lladdr, int on_link)
1640 struct rt6_info *rt, *nrt = NULL;
1641 struct netevent_redirect netevent;
1642 struct net *net = dev_net(neigh->dev);
1644 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
/* Null entry means the redirect sender is not our next hop: reject. */
1646 if (rt == net->ipv6.ip6_null_entry) {
1647 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n")
1652 * We have finally decided to accept it.
/* Record the target's link-layer address; routers keep ISROUTER set. */
1655 neigh_update(neigh, lladdr, NUD_STALE,
1656 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1657 NEIGH_UPDATE_F_OVERRIDE|
1658 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1659 NEIGH_UPDATE_F_ISROUTER))
1663 * Redirect received -> path was valid.
1664 * Look, redirects are sent only in response to data packets,
1665 * so that this nexthop apparently is reachable. --ANK
1667 dst_confirm(&rt->dst);
1669 /* Duplicate redirect: silently ignore. */
1670 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1673 nrt = ip6_rt_copy(rt, dest);
1677 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
/* on_link redirect: target is directly reachable, drop GATEWAY. */
1679 nrt->rt6i_flags &= ~RTF_GATEWAY;
1681 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1682 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1684 if (ip6_ins_rt(nrt))
/* Tell interested subsystems (e.g. xfrm) the path changed. */
1687 netevent.old = &rt->dst;
1688 netevent.new = &nrt->dst;
1689 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1691 if (rt->rt6i_flags & RTF_CACHE) {
1697 dst_release(&rt->dst);
1701 * Handle ICMP "packet too big" messages
1702 * i.e. Path MTU discovery
/*
 * Handle one ICMPv6 Packet Too Big notification (RFC 1981 path MTU
 * discovery) against the route toward @daddr on @ifindex.  Cached host
 * routes are updated in place; otherwise a host-route clone is created
 * carrying the reduced MTU so the original (possibly static) route is
 * untouched and the PMTU can expire back automatically.
 * NOTE(review): truncated -- 'goto out' error paths, the 'else' between
 * the CACHE-update and clone branches, and the ip6_ins_rt() call for
 * the clone are not visible here.
 */
1705 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1706 struct net *net, u32 pmtu, int ifindex)
1708 struct rt6_info *rt, *nrt;
1711 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1715 if (rt6_check_expired(rt)) {
/* Reported MTU not smaller than what we use: nothing to shrink. */
1720 if (pmtu >= dst_mtu(&rt->dst))
1723 if (pmtu < IPV6_MIN_MTU) {
1725 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1726 * MTU (1280) and a fragment header should always be included
1727 * after a node receiving Too Big message reporting PMTU is
1728 * less than the IPv6 Minimum Link MTU.
1730 pmtu = IPV6_MIN_MTU;
1734 /* New mtu received -> path was valid.
1735 They are sent only in response to data packets,
1736 so that this nexthop apparently is reachable. --ANK
1738 dst_confirm(&rt->dst);
1740 /* Host route. If it is static, it would be better
1741 not to override it, but add new one, so that
1742 when cache entry will expire old pmtu
1743 would return automatically.
1745 if (rt->rt6i_flags & RTF_CACHE) {
1746 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
/* ALLFRAG: below-minimum PMTU means always add a fragment header. */
1748 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1749 features |= RTAX_FEATURE_ALLFRAG;
1750 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1752 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1753 rt->rt6i_flags |= RTF_MODIFIED;
1758 Two cases are possible:
1759 1. It is connected route. Action: COW
1760 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1762 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1763 nrt = rt6_alloc_cow(rt, daddr, saddr);
1765 nrt = rt6_alloc_clone(rt, daddr);
1768 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1770 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1771 features |= RTAX_FEATURE_ALLFRAG;
1772 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1775 /* According to RFC 1981, detecting PMTU increase shouldn't be
1776 * happened within 5 mins, the recommended timer is 10 mins.
1777 * Here this route expiration time is set to ip6_rt_mtu_expires
1778 * which is 10 mins. After 10 mins the decreased pmtu is expired
1779 * and detecting PMTU increase will be automatically happened.
1781 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1782 nrt->rt6i_flags |= RTF_DYNAMIC;
1786 dst_release(&rt->dst);
/*
 * Public PMTU entry point: apply the Too Big update both without an
 * interface constraint (ifindex 0) and for the receiving device, per
 * the rationale in the comment below.
 */
1789 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1790 struct net_device *dev, u32 pmtu)
1792 struct net *net = dev_net(dev);
1795 * RFC 1981 states that a node "MUST reduce the size of the packets it
1796 * is sending along the path" that caused the Packet Too Big message.
1797 * Since it's not possible in the general case to determine which
1798 * interface was used to send the original packet, we update the MTU
1799 * on the interface that will be used to send future packets. We also
1800 * update the MTU on the interface that received the Packet Too Big in
1801 * case the original packet was forced out that interface with
1802 * SO_BINDTODEVICE or similar. This is the next best thing to the
1803 * correct behaviour, which would be to update the MTU on all
1806 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1807 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1811 * Misc support functions
/*
 * Allocate a new rt6_info as a /128 host-route copy of @ort toward
 * @dest: duplicates handlers, metrics, device/idev, gateway and flags.
 * RA-learned default routes keep a 'from' back-reference so their
 * expiry tracks the parent; the copy's own expiry is cleared.
 * NOTE(review): truncated -- the NULL-check after allocation and the
 * final 'return rt' are not visible in this chunk.
 */
1814 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1815 const struct in6_addr *dest)
1817 struct net *net = dev_net(ort->dst.dev);
1818 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1822 rt->dst.input = ort->dst.input;
1823 rt->dst.output = ort->dst.output;
1824 rt->dst.flags |= DST_HOST;
/* Copies are always host routes for the specific destination. */
1826 rt->rt6i_dst.addr = *dest;
1827 rt->rt6i_dst.plen = 128;
1828 dst_copy_metrics(&rt->dst, &ort->dst);
1829 rt->dst.error = ort->dst.error;
1830 rt->rt6i_idev = ort->rt6i_idev;
1832 in6_dev_hold(rt->rt6i_idev);
1833 rt->dst.lastuse = jiffies;
1835 rt->rt6i_gateway = ort->rt6i_gateway;
1836 rt->rt6i_flags = ort->rt6i_flags;
/* Addrconf default routes: inherit lifetime from the parent route. */
1837 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1838 (RTF_DEFAULT | RTF_ADDRCONF))
1839 rt6_set_from(rt, ort);
1841 rt6_clean_expires(rt);
1842 rt->rt6i_metric = 0;
1844 #ifdef CONFIG_IPV6_SUBTREES
1845 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1847 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1848 rt->rt6i_table = ort->rt6i_table;
1853 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Look up an RA Route Information (RFC 4191) route in RT6_TABLE_INFO
 * matching prefix, interface and gateway.  Returns the route or NULL
 * (dst_hold and gotos are in lines missing from this view).
 */
1854 static struct rt6_info *rt6_get_route_info(struct net *net,
1855 const struct in6_addr *prefix, int prefixlen,
1856 const struct in6_addr *gwaddr, int ifindex)
1858 struct fib6_node *fn;
1859 struct rt6_info *rt = NULL;
1860 struct fib6_table *table;
1862 table = fib6_get_table(net, RT6_TABLE_INFO);
1866 write_lock_bh(&table->tb6_lock);
1867 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
/* Only routes learned via RA route-info options are candidates. */
1871 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1872 if (rt->dst.dev->ifindex != ifindex)
1874 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1876 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1882 write_unlock_bh(&table->tb6_lock);
/*
 * Install a route learned from an RA Route Information option into
 * RT6_TABLE_INFO, then re-look it up to return the inserted entry
 * (ip6_route_add() does not hand the new route back).
 */
1886 static struct rt6_info *rt6_add_route_info(struct net *net,
1887 const struct in6_addr *prefix, int prefixlen,
1888 const struct in6_addr *gwaddr, int ifindex,
1891 struct fib6_config cfg = {
1892 .fc_table = RT6_TABLE_INFO,
1893 .fc_metric = IP6_RT_PRIO_USER,
1894 .fc_ifindex = ifindex,
1895 .fc_dst_len = prefixlen,
1896 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1897 RTF_UP | RTF_PREF(pref),
1899 .fc_nlinfo.nlh = NULL,
1900 .fc_nlinfo.nl_net = net,
1903 cfg.fc_dst = *prefix;
1904 cfg.fc_gateway = *gwaddr;
1906 /* We should treat it as a default route if prefix length is 0. */
1908 cfg.fc_flags |= RTF_DEFAULT;
1910 ip6_route_add(&cfg);
1912 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * Find the RA-learned default route via gateway @addr on @dev in the
 * default-router table.  (The break on match and dst_hold are in lines
 * missing from this view.)
 */
1916 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1918 struct rt6_info *rt;
1919 struct fib6_table *table;
1921 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1925 write_lock_bh(&table->tb6_lock);
1926 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
/* Match device, addrconf-default origin, and the gateway address. */
1927 if (dev == rt->dst.dev &&
1928 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1929 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1934 write_unlock_bh(&table->tb6_lock);
/*
 * Install an RA-announced default router (::/0 via @gwaddr on @dev)
 * with the given preference, then re-look it up to return the entry.
 * RTF_EXPIRES: default routers age out per the RA router lifetime.
 */
1938 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1939 struct net_device *dev,
1942 struct fib6_config cfg = {
1943 .fc_table = RT6_TABLE_DFLT,
1944 .fc_metric = IP6_RT_PRIO_USER,
1945 .fc_ifindex = dev->ifindex,
1946 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1947 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1949 .fc_nlinfo.nlh = NULL,
1950 .fc_nlinfo.nl_net = dev_net(dev),
1953 cfg.fc_gateway = *gwaddr;
1955 ip6_route_add(&cfg);
1957 return rt6_get_dflt_router(gwaddr, dev);
/*
 * Remove every RA-learned default route from the default table.  The
 * read lock is dropped before each deletion and the scan restarts
 * (restart label / goto are in lines missing from this view) because
 * ip6_del_rt() takes the write lock and mutates the list.
 */
1960 void rt6_purge_dflt_routers(struct net *net)
1962 struct rt6_info *rt;
1963 struct fib6_table *table;
1965 /* NOTE: Keep consistent with rt6_get_dflt_router */
1966 table = fib6_get_table(net, RT6_TABLE_DFLT);
1971 read_lock_bh(&table->tb6_lock);
1972 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1973 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1975 read_unlock_bh(&table->tb6_lock);
1980 read_unlock_bh(&table->tb6_lock);
/*
 * Translate a legacy ioctl in6_rtmsg into the internal fib6_config
 * representation.  Legacy routes always go to the main table.
 */
1983 static void rtmsg_to_fib6_config(struct net *net,
1984 struct in6_rtmsg *rtmsg,
1985 struct fib6_config *cfg)
1987 memset(cfg, 0, sizeof(*cfg));
1989 cfg->fc_table = RT6_TABLE_MAIN;
1990 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1991 cfg->fc_metric = rtmsg->rtmsg_metric;
1992 cfg->fc_expires = rtmsg->rtmsg_info;
1993 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1994 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1995 cfg->fc_flags = rtmsg->rtmsg_flags;
1997 cfg->fc_nlinfo.nl_net = net;
1999 cfg->fc_dst = rtmsg->rtmsg_dst;
2000 cfg->fc_src = rtmsg->rtmsg_src;
2001 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * Legacy SIOCADDRT/SIOCDELRT ioctl handler: copy the in6_rtmsg from
 * userspace (CAP_NET_ADMIN required), convert it, and add or delete
 * under the RTNL (rtnl_lock/unlock are in lines missing from this
 * view, as is the default -EINVAL return).
 */
2004 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2006 struct fib6_config cfg;
2007 struct in6_rtmsg rtmsg;
2011 case SIOCADDRT: /* Add a route */
2012 case SIOCDELRT: /* Delete a route */
2013 if (!capable(CAP_NET_ADMIN))
2015 err = copy_from_user(&rtmsg, arg,
2016 sizeof(struct in6_rtmsg));
2020 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2025 err = ip6_route_add(&cfg);
2028 err = ip6_route_del(&cfg);
2042 * Drop the packet on the floor
/*
 * Drop a packet that matched a reject-type route: bump the proper SNMP
 * counter (input vs. output no-route; unspecified destinations count as
 * address errors) and send an ICMPv6 Destination Unreachable with the
 * given @code back to the sender.
 */
2045 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2048 struct dst_entry *dst = skb_dst(skb);
2049 switch (ipstats_mib_noroutes) {
2050 case IPSTATS_MIB_INNOROUTES:
2051 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
/* Packets to :: are malformed, not merely unroutable. */
2052 if (type == IPV6_ADDR_ANY) {
2053 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2054 IPSTATS_MIB_INADDRERRORS);
2058 case IPSTATS_MIB_OUTNOROUTES:
2059 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2060 ipstats_mib_noroutes);
2063 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst input handler for the null entry: drop with "no route" (input). */
2068 static int ip6_pkt_discard(struct sk_buff *skb)
2070 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst output handler for the null entry: drop with "no route" (output). */
2073 static int ip6_pkt_discard_out(struct sk_buff *skb)
/* ICMP error generation needs skb->dev set to the egress device. */
2075 skb->dev = skb_dst(skb)->dev;
2076 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2079 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Input handler for prohibit routes: administratively prohibited. */
2081 static int ip6_pkt_prohibit(struct sk_buff *skb)
2083 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* Output handler for prohibit routes: administratively prohibited. */
2086 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2088 skb->dev = skb_dst(skb)->dev;
2089 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2095 * Allocate a dst for local (unicast / anycast) address.
/*
 * Allocate the local-delivery route for a unicast or anycast address
 * owned by this host: a /128 on the loopback device that feeds packets
 * into ip6_input.  Returns the route (refcount 1) or an ERR_PTR.
 * NOTE(review): truncated -- the anycast/unicast conditional around the
 * two flag assignments and the dst_free on bind failure are not
 * visible in this chunk.
 */
2098 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2099 const struct in6_addr *addr,
2102 struct net *net = dev_net(idev->dev);
2103 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2104 net->loopback_dev, 0);
2108 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n")
2109 return ERR_PTR(-ENOMEM);
2114 rt->dst.flags |= DST_HOST;
/* Local addresses are delivered to the stack, not forwarded. */
2115 rt->dst.input = ip6_input;
2116 rt->dst.output = ip6_output;
2117 rt->rt6i_idev = idev;
2118 rt->dst.obsolete = -1;
2120 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2122 rt->rt6i_flags |= RTF_ANYCAST;
2124 rt->rt6i_flags |= RTF_LOCAL;
2125 err = rt6_bind_neighbour(rt, rt->dst.dev);
2128 return ERR_PTR(err);
2131 rt->rt6i_dst.addr = *addr;
2132 rt->rt6i_dst.plen = 128;
2133 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2135 atomic_set(&rt->dst.__refcnt, 1);
/*
 * Choose a source address for traffic using @rt toward @daddr: prefer
 * the route's configured preferred source (rt6i_prefsrc) when set,
 * otherwise run standard source-address selection on the route's
 * device.
 */
2140 int ip6_route_get_saddr(struct net *net,
2141 struct rt6_info *rt,
2142 const struct in6_addr *daddr,
2144 struct in6_addr *saddr)
2146 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
/* plen != 0 means an explicit preferred source was configured. */
2148 if (rt->rt6i_prefsrc.plen)
2149 *saddr = rt->rt6i_prefsrc.addr;
2151 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2152 daddr, prefs, saddr);
2156 /* remove deleted ip from prefsrc entries */
2157 struct arg_dev_net_ip {
2158 struct net_device *dev;
2160 struct in6_addr *addr;
2163 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2165 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2166 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2167 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2169 if (((void *)rt->dst.dev == dev || !dev) &&
2170 rt != net->ipv6.ip6_null_entry &&
2171 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2172 /* remove prefsrc entry */
2173 rt->rt6i_prefsrc.plen = 0;
/*
 * An interface address is going away: scrub it from every route's
 * preferred-source slot via a full FIB walk.
 */
2178 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2180 struct net *net = dev_net(ifp->idev->dev);
2181 struct arg_dev_net_ip adni = {
2182 .dev = ifp->idev->dev,
2186 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Argument bundle for the fib6_ifdown walk (the net member is in a
 * line missing from this view). */
2189 struct arg_dev_net {
2190 struct net_device *dev;
/*
 * fib6_clean_all() callback for device teardown: select every route on
 * @dev (or on any device when @dev is NULL) except the null entry for
 * deletion (the non-zero return is in a line missing from this view).
 */
2194 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2196 const struct arg_dev_net *adn = arg;
2197 const struct net_device *dev = adn->dev;
2199 if ((rt->dst.dev == dev || !dev) &&
2200 rt != adn->net->ipv6.ip6_null_entry)
/*
 * Device is going down: purge its routes from the FIB and from the
 * ICMPv6 socket route cache.
 */
2206 void rt6_ifdown(struct net *net, struct net_device *dev)
2208 struct arg_dev_net adn = {
2213 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2214 icmp6_clean_all(fib6_ifdown, &adn);
/* Argument bundle for rt6_mtu_change_route (mtu member is in a line
 * missing from this view). */
2217 struct rt6_mtu_change_arg {
2218 struct net_device *dev;
/*
 * fib6_clean_all() callback applied when a device's MTU is changed
 * administratively: update each affected route's cached PMTU per the
 * rules in the comments below.  Returns 0 (routes are kept).
 */
2222 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2224 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2225 struct inet6_dev *idev;
2227 /* In IPv6 pmtu discovery is not optional,
2228 so that RTAX_MTU lock cannot disable it.
2229 We still use this lock to block changes
2230 caused by addrconf/ndisc.
2233 idev = __in6_dev_get(arg->dev);
2237 /* For administrative MTU increase, there is no way to discover
2238 IPv6 PMTU increase, so PMTU increase should be updated here.
2239 Since RFC 1981 doesn't include administrative MTU increase
2240 update PMTU increase is a MUST. (i.e. jumbo frame)
2243 If new MTU is less than route PMTU, this new MTU will be the
2244 lowest MTU in the path, update the route PMTU to reflect PMTU
2245 decreases; if new MTU is greater than route PMTU, and the
2246 old MTU is the lowest MTU in the path, update the route PMTU
2247 to reflect the increase. In this case if the other nodes' MTU
2248 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2251 if (rt->dst.dev == arg->dev &&
2252 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2253 (dst_mtu(&rt->dst) >= arg->mtu ||
2254 (dst_mtu(&rt->dst) < arg->mtu &&
2255 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2256 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * Entry point for administrative MTU changes: walk the whole FIB and
 * fix up cached PMTUs for routes over @dev.
 */
2261 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2263 struct rt6_mtu_change_arg arg = {
2268 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/*
 * Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE:
 * fixed 16-byte addresses, u32 scalars, nested metrics.
 */
2271 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2272 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2273 [RTA_OIF] = { .type = NLA_U32 },
2274 [RTA_IIF] = { .type = NLA_U32 },
2275 [RTA_PRIORITY] = { .type = NLA_U32 },
2276 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * Parse an rtnetlink route message (header + attributes) into a
 * fib6_config.  Prefix attributes (RTA_DST/RTA_SRC) are copied at the
 * byte length implied by the prefix length, after validating the
 * attribute is at least that long.
 * NOTE(review): truncated -- goto errout labels, the tb[RTA_DST]/
 * tb[RTA_SRC]/tb[RTA_OIF]/tb[RTA_TABLE] presence checks, and the final
 * return are not visible in this chunk.
 */
2279 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2280 struct fib6_config *cfg)
2283 struct nlattr *tb[RTA_MAX+1];
2286 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2291 rtm = nlmsg_data(nlh);
2292 memset(cfg, 0, sizeof(*cfg));
2294 cfg->fc_table = rtm->rtm_table;
2295 cfg->fc_dst_len = rtm->rtm_dst_len;
2296 cfg->fc_src_len = rtm->rtm_src_len;
2297 cfg->fc_flags = RTF_UP;
2298 cfg->fc_protocol = rtm->rtm_protocol;
2300 if (rtm->rtm_type == RTN_UNREACHABLE)
2301 cfg->fc_flags |= RTF_REJECT;
2303 if (rtm->rtm_type == RTN_LOCAL)
2304 cfg->fc_flags |= RTF_LOCAL;
2306 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2307 cfg->fc_nlinfo.nlh = nlh;
2308 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2310 if (tb[RTA_GATEWAY]) {
2311 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2312 cfg->fc_flags |= RTF_GATEWAY;
/* Only rtm_dst_len bits are meaningful; copy the covering bytes. */
2316 int plen = (rtm->rtm_dst_len + 7) >> 3;
2318 if (nla_len(tb[RTA_DST]) < plen)
2321 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2325 int plen = (rtm->rtm_src_len + 7) >> 3;
2327 if (nla_len(tb[RTA_SRC]) < plen)
2330 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2333 if (tb[RTA_PREFSRC])
2334 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2337 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2339 if (tb[RTA_PRIORITY])
2340 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
/* Metrics stay as a pointer into the message; consumed before free. */
2342 if (tb[RTA_METRICS]) {
2343 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2344 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2348 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse the message and delete the route. */
2355 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2357 struct fib6_config cfg;
2360 err = rtm_to_fib6_config(skb, nlh, &cfg);
2364 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the message and add the route. */
2367 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2369 struct fib6_config cfg;
2372 err = rtm_to_fib6_config(skb, nlh, &cfg);
2376 return ip6_route_add(&cfg);
/*
 * Worst-case size of a single RTM_NEWROUTE notification, used to size
 * the skb in inet6_rt_notify().  Must be kept in sync with everything
 * rt6_fill_node() can emit.
 */
2379 static inline size_t rt6_nlmsg_size(void)
2381 return NLMSG_ALIGN(sizeof(struct rtmsg))
2382 + nla_total_size(16) /* RTA_SRC */
2383 + nla_total_size(16) /* RTA_DST */
2384 + nla_total_size(16) /* RTA_GATEWAY */
2385 + nla_total_size(16) /* RTA_PREFSRC */
2386 + nla_total_size(4) /* RTA_TABLE */
2387 + nla_total_size(4) /* RTA_IIF */
2388 + nla_total_size(4) /* RTA_OIF */
2389 + nla_total_size(4) /* RTA_PRIORITY */
2390 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2391 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * Serialize one rt6_info into a netlink RTM message: rtmsg header plus
 * table, dst/src prefixes, multicast resolution (optional), iif/oif,
 * preferred source, metrics, gateway, priority and cacheinfo.  @prefix
 * restricts dumps to prefix routes; @nowait is passed to the multicast
 * resolver.  Returns the nlmsg_end() length or -EMSGSIZE.
 * NOTE(review): truncated -- several declarations (rtm, table, expires,
 * ts/tsage), if-conditions around optional attributes, and the
 * multicast err<0/err==0 branches are not visible in this chunk.
 */
2394 static int rt6_fill_node(struct net *net,
2395 struct sk_buff *skb, struct rt6_info *rt,
2396 struct in6_addr *dst, struct in6_addr *src,
2397 int iif, int type, u32 pid, u32 seq,
2398 int prefix, int nowait, unsigned int flags)
2400 const struct inet_peer *peer;
2402 struct nlmsghdr *nlh;
2405 struct neighbour *n;
2408 if (prefix) { /* user wants prefix routes only */
2409 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2410 /* success since this is not a prefix route */
2415 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2419 rtm = nlmsg_data(nlh);
2420 rtm->rtm_family = AF_INET6;
2421 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2422 rtm->rtm_src_len = rt->rt6i_src.plen;
2425 table = rt->rt6i_table->tb6_id;
2427 table = RT6_TABLE_UNSPEC;
2428 rtm->rtm_table = table;
2429 if (nla_put_u32(skb, RTA_TABLE, table))
2430 goto nla_put_failure;
/* Map route flags to the closest RTN_* route type for userspace. */
2431 if (rt->rt6i_flags & RTF_REJECT)
2432 rtm->rtm_type = RTN_UNREACHABLE;
2433 else if (rt->rt6i_flags & RTF_LOCAL)
2434 rtm->rtm_type = RTN_LOCAL;
2435 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2436 rtm->rtm_type = RTN_LOCAL;
2438 rtm->rtm_type = RTN_UNICAST;
2440 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2441 rtm->rtm_protocol = rt->rt6i_protocol;
2442 if (rt->rt6i_flags & RTF_DYNAMIC)
2443 rtm->rtm_protocol = RTPROT_REDIRECT;
2444 else if (rt->rt6i_flags & RTF_ADDRCONF)
2445 rtm->rtm_protocol = RTPROT_KERNEL;
2446 else if (rt->rt6i_flags & RTF_DEFAULT)
2447 rtm->rtm_protocol = RTPROT_RA;
2449 if (rt->rt6i_flags & RTF_CACHE)
2450 rtm->rtm_flags |= RTM_F_CLONED;
/* Caller-supplied dst (getroute path) is reported as a /128. */
2453 if (nla_put(skb, RTA_DST, 16, dst))
2454 goto nla_put_failure;
2455 rtm->rtm_dst_len = 128;
2456 } else if (rtm->rtm_dst_len)
2457 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2458 goto nla_put_failure;
2459 #ifdef CONFIG_IPV6_SUBTREES
2461 if (nla_put(skb, RTA_SRC, 16, src))
2462 goto nla_put_failure;
2463 rtm->rtm_src_len = 128;
2464 } else if (rtm->rtm_src_len &&
2465 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2466 goto nla_put_failure;
2468 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through the mroute engine. */
2470 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2471 int err = ip6mr_get_route(net, skb, rtm, nowait);
2476 goto nla_put_failure;
2478 if (err == -EMSGSIZE)
2479 goto nla_put_failure;
2484 if (nla_put_u32(skb, RTA_IIF, iif))
2485 goto nla_put_failure;
2487 struct in6_addr saddr_buf;
2488 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2489 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2490 goto nla_put_failure;
2493 if (rt->rt6i_prefsrc.plen) {
2494 struct in6_addr saddr_buf;
2495 saddr_buf = rt->rt6i_prefsrc.addr;
2496 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2497 goto nla_put_failure;
2500 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2501 goto nla_put_failure;
2504 n = dst_get_neighbour_noref(&rt->dst);
2506 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2508 goto nla_put_failure;
2514 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2515 goto nla_put_failure;
2516 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2517 goto nla_put_failure;
/* Clamp remaining lifetime into the s32 cacheinfo field. */
2518 if (!(rt->rt6i_flags & RTF_EXPIRES))
2520 else if (rt->dst.expires - jiffies < INT_MAX)
2521 expires = rt->dst.expires - jiffies;
2525 peer = rt->rt6i_peer;
2527 if (peer && peer->tcp_ts_stamp) {
2529 tsage = get_seconds() - peer->tcp_ts_stamp;
2532 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2533 expires, rt->dst.error) < 0)
2534 goto nla_put_failure;
2536 return nlmsg_end(skb, nlh);
/* Attribute didn't fit: roll back the whole message. */
2539 nlmsg_cancel(skb, nlh);
/*
 * Per-route callback for RTM_GETROUTE dumps: honours the RTM_F_PREFIX
 * filter from the request header and emits one RTM_NEWROUTE entry in
 * multipart mode.
 */
2543 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2545 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
/* The request may carry an rtmsg asking for prefix routes only. */
2548 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2549 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2550 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2554 return rt6_fill_node(arg->net,
2555 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2556 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2557 prefix, 0, NLM_F_MULTI);
/*
 * RTM_GETROUTE handler: build a flowi6 from the request attributes,
 * resolve it (input-style lookup when an iif is given, otherwise an
 * output lookup), serialize the result with rt6_fill_node() and
 * unicast it back to the requester.
 * NOTE(review): truncated -- goto errout labels, the iif device
 * -ENODEV check, the else joining the iif/oif branches, and kfree_skb
 * on fill failure are not visible in this chunk.
 */
2560 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2562 struct net *net = sock_net(in_skb->sk);
2563 struct nlattr *tb[RTA_MAX+1];
2564 struct rt6_info *rt;
2565 struct sk_buff *skb;
2568 int err, iif = 0, oif = 0;
2570 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2575 memset(&fl6, 0, sizeof(fl6));
2578 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2581 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2585 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2588 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2592 iif = nla_get_u32(tb[RTA_IIF]);
2595 oif = nla_get_u32(tb[RTA_OIF]);
/* iif present: emulate a packet arriving on that device. */
2598 struct net_device *dev;
2601 dev = __dev_get_by_index(net, iif);
2607 fl6.flowi6_iif = iif;
2609 if (!ipv6_addr_any(&fl6.saddr))
2610 flags |= RT6_LOOKUP_F_HAS_SADDR;
2612 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2615 fl6.flowi6_oif = oif;
2617 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2620 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2622 dst_release(&rt->dst);
2627 /* Reserve room for dummy headers, this skb can pass
2628 through good chunk of routing engine.
2630 skb_reset_mac_header(skb);
2631 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* The reply skb owns the route reference from here on. */
2633 skb_dst_set(skb, &rt->dst);
2635 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2636 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2637 nlh->nlmsg_seq, 0, 0, 0);
2643 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * Broadcast a route change (RTM_NEWROUTE/RTM_DELROUTE) to the
 * RTNLGRP_IPV6_ROUTE multicast group.  An -EMSGSIZE from filling means
 * rt6_nlmsg_size() is out of sync with rt6_fill_node() -- warn loudly.
 */
2648 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2650 struct sk_buff *skb;
2651 struct net *net = info->nl_net;
2656 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
/* gfp_any(): GFP_ATOMIC in softirq context, GFP_KERNEL otherwise. */
2658 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2662 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2663 event, info->pid, seq, 0, 0, 0);
2665 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2666 WARN_ON(err == -EMSGSIZE);
2670 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2671 info->nlh, gfp_any());
2675 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * Netdevice notifier: when the loopback device registers in a netns,
 * attach it (and its inet6_dev) to the special null / prohibit /
 * blackhole template routes that were allocated without a device.
 */
2678 static int ip6_route_dev_notify(struct notifier_block *this,
2679 unsigned long event, void *data)
2681 struct net_device *dev = (struct net_device *)data;
2682 struct net *net = dev_net(dev);
2684 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2685 net->ipv6.ip6_null_entry->dst.dev = dev;
2686 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2687 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2688 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2689 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2690 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2691 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2702 #ifdef CONFIG_PROC_FS
/*
 * Emit one /proc/net/ipv6_route line for a route: dst, src (or zeros
 * without subtrees), next hop (or zeros), metric, refcount, use count,
 * flags and device name.
 */
2713 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2715 struct seq_file *m = p_arg;
2716 struct neighbour *n;
2718 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2720 #ifdef CONFIG_IPV6_SUBTREES
2721 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2723 seq_puts(m, "00000000000000000000000000000000 00 ");
2726 n = dst_get_neighbour_noref(&rt->dst);
2728 seq_printf(m, "%pi6", n->primary_key);
2730 seq_puts(m, "00000000000000000000000000000000");
2733 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2734 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2735 rt->dst.__use, rt->rt6i_flags,
2736 rt->dst.dev ? rt->dst.dev->name : "");
/*
 * /proc/net/ipv6_route: dump the whole FIB through rt6_info_route()
 * using the read-only tree walker.
 */
2740 static int ipv6_route_show(struct seq_file *m, void *v)
2742 struct net *net = (struct net *)m->private;
2743 fib6_clean_all_ro(net, rt6_info_route, 0, m);
2747 static int ipv6_route_open(struct inode *inode, struct file *file)
2749 return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route (read member truncated). */
2752 static const struct file_operations ipv6_route_proc_fops = {
2753 .owner = THIS_MODULE,
2754 .open = ipv6_route_open,
2756 .llseek = seq_lseek,
2757 .release = single_release_net,
/*
 * /proc/net/rt6_stats: one line of hex FIB statistics counters for
 * this network namespace.
 */
2760 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2762 struct net *net = (struct net *)seq->private;
2763 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2764 net->ipv6.rt6_stats->fib_nodes,
2765 net->ipv6.rt6_stats->fib_route_nodes,
2766 net->ipv6.rt6_stats->fib_rt_alloc,
2767 net->ipv6.rt6_stats->fib_rt_entries,
2768 net->ipv6.rt6_stats->fib_rt_cache,
2769 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2770 net->ipv6.rt6_stats->fib_discarded_routes);
2775 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2777 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats (read member truncated). */
2780 static const struct file_operations rt6_stats_seq_fops = {
2781 .owner = THIS_MODULE,
2782 .open = rt6_stats_seq_open,
2784 .llseek = seq_lseek,
2785 .release = single_release_net,
2787 #endif /* CONFIG_PROC_FS */
2789 #ifdef CONFIG_SYSCTL
/*
 * Handler for net.ipv6.route.flush: writing a delay value triggers an
 * immediate FIB garbage-collection pass for the owning namespace
 * (delay <= 0 means flush everything: ~0UL timeout).
 */
2792 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2793 void __user *buffer, size_t *lenp, loff_t *ppos)
2800 net = (struct net *)ctl->extra1;
2801 delay = net->ipv6.sysctl.flush_delay;
2802 proc_dointvec(ctl, write, buffer, lenp, ppos);
2803 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-namespace net.ipv6.route.* sysctl table; the
 * .data pointers reference init_net and are rewired per namespace in
 * ipv6_route_sysctl_init() (mode members are in lines missing from
 * this view).
 */
2807 ctl_table ipv6_route_table_template[] = {
2809 .procname = "flush",
2810 .data = &init_net.ipv6.sysctl.flush_delay,
2811 .maxlen = sizeof(int),
2813 .proc_handler = ipv6_sysctl_rtcache_flush
2816 .procname = "gc_thresh",
2817 .data = &ip6_dst_ops_template.gc_thresh,
2818 .maxlen = sizeof(int),
2820 .proc_handler = proc_dointvec,
2823 .procname = "max_size",
2824 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2825 .maxlen = sizeof(int),
2827 .proc_handler = proc_dointvec,
2830 .procname = "gc_min_interval",
2831 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2832 .maxlen = sizeof(int),
2834 .proc_handler = proc_dointvec_jiffies,
2837 .procname = "gc_timeout",
2838 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2839 .maxlen = sizeof(int),
2841 .proc_handler = proc_dointvec_jiffies,
2844 .procname = "gc_interval",
2845 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2846 .maxlen = sizeof(int),
2848 .proc_handler = proc_dointvec_jiffies,
2851 .procname = "gc_elasticity",
2852 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2853 .maxlen = sizeof(int),
2855 .proc_handler = proc_dointvec,
2858 .procname = "mtu_expires",
2859 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2860 .maxlen = sizeof(int),
2862 .proc_handler = proc_dointvec_jiffies,
2865 .procname = "min_adv_mss",
2866 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2867 .maxlen = sizeof(int),
2869 .proc_handler = proc_dointvec,
2872 .procname = "gc_min_interval_ms",
2873 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2874 .maxlen = sizeof(int),
2876 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * Duplicate the sysctl template for a new namespace and point each
 * entry's .data at that namespace's own fields.  The index list below
 * must stay in step with ipv6_route_table_template's entry order.
 */
2881 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2883 struct ctl_table *table;
2885 table = kmemdup(ipv6_route_table_template,
2886 sizeof(ipv6_route_table_template),
2890 table[0].data = &net->ipv6.sysctl.flush_delay;
/* extra1 carries the namespace to ipv6_sysctl_rtcache_flush(). */
2891 table[0].extra1 = net;
2892 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2893 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2894 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2895 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2896 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2897 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2898 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2899 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2900 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * Per-namespace init: clone the dst_ops template, allocate the null
 * (and, with multiple tables, prohibit and blackhole) sentinel routes
 * from their templates, seed the routing sysctl defaults, and create
 * the proc entries.  Error paths unwind in reverse order via the
 * labels at the bottom.
 * NOTE(review): truncated -- 'int ret', some kmemdup GFP arguments,
 * the 'ret = 0 / out: return ret' tail, and the out_ip6_null_entry
 * label line are not visible in this chunk.
 */
2907 static int __net_init ip6_route_net_init(struct net *net)
2911 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2912 sizeof(net->ipv6.ip6_dst_ops));
2914 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2915 goto out_ip6_dst_ops;
2917 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2918 sizeof(*net->ipv6.ip6_null_entry),
2920 if (!net->ipv6.ip6_null_entry)
2921 goto out_ip6_dst_entries;
/* Sentinel routes are their own dst.path and use this netns' ops. */
2922 net->ipv6.ip6_null_entry->dst.path =
2923 (struct dst_entry *)net->ipv6.ip6_null_entry;
2924 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2925 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2926 ip6_template_metrics, true);
2928 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2929 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2930 sizeof(*net->ipv6.ip6_prohibit_entry),
2932 if (!net->ipv6.ip6_prohibit_entry)
2933 goto out_ip6_null_entry;
2934 net->ipv6.ip6_prohibit_entry->dst.path =
2935 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2936 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2937 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2938 ip6_template_metrics, true);
2940 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2941 sizeof(*net->ipv6.ip6_blk_hole_entry),
2943 if (!net->ipv6.ip6_blk_hole_entry)
2944 goto out_ip6_prohibit_entry;
2945 net->ipv6.ip6_blk_hole_entry->dst.path =
2946 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2947 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2948 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2949 ip6_template_metrics, true);
/* Routing sysctl defaults for this namespace. */
2952 net->ipv6.sysctl.flush_delay = 0;
2953 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2954 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2955 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2956 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2957 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2958 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2959 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2961 #ifdef CONFIG_PROC_FS
2962 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2963 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2965 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2971 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2972 out_ip6_prohibit_entry:
2973 kfree(net->ipv6.ip6_prohibit_entry);
2975 kfree(net->ipv6.ip6_null_entry);
2977 out_ip6_dst_entries:
2978 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * Per-network-namespace teardown: the mirror image of
 * ip6_route_net_init().  Removes the /proc entries, frees the
 * per-netns special route copies, and destroys the dst entry
 * accounting.  (Closing braces/#endif lines are not visible in this
 * chunk of the file.)
 */
2983 static void __net_exit ip6_route_net_exit(struct net *net)
2985 #ifdef CONFIG_PROC_FS
2986 proc_net_remove(net, "ipv6_route");
2987 proc_net_remove(net, "rt6_stats");
/* Free the kmemdup()ed route templates allocated at netns init. */
2989 kfree(net->ipv6.ip6_null_entry);
2990 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2991 kfree(net->ipv6.ip6_prohibit_entry);
2992 kfree(net->ipv6.ip6_blk_hole_entry);
2994 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Hooks run for every network namespace create/destroy;
 * registered below in ip6_route_init(). */
2997 static struct pernet_operations ip6_route_net_ops = {
2998 .init = ip6_route_net_init,
2999 .exit = ip6_route_net_exit,
/* Netdevice event notifier; ip6_route_dev_notify is defined elsewhere
 * in this file and registered in ip6_route_init(). */
3002 static struct notifier_block ip6_route_dev_notifier = {
3003 .notifier_call = ip6_route_dev_notify,
/*
 * Boot-time initialisation of IPv6 routing: creates the rt6_info slab
 * cache, registers the per-netns operations, fixes up the init_net
 * special routes to reference the loopback device, initialises FIB
 * rules, registers the rtnetlink route handlers and the netdevice
 * notifier.  Unwinds everything already set up on failure.
 *
 * NOTE(review): error-check lines after several calls (and the final
 * return statements) are not visible in this chunk; the goto labels
 * show the intended unwind order.
 */
3007 int __init ip6_route_init(void)
/* Slab cache backing every rt6_info allocation. */
3012 ip6_dst_ops_template.kmem_cachep =
3013 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3014 SLAB_HWCACHE_ALIGN, NULL);
3015 if (!ip6_dst_ops_template.kmem_cachep)
3018 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3020 goto out_kmem_cache;
3022 ret = register_pernet_subsys(&ip6_route_net_ops);
3024 goto out_dst_entries;
/* Blackhole dsts share the same slab cache as regular routes. */
3026 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3028 /* Registering of the loopback is done before this portion of code,
3029 * the loopback reference in rt6_info will not be taken, do it
3030 * manually for init_net */
3031 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3032 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3033 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3034 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3035 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3036 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3037 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3041 goto out_register_subsys;
3047 ret = fib6_rules_init();
/* Register the RTM_{NEW,DEL,GET}ROUTE netlink handlers; any failure
 * tears down the FIB rules via the fib6_rules_init label below. */
3052 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3053 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3054 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3055 goto fib6_rules_init;
3057 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3059 goto fib6_rules_init;
/* Error unwind, in reverse order of the setup above. */
3065 fib6_rules_cleanup();
3070 out_register_subsys:
3071 unregister_pernet_subsys(&ip6_route_net_ops);
3073 dst_entries_destroy(&ip6_dst_blackhole_ops);
3075 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * Module teardown: undoes ip6_route_init() in reverse order —
 * notifier, FIB rules, per-netns subsystem, blackhole dst accounting,
 * and finally the rt6_info slab cache.
 */
3079 void ip6_route_cleanup(void)
3081 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3082 fib6_rules_cleanup();
3085 unregister_pernet_subsys(&ip6_route_net_ops);
3086 dst_entries_destroy(&ip6_dst_blackhole_ops);
3087 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);