ipv6: Limit mtu to 65575 bytes
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/* Debug verbosity for this file; raise to 3 to enable tracing output. */
#define RT6_DEBUG 2

/*
 * RDBG() and RT6_TRACE() only emit output when RT6_DEBUG is 3 or more;
 * below that they expand to nothing / an empty statement.
 */
#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 0,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
207
/* Input/output handlers installed on the prohibit route entry. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
210
211 static struct rt6_info ip6_prohibit_entry_template = {
212         .dst = {
213                 .__refcnt       = ATOMIC_INIT(1),
214                 .__use          = 1,
215                 .obsolete       = -1,
216                 .error          = -EACCES,
217                 .input          = ip6_pkt_prohibit,
218                 .output         = ip6_pkt_prohibit_out,
219         },
220         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
221         .rt6i_protocol  = RTPROT_KERNEL,
222         .rt6i_metric    = ~(u32) 0,
223         .rt6i_ref       = ATOMIC_INIT(1),
224 };
225
226 static struct rt6_info ip6_blk_hole_entry_template = {
227         .dst = {
228                 .__refcnt       = ATOMIC_INIT(1),
229                 .__use          = 1,
230                 .obsolete       = -1,
231                 .error          = -EINVAL,
232                 .input          = dst_discard,
233                 .output         = dst_discard,
234         },
235         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
236         .rt6i_protocol  = RTPROT_KERNEL,
237         .rt6i_metric    = ~(u32) 0,
238         .rt6i_ref       = ATOMIC_INIT(1),
239 };
240
241 #endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If @rt has a neighbour entry that is not in a NUD_VALID state and the
 * last update is older than the interface's rtr_probe_interval, send a
 * Neighbour Solicitation to the solicited-node multicast address of the
 * neighbour.  A NULL @rt or a route without a neighbour is ignored.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; the rate limit is implemented by stamping neigh->updated.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	/*
	 * neigh->updated is modified below, so the neighbour lock has to be
	 * held for writing; a read lock would let concurrent probers race
	 * on the timestamp and defeat the rate limit.
	 */
	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* Send the solicitation only after the lock is dropped. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probes are sent. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
549 #ifdef CONFIG_IPV6_ROUTE_INFO
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551                   const struct in6_addr *gwaddr)
552 {
553         struct net *net = dev_net(dev);
554         struct route_info *rinfo = (struct route_info *) opt;
555         struct in6_addr prefix_buf, *prefix;
556         unsigned int pref;
557         unsigned long lifetime;
558         struct rt6_info *rt;
559
560         if (len < sizeof(struct route_info)) {
561                 return -EINVAL;
562         }
563
564         /* Sanity check for prefix_len and length */
565         if (rinfo->length > 3) {
566                 return -EINVAL;
567         } else if (rinfo->prefix_len > 128) {
568                 return -EINVAL;
569         } else if (rinfo->prefix_len > 64) {
570                 if (rinfo->length < 2) {
571                         return -EINVAL;
572                 }
573         } else if (rinfo->prefix_len > 0) {
574                 if (rinfo->length < 1) {
575                         return -EINVAL;
576                 }
577         }
578
579         pref = rinfo->route_pref;
580         if (pref == ICMPV6_ROUTER_PREF_INVALID)
581                 return -EINVAL;
582
583         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
584
585         if (rinfo->length == 3)
586                 prefix = (struct in6_addr *)rinfo->prefix;
587         else {
588                 /* this function is safe */
589                 ipv6_addr_prefix(&prefix_buf,
590                                  (struct in6_addr *)rinfo->prefix,
591                                  rinfo->prefix_len);
592                 prefix = &prefix_buf;
593         }
594
595         if (rinfo->prefix_len == 0)
596                 rt = rt6_get_dflt_router(gwaddr, dev);
597         else
598                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
599                                         gwaddr, dev->ifindex);
600
601         if (rt && !lifetime) {
602                 ip6_del_rt(rt);
603                 rt = NULL;
604         }
605
606         if (!rt && lifetime)
607                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
608                                         pref);
609         else if (rt)
610                 rt->rt6i_flags = RTF_ROUTEINFO |
611                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
612
613         if (rt) {
614                 if (!addrconf_finite_timeout(lifetime)) {
615                         rt->rt6i_flags &= ~RTF_EXPIRES;
616                 } else {
617                         rt->rt6i_expires = jiffies + HZ * lifetime;
618                         rt->rt6i_flags |= RTF_EXPIRES;
619                 }
620                 dst_release(&rt->dst);
621         }
622         return 0;
623 }
624 #endif
625
/*
 * BACKTRACK(net, saddr) - climb the fib6 tree after a failed lookup.
 *
 * Must be expanded inside a function that provides:
 *   - locals  "struct rt6_info *rt" and "struct fib6_node *fn";
 *   - labels  "restart" (retry the selection on the new fn) and
 *             "out"     (give up; rt is still the null entry).
 *
 * When rt is the namespace's null entry, walk from fn towards the root:
 * jump to "out" once a RTN_TL_ROOT node is reached; otherwise move to the
 * parent (or look @saddr up in the parent's subtree when there is one that
 * is not the current node) and jump to "restart" at the first node that
 * carries RTN_RTINFO.
 *
 * Macro arguments are parenthesized so that arbitrary expressions can be
 * passed safely.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
643
644 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
645                                              struct fib6_table *table,
646                                              struct flowi6 *fl6, int flags)
647 {
648         struct fib6_node *fn;
649         struct rt6_info *rt;
650
651         read_lock_bh(&table->tb6_lock);
652         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
653 restart:
654         rt = fn->leaf;
655         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
656         BACKTRACK(net, &fl6->saddr);
657 out:
658         dst_use(&rt->dst, jiffies);
659         read_unlock_bh(&table->tb6_lock);
660         return rt;
661
662 }
663
664 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
665                             const struct in6_addr *saddr, int oif, int strict)
666 {
667         struct flowi6 fl6 = {
668                 .flowi6_oif = oif,
669                 .daddr = *daddr,
670         };
671         struct dst_entry *dst;
672         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
673
674         if (saddr) {
675                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
676                 flags |= RT6_LOOKUP_F_HAS_SADDR;
677         }
678
679         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
680         if (dst->error == 0)
681                 return (struct rt6_info *) dst;
682
683         dst_release(dst);
684
685         return NULL;
686 }
687
688 EXPORT_SYMBOL(rt6_lookup);
689
690 /* ip6_ins_rt is called with FREE table->tb6_lock.
691    It takes new route entry, the addition fails by any reason the
692    route is freed. In any case, if caller does not hold it, it may
693    be destroyed.
694  */
695
696 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
697 {
698         int err;
699         struct fib6_table *table;
700
701         table = rt->rt6i_table;
702         write_lock_bh(&table->tb6_lock);
703         err = fib6_add(&table->tb6_root, rt, info);
704         write_unlock_bh(&table->tb6_lock);
705
706         return err;
707 }
708
709 int ip6_ins_rt(struct rt6_info *rt)
710 {
711         struct nl_info info = {
712                 .nl_net = dev_net(rt->rt6i_dev),
713         };
714         return __ip6_ins_rt(rt, &info);
715 }
716
717 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
718                                       const struct in6_addr *daddr,
719                                       const struct in6_addr *saddr)
720 {
721         struct rt6_info *rt;
722
723         /*
724          *      Clone the route.
725          */
726
727         rt = ip6_rt_copy(ort, daddr);
728
729         if (rt) {
730                 struct neighbour *neigh;
731                 int attempts = !in_softirq();
732
733                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
734                         if (ort->rt6i_dst.plen != 128 &&
735                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
736                                 rt->rt6i_flags |= RTF_ANYCAST;
737                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
738                 }
739
740                 rt->rt6i_flags |= RTF_CACHE;
741
742 #ifdef CONFIG_IPV6_SUBTREES
743                 if (rt->rt6i_src.plen && saddr) {
744                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
745                         rt->rt6i_src.plen = 128;
746                 }
747 #endif
748
749         retry:
750                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
751                 if (IS_ERR(neigh)) {
752                         struct net *net = dev_net(rt->rt6i_dev);
753                         int saved_rt_min_interval =
754                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
755                         int saved_rt_elasticity =
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
757
758                         if (attempts-- > 0) {
759                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
760                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
761
762                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
763
764                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
765                                         saved_rt_elasticity;
766                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
767                                         saved_rt_min_interval;
768                                 goto retry;
769                         }
770
771                         if (net_ratelimit())
772                                 printk(KERN_WARNING
773                                        "ipv6: Neighbour table overflow.\n");
774                         dst_free(&rt->dst);
775                         return NULL;
776                 }
777                 dst_set_neighbour(&rt->dst, neigh);
778
779         }
780
781         return rt;
782 }
783
784 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
785                                         const struct in6_addr *daddr)
786 {
787         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
788
789         if (rt) {
790                 rt->rt6i_flags |= RTF_CACHE;
791                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
792         }
793         return rt;
794 }
795
796 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
797                                       struct flowi6 *fl6, int flags, bool input)
798 {
799         struct fib6_node *fn;
800         struct rt6_info *rt, *nrt;
801         int strict = 0;
802         int attempts = 3;
803         int err;
804         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805         int local = RTF_NONEXTHOP;
806
807         strict |= flags & RT6_LOOKUP_F_IFACE;
808         if (input)
809                 local |= RTF_LOCAL;
810
811 relookup:
812         read_lock_bh(&table->tb6_lock);
813
814 restart_2:
815         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
816
817 restart:
818         rt = rt6_select(fn, oif, strict | reachable);
819
820         BACKTRACK(net, &fl6->saddr);
821         if (rt == net->ipv6.ip6_null_entry ||
822             rt->rt6i_flags & RTF_CACHE)
823                 goto out;
824
825         dst_hold(&rt->dst);
826         read_unlock_bh(&table->tb6_lock);
827
828         if (!dst_get_neighbour_raw(&rt->dst)
829             && !(rt->rt6i_flags & local))
830                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
831         else if (!(rt->dst.flags & DST_HOST))
832                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
833         else
834                 goto out2;
835
836         dst_release(&rt->dst);
837         rt = nrt ? : net->ipv6.ip6_null_entry;
838
839         dst_hold(&rt->dst);
840         if (nrt) {
841                 err = ip6_ins_rt(nrt);
842                 if (!err)
843                         goto out2;
844         }
845
846         if (--attempts <= 0)
847                 goto out2;
848
849         /*
850          * Race condition! In the gap, when table->tb6_lock was
851          * released someone could insert this route.  Relookup.
852          */
853         dst_release(&rt->dst);
854         goto relookup;
855
856 out:
857         if (reachable) {
858                 reachable = 0;
859                 goto restart_2;
860         }
861         dst_hold(&rt->dst);
862         read_unlock_bh(&table->tb6_lock);
863 out2:
864         rt->dst.lastuse = jiffies;
865         rt->dst.__use++;
866
867         return rt;
868 }
869
870 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
871                                             struct flowi6 *fl6, int flags)
872 {
873         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
874 }
875
876 void ip6_route_input(struct sk_buff *skb)
877 {
878         const struct ipv6hdr *iph = ipv6_hdr(skb);
879         struct net *net = dev_net(skb->dev);
880         int flags = RT6_LOOKUP_F_HAS_SADDR;
881         struct flowi6 fl6 = {
882                 .flowi6_iif = skb->dev->ifindex,
883                 .daddr = iph->daddr,
884                 .saddr = iph->saddr,
885                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
886                 .flowi6_mark = skb->mark,
887                 .flowi6_proto = iph->nexthdr,
888         };
889
890         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
891                 flags |= RT6_LOOKUP_F_IFACE;
892
893         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
894 }
895
896 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
897                                              struct flowi6 *fl6, int flags)
898 {
899         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
900 }
901
902 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
903                                     struct flowi6 *fl6)
904 {
905         int flags = 0;
906
907         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
908                 flags |= RT6_LOOKUP_F_IFACE;
909
910         if (!ipv6_addr_any(&fl6->saddr))
911                 flags |= RT6_LOOKUP_F_HAS_SADDR;
912         else if (sk)
913                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
914
915         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
916 }
917
918 EXPORT_SYMBOL(ip6_route_output);
919
920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
921 {
922         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
923         struct dst_entry *new = NULL;
924
925         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
926         if (rt) {
927                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
928
929                 new = &rt->dst;
930
931                 new->__use = 1;
932                 new->input = dst_discard;
933                 new->output = dst_discard;
934
935                 if (dst_metrics_read_only(&ort->dst))
936                         new->_metrics = ort->dst._metrics;
937                 else
938                         dst_copy_metrics(new, &ort->dst);
939                 rt->rt6i_idev = ort->rt6i_idev;
940                 if (rt->rt6i_idev)
941                         in6_dev_hold(rt->rt6i_idev);
942                 rt->rt6i_expires = 0;
943
944                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
945                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
946                 rt->rt6i_metric = 0;
947
948                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
949 #ifdef CONFIG_IPV6_SUBTREES
950                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
951 #endif
952
953                 dst_free(new);
954         }
955
956         dst_release(dst_orig);
957         return new ? new : ERR_PTR(-ENOMEM);
958 }
959
960 /*
961  *      Destination cache support functions
962  */
963
964 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
965 {
966         struct rt6_info *rt;
967
968         rt = (struct rt6_info *) dst;
969
970         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
971                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
972                         if (!rt->rt6i_peer)
973                                 rt6_bind_peer(rt, 0);
974                         rt->rt6i_peer_genid = rt6_peer_genid();
975                 }
976                 return dst;
977         }
978         return NULL;
979 }
980
981 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
982 {
983         struct rt6_info *rt = (struct rt6_info *) dst;
984
985         if (rt) {
986                 if (rt->rt6i_flags & RTF_CACHE) {
987                         if (rt6_check_expired(rt)) {
988                                 ip6_del_rt(rt);
989                                 dst = NULL;
990                         }
991                 } else {
992                         dst_release(dst);
993                         dst = NULL;
994                 }
995         }
996         return dst;
997 }
998
999 static void ip6_link_failure(struct sk_buff *skb)
1000 {
1001         struct rt6_info *rt;
1002
1003         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1004
1005         rt = (struct rt6_info *) skb_dst(skb);
1006         if (rt) {
1007                 if (rt->rt6i_flags&RTF_CACHE) {
1008                         dst_set_expires(&rt->dst, 0);
1009                         rt->rt6i_flags |= RTF_EXPIRES;
1010                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1011                         rt->rt6i_node->fn_sernum = -1;
1012         }
1013 }
1014
1015 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1016 {
1017         struct rt6_info *rt6 = (struct rt6_info*)dst;
1018
1019         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1020                 rt6->rt6i_flags |= RTF_MODIFIED;
1021                 if (mtu < IPV6_MIN_MTU) {
1022                         u32 features = dst_metric(dst, RTAX_FEATURES);
1023                         mtu = IPV6_MIN_MTU;
1024                         features |= RTAX_FEATURE_ALLFRAG;
1025                         dst_metric_set(dst, RTAX_FEATURES, features);
1026                 }
1027                 dst_metric_set(dst, RTAX_MTU, mtu);
1028         }
1029 }
1030
1031 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1032 {
1033         struct net_device *dev = dst->dev;
1034         unsigned int mtu = dst_mtu(dst);
1035         struct net *net = dev_net(dev);
1036
1037         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1038
1039         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1040                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1041
1042         /*
1043          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1044          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1045          * IPV6_MAXPLEN is also valid and means: "any MSS,
1046          * rely only on pmtu discovery"
1047          */
1048         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1049                 mtu = IPV6_MAXPLEN;
1050         return mtu;
1051 }
1052
1053 static unsigned int ip6_mtu(const struct dst_entry *dst)
1054 {
1055         struct inet6_dev *idev;
1056         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1057
1058         if (mtu)
1059                 goto out;
1060
1061         mtu = IPV6_MIN_MTU;
1062
1063         rcu_read_lock();
1064         idev = __in6_dev_get(dst->dev);
1065         if (idev)
1066                 mtu = idev->cnf.mtu6;
1067         rcu_read_unlock();
1068
1069 out:
1070         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1071 }
1072
/*
 * Head of the singly linked list (chained through dst->next) of the dst
 * entries created by icmp6_dst_alloc().  icmp6_dst_lock is taken around
 * every walk or modification of the list.
 */
1073 static struct dst_entry *icmp6_dst_gc_list;
1074 static DEFINE_SPINLOCK(icmp6_dst_lock);
1075
1076 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1077                                   struct neighbour *neigh,
1078                                   const struct in6_addr *addr)
1079 {
1080         struct rt6_info *rt;
1081         struct inet6_dev *idev = in6_dev_get(dev);
1082         struct net *net = dev_net(dev);
1083
1084         if (unlikely(idev == NULL))
1085                 return NULL;
1086
1087         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1088         if (unlikely(rt == NULL)) {
1089                 in6_dev_put(idev);
1090                 goto out;
1091         }
1092
1093         if (neigh)
1094                 neigh_hold(neigh);
1095         else {
1096                 neigh = ndisc_get_neigh(dev, addr);
1097                 if (IS_ERR(neigh))
1098                         neigh = NULL;
1099         }
1100
1101         rt->dst.flags |= DST_HOST;
1102         rt->dst.output  = ip6_output;
1103         dst_set_neighbour(&rt->dst, neigh);
1104         atomic_set(&rt->dst.__refcnt, 1);
1105         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1106         rt->rt6i_dst.plen = 128;
1107         rt->rt6i_idev     = idev;
1108         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1109
1110         spin_lock_bh(&icmp6_dst_lock);
1111         rt->dst.next = icmp6_dst_gc_list;
1112         icmp6_dst_gc_list = &rt->dst;
1113         spin_unlock_bh(&icmp6_dst_lock);
1114
1115         fib6_force_start_gc(net);
1116
1117 out:
1118         return &rt->dst;
1119 }
1120
1121 int icmp6_dst_gc(void)
1122 {
1123         struct dst_entry *dst, **pprev;
1124         int more = 0;
1125
1126         spin_lock_bh(&icmp6_dst_lock);
1127         pprev = &icmp6_dst_gc_list;
1128
1129         while ((dst = *pprev) != NULL) {
1130                 if (!atomic_read(&dst->__refcnt)) {
1131                         *pprev = dst->next;
1132                         dst_free(dst);
1133                 } else {
1134                         pprev = &dst->next;
1135                         ++more;
1136                 }
1137         }
1138
1139         spin_unlock_bh(&icmp6_dst_lock);
1140
1141         return more;
1142 }
1143
1144 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1145                             void *arg)
1146 {
1147         struct dst_entry *dst, **pprev;
1148
1149         spin_lock_bh(&icmp6_dst_lock);
1150         pprev = &icmp6_dst_gc_list;
1151         while ((dst = *pprev) != NULL) {
1152                 struct rt6_info *rt = (struct rt6_info *) dst;
1153                 if (func(rt, arg)) {
1154                         *pprev = dst->next;
1155                         dst_free(dst);
1156                 } else {
1157                         pprev = &dst->next;
1158                 }
1159         }
1160         spin_unlock_bh(&icmp6_dst_lock);
1161 }
1162
1163 static int ip6_dst_gc(struct dst_ops *ops)
1164 {
1165         unsigned long now = jiffies;
1166         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1167         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1168         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1169         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1170         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1171         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1172         int entries;
1173
1174         entries = dst_entries_get_fast(ops);
1175         if (time_after(rt_last_gc + rt_min_interval, now) &&
1176             entries <= rt_max_size)
1177                 goto out;
1178
1179         net->ipv6.ip6_rt_gc_expire++;
1180         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1181         net->ipv6.ip6_rt_last_gc = now;
1182         entries = dst_entries_get_slow(ops);
1183         if (entries < ops->gc_thresh)
1184                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1185 out:
1186         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1187         return entries > rt_max_size;
1188 }
1189
1190 /* Clean host part of a prefix. Not necessary in radix tree,
1191    but results in cleaner routing tables.
1192
1193    Remove it only when all the things will work!
1194  */
1195
1196 int ip6_dst_hoplimit(struct dst_entry *dst)
1197 {
1198         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1199         if (hoplimit == 0) {
1200                 struct net_device *dev = dst->dev;
1201                 struct inet6_dev *idev;
1202
1203                 rcu_read_lock();
1204                 idev = __in6_dev_get(dev);
1205                 if (idev)
1206                         hoplimit = idev->cnf.hop_limit;
1207                 else
1208                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1209                 rcu_read_unlock();
1210         }
1211         return hoplimit;
1212 }
1213 EXPORT_SYMBOL(ip6_dst_hoplimit);
1214
1215 /*
1216  *
1217  */
1218
/*
 * ip6_route_add - build a route from @cfg and insert it into its FIB table.
 *
 * Validates the prefix lengths, resolves the output device (from
 * cfg->fc_ifindex, the loopback device for reject routes, or the device of
 * the route leading to a non-link-local gateway), fills in a freshly
 * allocated rt6_info and hands it to __ip6_ins_rt().
 *
 * Returns the result of __ip6_ins_rt() on the success path.  On failure
 * returns a negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM,
 * -EHOSTUNREACH or the neighbour lookup error) after dropping the dev/idev
 * references taken here and freeing the route.
 */
1219 int ip6_route_add(struct fib6_config *cfg)
1220 {
1221         int err;
1222         struct net *net = cfg->fc_nlinfo.nl_net;
1223         struct rt6_info *rt = NULL;
1224         struct net_device *dev = NULL;
1225         struct inet6_dev *idev = NULL;
1226         struct fib6_table *table;
1227         int addr_type;
1228
1229         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1230                 return -EINVAL;
1231 #ifndef CONFIG_IPV6_SUBTREES
        /* Source prefixes are only accepted with CONFIG_IPV6_SUBTREES. */
1232         if (cfg->fc_src_len)
1233                 return -EINVAL;
1234 #endif
1235         if (cfg->fc_ifindex) {
1236                 err = -ENODEV;
1237                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1238                 if (!dev)
1239                         goto out;
1240                 idev = in6_dev_get(dev);
1241                 if (!idev)
1242                         goto out;
1243         }
1244
        /* A metric of 0 means "not specified": use the user default. */
1245         if (cfg->fc_metric == 0)
1246                 cfg->fc_metric = IP6_RT_PRIO_USER;
1247
1248         table = fib6_new_table(net, cfg->fc_table);
1249         if (table == NULL) {
1250                 err = -ENOBUFS;
1251                 goto out;
1252         }
1253
1254         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1255
1256         if (rt == NULL) {
1257                 err = -ENOMEM;
1258                 goto out;
1259         }
1260
1261         rt->dst.obsolete = -1;
1262         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1263                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1264                                 0;
1265
1266         if (cfg->fc_protocol == RTPROT_UNSPEC)
1267                 cfg->fc_protocol = RTPROT_BOOT;
1268         rt->rt6i_protocol = cfg->fc_protocol;
1269
1270         addr_type = ipv6_addr_type(&cfg->fc_dst);
1271
        /* Pick the input handler from the destination type and RTF_LOCAL. */
1272         if (addr_type & IPV6_ADDR_MULTICAST)
1273                 rt->dst.input = ip6_mc_input;
1274         else if (cfg->fc_flags & RTF_LOCAL)
1275                 rt->dst.input = ip6_input;
1276         else
1277                 rt->dst.input = ip6_forward;
1278
1279         rt->dst.output = ip6_output;
1280
1281         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1282         rt->rt6i_dst.plen = cfg->fc_dst_len;
1283         if (rt->rt6i_dst.plen == 128)
1284                rt->dst.flags |= DST_HOST;
1285
        /* Non-host routes with user-supplied metrics get their own zeroed metrics array. */
1286         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1287                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1288                 if (!metrics) {
1289                         err = -ENOMEM;
1290                         goto out;
1291                 }
1292                 dst_init_metrics(&rt->dst, metrics, 0);
1293         }
1294 #ifdef CONFIG_IPV6_SUBTREES
1295         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1296         rt->rt6i_src.plen = cfg->fc_src_len;
1297 #endif
1298
1299         rt->rt6i_metric = cfg->fc_metric;
1300
1301         /* We cannot add true routes via loopback here,
1302            they would result in kernel looping; promote them to reject routes
1303          */
1304         if ((cfg->fc_flags & RTF_REJECT) ||
1305             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1306                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1307                 /* hold loopback dev/idev if we haven't done so. */
1308                 if (dev != net->loopback_dev) {
1309                         if (dev) {
1310                                 dev_put(dev);
1311                                 in6_dev_put(idev);
1312                         }
1313                         dev = net->loopback_dev;
1314                         dev_hold(dev);
1315                         idev = in6_dev_get(dev);
1316                         if (!idev) {
1317                                 err = -ENODEV;
1318                                 goto out;
1319                         }
1320                 }
1321                 rt->dst.output = ip6_pkt_discard_out;
1322                 rt->dst.input = ip6_pkt_discard;
1323                 rt->dst.error = -ENETUNREACH;
1324                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1325                 goto install_route;
1326         }
1327
1328         if (cfg->fc_flags & RTF_GATEWAY) {
1329                 const struct in6_addr *gw_addr;
1330                 int gwa_type;
1331
1332                 gw_addr = &cfg->fc_gateway;
1333                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1334                 gwa_type = ipv6_addr_type(gw_addr);
1335
1336                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1337                         struct rt6_info *grt;
1338
1339                         /* IPv6 strictly inhibits using not link-local
1340                            addresses as nexthop address.
1341                            Otherwise, router will not able to send redirects.
1342                            It is very good, but in some (rare!) circumstances
1343                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1344                            some exceptions. --ANK
1345                          */
1346                         err = -EINVAL;
1347                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1348                                 goto out;
1349
1350                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1351
1352                         err = -EHOSTUNREACH;
1353                         if (grt == NULL)
1354                                 goto out;
1355                         if (dev) {
1356                                 if (dev != grt->rt6i_dev) {
1357                                         dst_release(&grt->dst);
1358                                         goto out;
1359                                 }
1360                         } else {
                                /* No device given: adopt the one of the route to the gateway. */
1361                                 dev = grt->rt6i_dev;
1362                                 idev = grt->rt6i_idev;
1363                                 dev_hold(dev);
1364                                 in6_dev_hold(grt->rt6i_idev);
1365                         }
                        /*
                         * The gateway is accepted only if the route to it is
                         * not itself gatewayed; otherwise err stays
                         * -EHOSTUNREACH.
                         */
1366                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1367                                 err = 0;
1368                         dst_release(&grt->dst);
1369
1370                         if (err)
1371                                 goto out;
1372                 }
1373                 err = -EINVAL;
1374                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1375                         goto out;
1376         }
1377
1378         err = -ENODEV;
1379         if (dev == NULL)
1380                 goto out;
1381
        /* A preferred source address must pass ipv6_chk_addr() for this device. */
1382         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1383                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1384                         err = -EINVAL;
1385                         goto out;
1386                 }
1387                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1388                 rt->rt6i_prefsrc.plen = 128;
1389         } else
1390                 rt->rt6i_prefsrc.plen = 0;
1391
1392         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1393                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1394                 if (IS_ERR(n)) {
1395                         err = PTR_ERR(n);
1396                         goto out;
1397                 }
1398                 dst_set_neighbour(&rt->dst, n);
1399         }
1400
1401         rt->rt6i_flags = cfg->fc_flags;
1402
1403 install_route:
        /* Apply user metrics: type 0 is skipped, types above RTAX_MAX are rejected. */
1404         if (cfg->fc_mx) {
1405                 struct nlattr *nla;
1406                 int remaining;
1407
1408                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1409                         int type = nla_type(nla);
1410
1411                         if (type) {
1412                                 if (type > RTAX_MAX) {
1413                                         err = -EINVAL;
1414                                         goto out;
1415                                 }
1416
1417                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1418                         }
1419                 }
1420         }
1421
        /* The dev/idev references held here are stored in the route. */
1422         rt->dst.dev = dev;
1423         rt->rt6i_idev = idev;
1424         rt->rt6i_table = table;
1425
1426         cfg->fc_nlinfo.nl_net = dev_net(dev);
1427
1428         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1429
        /* Error exit: release whatever has been acquired so far. */
1430 out:
1431         if (dev)
1432                 dev_put(dev);
1433         if (idev)
1434                 in6_dev_put(idev);
1435         if (rt)
1436                 dst_free(&rt->dst);
1437         return err;
1438 }
1439
1440 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1441 {
1442         int err;
1443         struct fib6_table *table;
1444         struct net *net = dev_net(rt->rt6i_dev);
1445
1446         if (rt == net->ipv6.ip6_null_entry) {
1447                 err = -ENOENT;
1448                 goto out;
1449         }
1450
1451         table = rt->rt6i_table;
1452         write_lock_bh(&table->tb6_lock);
1453         err = fib6_del(rt, info);
1454         write_unlock_bh(&table->tb6_lock);
1455
1456 out:
1457         dst_release(&rt->dst);
1458         return err;
1459 }
1460
1461 int ip6_del_rt(struct rt6_info *rt)
1462 {
1463         struct nl_info info = {
1464                 .nl_net = dev_net(rt->rt6i_dev),
1465         };
1466         return __ip6_del_rt(rt, &info);
1467 }
1468
1469 static int ip6_route_del(struct fib6_config *cfg)
1470 {
1471         struct fib6_table *table;
1472         struct fib6_node *fn;
1473         struct rt6_info *rt;
1474         int err = -ESRCH;
1475
1476         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1477         if (table == NULL)
1478                 return err;
1479
1480         read_lock_bh(&table->tb6_lock);
1481
1482         fn = fib6_locate(&table->tb6_root,
1483                          &cfg->fc_dst, cfg->fc_dst_len,
1484                          &cfg->fc_src, cfg->fc_src_len);
1485
1486         if (fn) {
1487                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1488                         if (cfg->fc_ifindex &&
1489                             (rt->rt6i_dev == NULL ||
1490                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1491                                 continue;
1492                         if (cfg->fc_flags & RTF_GATEWAY &&
1493                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1494                                 continue;
1495                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1496                                 continue;
1497                         dst_hold(&rt->dst);
1498                         read_unlock_bh(&table->tb6_lock);
1499
1500                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1501                 }
1502         }
1503         read_unlock_bh(&table->tb6_lock);
1504
1505         return err;
1506 }
1507
1508 /*
1509  *      Handle redirects
1510  */
/*
 * Lookup key for redirect handling: a flowi6 followed by the gateway
 * address that candidate routes are compared against.
 * __ip6_route_redirect() is handed a pointer to the embedded fl6 and casts
 * it back to this structure, so fl6 must remain the first member.
 */
1511 struct ip6rd_flowi {
1512         struct flowi6 fl6;
1513         struct in6_addr gateway;
1514 };
1515
1516 static struct rt6_info *__ip6_route_redirect(struct net *net,
1517                                              struct fib6_table *table,
1518                                              struct flowi6 *fl6,
1519                                              int flags)
1520 {
1521         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1522         struct rt6_info *rt;
1523         struct fib6_node *fn;
1524
1525         /*
1526          * Get the "current" route for this destination and
1527          * check if the redirect has come from approriate router.
1528          *
1529          * RFC 2461 specifies that redirects should only be
1530          * accepted if they come from the nexthop to the target.
1531          * Due to the way the routes are chosen, this notion
1532          * is a bit fuzzy and one might need to check all possible
1533          * routes.
1534          */
1535
1536         read_lock_bh(&table->tb6_lock);
1537         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1538 restart:
1539         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1540                 /*
1541                  * Current route is on-link; redirect is always invalid.
1542                  *
1543                  * Seems, previous statement is not true. It could
1544                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1545                  * But then router serving it might decide, that we should
1546                  * know truth 8)8) --ANK (980726).
1547                  */
1548                 if (rt6_check_expired(rt))
1549                         continue;
1550                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1551                         continue;
1552                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1553                         continue;
1554                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1555                         continue;
1556                 break;
1557         }
1558
1559         if (!rt)
1560                 rt = net->ipv6.ip6_null_entry;
1561         BACKTRACK(net, &fl6->saddr);
1562 out:
1563         dst_hold(&rt->dst);
1564
1565         read_unlock_bh(&table->tb6_lock);
1566
1567         return rt;
1568 };
1569
1570 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1571                                            const struct in6_addr *src,
1572                                            const struct in6_addr *gateway,
1573                                            struct net_device *dev)
1574 {
1575         int flags = RT6_LOOKUP_F_HAS_SADDR;
1576         struct net *net = dev_net(dev);
1577         struct ip6rd_flowi rdfl = {
1578                 .fl6 = {
1579                         .flowi6_oif = dev->ifindex,
1580                         .daddr = *dest,
1581                         .saddr = *src,
1582                 },
1583         };
1584
1585         ipv6_addr_copy(&rdfl.gateway, gateway);
1586
1587         if (rt6_need_strict(dest))
1588                 flags |= RT6_LOOKUP_F_IFACE;
1589
1590         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1591                                                    flags, __ip6_route_redirect);
1592 }
1593
/*
 * rt6_redirect - act on a redirect for @dest received via @neigh.
 *
 * Looks up the current gateway route for (@dest, @src) on neigh->dev whose
 * gateway is @saddr.  If there is none, the redirect is logged (rate
 * limited) and dropped.  Otherwise the neighbour entry is updated with
 * @lladdr and, unless @neigh already is the route's neighbour, a copy of
 * the route for @dest is created with @neigh as next hop, inserted, and
 * announced through NETEVENT_REDIRECT.  An old route that was itself a
 * cache entry is deleted afterwards.
 */
1594 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1595                   const struct in6_addr *saddr,
1596                   struct neighbour *neigh, u8 *lladdr, int on_link)
1597 {
1598         struct rt6_info *rt, *nrt = NULL;
1599         struct netevent_redirect netevent;
1600         struct net *net = dev_net(neigh->dev);
1601
1602         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1603
1604         if (rt == net->ipv6.ip6_null_entry) {
1605                 if (net_ratelimit())
1606                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1607                                "for redirect target\n");
1608                 goto out;
1609         }
1610
1611         /*
1612          *      We have finally decided to accept it.
1613          */
1614
1615         neigh_update(neigh, lladdr, NUD_STALE,
1616                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1617                      NEIGH_UPDATE_F_OVERRIDE|
1618                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1619                                      NEIGH_UPDATE_F_ISROUTER))
1620                      );
1621
1622         /*
1623          * Redirect received -> path was valid.
1624          * Look, redirects are sent only in response to data packets,
1625          * so that this nexthop apparently is reachable. --ANK
1626          */
1627         dst_confirm(&rt->dst);
1628
1629         /* Duplicate redirect: silently ignore. */
1630         if (neigh == dst_get_neighbour_raw(&rt->dst))
1631                 goto out;
1632
1633         nrt = ip6_rt_copy(rt, dest);
1634         if (nrt == NULL)
1635                 goto out;
1636
        /* The new entry is a dynamic cache route; it is a gateway route unless the target is on-link. */
1637         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1638         if (on_link)
1639                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1640
1641         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1642         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1643
1644         if (ip6_ins_rt(nrt))
1645                 goto out;
1646
1647         netevent.old = &rt->dst;
1648         netevent.new = &nrt->dst;
1649         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1650
        /*
         * ip6_del_rt() ends in dst_release() on the route, so the lookup
         * reference is already gone here; return without passing through
         * the release at out.
         */
1651         if (rt->rt6i_flags&RTF_CACHE) {
1652                 ip6_del_rt(rt);
1653                 return;
1654         }
1655
1656 out:
1657         dst_release(&rt->dst);
1658 }
1659
1660 /*
1661  *      Handle ICMP "packet too big" messages
1662  *      i.e. Path MTU discovery
1663  */
1664
1665 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1666                              struct net *net, u32 pmtu, int ifindex)
1667 {
1668         struct rt6_info *rt, *nrt;
1669         int allfrag = 0;
1670 again:
1671         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1672         if (rt == NULL)
1673                 return;
1674
1675         if (rt6_check_expired(rt)) {
1676                 ip6_del_rt(rt);
1677                 goto again;
1678         }
1679
1680         if (pmtu >= dst_mtu(&rt->dst))
1681                 goto out;
1682
1683         if (pmtu < IPV6_MIN_MTU) {
1684                 /*
1685                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1686                  * MTU (1280) and a fragment header should always be included
1687                  * after a node receiving Too Big message reporting PMTU is
1688                  * less than the IPv6 Minimum Link MTU.
1689                  */
1690                 pmtu = IPV6_MIN_MTU;
1691                 allfrag = 1;
1692         }
1693
1694         /* New mtu received -> path was valid.
1695            They are sent only in response to data packets,
1696            so that this nexthop apparently is reachable. --ANK
1697          */
1698         dst_confirm(&rt->dst);
1699
1700         /* Host route. If it is static, it would be better
1701            not to override it, but add new one, so that
1702            when cache entry will expire old pmtu
1703            would return automatically.
1704          */
1705         if (rt->rt6i_flags & RTF_CACHE) {
1706                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1707                 if (allfrag) {
1708                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1709                         features |= RTAX_FEATURE_ALLFRAG;
1710                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1711                 }
1712                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1713                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1714                 goto out;
1715         }
1716
1717         /* Network route.
1718            Two cases are possible:
1719            1. It is connected route. Action: COW
1720            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1721          */
1722         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1723                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1724         else
1725                 nrt = rt6_alloc_clone(rt, daddr);
1726
1727         if (nrt) {
1728                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1729                 if (allfrag) {
1730                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1731                         features |= RTAX_FEATURE_ALLFRAG;
1732                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1733                 }
1734
1735                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1736                  * happened within 5 mins, the recommended timer is 10 mins.
1737                  * Here this route expiration time is set to ip6_rt_mtu_expires
1738                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1739                  * and detecting PMTU increase will be automatically happened.
1740                  */
1741                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1742                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1743
1744                 ip6_ins_rt(nrt);
1745         }
1746 out:
1747         dst_release(&rt->dst);
1748 }
1749
1750 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1751                         struct net_device *dev, u32 pmtu)
1752 {
1753         struct net *net = dev_net(dev);
1754
1755         /*
1756          * RFC 1981 states that a node "MUST reduce the size of the packets it
1757          * is sending along the path" that caused the Packet Too Big message.
1758          * Since it's not possible in the general case to determine which
1759          * interface was used to send the original packet, we update the MTU
1760          * on the interface that will be used to send future packets. We also
1761          * update the MTU on the interface that received the Packet Too Big in
1762          * case the original packet was forced out that interface with
1763          * SO_BINDTODEVICE or similar. This is the next best thing to the
1764          * correct behaviour, which would be to update the MTU on all
1765          * interfaces.
1766          */
1767         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1768         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1769 }
1770
1771 /*
1772  *      Misc support functions
1773  */
1774
1775 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1776                                     const struct in6_addr *dest)
1777 {
1778         struct net *net = dev_net(ort->rt6i_dev);
1779         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1780                                             ort->dst.dev, 0);
1781
1782         if (rt) {
1783                 rt->dst.input = ort->dst.input;
1784                 rt->dst.output = ort->dst.output;
1785                 rt->dst.flags |= DST_HOST;
1786
1787                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1788                 rt->rt6i_dst.plen = 128;
1789                 dst_copy_metrics(&rt->dst, &ort->dst);
1790                 rt->dst.error = ort->dst.error;
1791                 rt->rt6i_idev = ort->rt6i_idev;
1792                 if (rt->rt6i_idev)
1793                         in6_dev_hold(rt->rt6i_idev);
1794                 rt->dst.lastuse = jiffies;
1795                 rt->rt6i_expires = 0;
1796
1797                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1798                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1799                 rt->rt6i_metric = 0;
1800
1801 #ifdef CONFIG_IPV6_SUBTREES
1802                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1803 #endif
1804                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1805                 rt->rt6i_table = ort->rt6i_table;
1806         }
1807         return rt;
1808 }
1809
1810 #ifdef CONFIG_IPV6_ROUTE_INFO
1811 static struct rt6_info *rt6_get_route_info(struct net *net,
1812                                            const struct in6_addr *prefix, int prefixlen,
1813                                            const struct in6_addr *gwaddr, int ifindex)
1814 {
1815         struct fib6_node *fn;
1816         struct rt6_info *rt = NULL;
1817         struct fib6_table *table;
1818
1819         table = fib6_get_table(net, RT6_TABLE_INFO);
1820         if (table == NULL)
1821                 return NULL;
1822
1823         write_lock_bh(&table->tb6_lock);
1824         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1825         if (!fn)
1826                 goto out;
1827
1828         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1829                 if (rt->rt6i_dev->ifindex != ifindex)
1830                         continue;
1831                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1832                         continue;
1833                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1834                         continue;
1835                 dst_hold(&rt->dst);
1836                 break;
1837         }
1838 out:
1839         write_unlock_bh(&table->tb6_lock);
1840         return rt;
1841 }
1842
1843 static struct rt6_info *rt6_add_route_info(struct net *net,
1844                                            const struct in6_addr *prefix, int prefixlen,
1845                                            const struct in6_addr *gwaddr, int ifindex,
1846                                            unsigned pref)
1847 {
1848         struct fib6_config cfg = {
1849                 .fc_table       = RT6_TABLE_INFO,
1850                 .fc_metric      = IP6_RT_PRIO_USER,
1851                 .fc_ifindex     = ifindex,
1852                 .fc_dst_len     = prefixlen,
1853                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1854                                   RTF_UP | RTF_PREF(pref),
1855                 .fc_nlinfo.pid = 0,
1856                 .fc_nlinfo.nlh = NULL,
1857                 .fc_nlinfo.nl_net = net,
1858         };
1859
1860         ipv6_addr_copy(&cfg.fc_dst, prefix);
1861         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1862
1863         /* We should treat it as a default route if prefix length is 0. */
1864         if (!prefixlen)
1865                 cfg.fc_flags |= RTF_DEFAULT;
1866
1867         ip6_route_add(&cfg);
1868
1869         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1870 }
1871 #endif
1872
1873 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1874 {
1875         struct rt6_info *rt;
1876         struct fib6_table *table;
1877
1878         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1879         if (table == NULL)
1880                 return NULL;
1881
1882         write_lock_bh(&table->tb6_lock);
1883         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1884                 if (dev == rt->rt6i_dev &&
1885                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1886                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1887                         break;
1888         }
1889         if (rt)
1890                 dst_hold(&rt->dst);
1891         write_unlock_bh(&table->tb6_lock);
1892         return rt;
1893 }
1894
1895 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1896                                      struct net_device *dev,
1897                                      unsigned int pref)
1898 {
1899         struct fib6_config cfg = {
1900                 .fc_table       = RT6_TABLE_DFLT,
1901                 .fc_metric      = IP6_RT_PRIO_USER,
1902                 .fc_ifindex     = dev->ifindex,
1903                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1904                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1905                 .fc_nlinfo.pid = 0,
1906                 .fc_nlinfo.nlh = NULL,
1907                 .fc_nlinfo.nl_net = dev_net(dev),
1908         };
1909
1910         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1911
1912         ip6_route_add(&cfg);
1913
1914         return rt6_get_dflt_router(gwaddr, dev);
1915 }
1916
1917 void rt6_purge_dflt_routers(struct net *net)
1918 {
1919         struct rt6_info *rt;
1920         struct fib6_table *table;
1921
1922         /* NOTE: Keep consistent with rt6_get_dflt_router */
1923         table = fib6_get_table(net, RT6_TABLE_DFLT);
1924         if (table == NULL)
1925                 return;
1926
1927 restart:
1928         read_lock_bh(&table->tb6_lock);
1929         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1930                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1931                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1932                         dst_hold(&rt->dst);
1933                         read_unlock_bh(&table->tb6_lock);
1934                         ip6_del_rt(rt);
1935                         goto restart;
1936                 }
1937         }
1938         read_unlock_bh(&table->tb6_lock);
1939 }
1940
1941 static void rtmsg_to_fib6_config(struct net *net,
1942                                  struct in6_rtmsg *rtmsg,
1943                                  struct fib6_config *cfg)
1944 {
1945         memset(cfg, 0, sizeof(*cfg));
1946
1947         cfg->fc_table = RT6_TABLE_MAIN;
1948         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1949         cfg->fc_metric = rtmsg->rtmsg_metric;
1950         cfg->fc_expires = rtmsg->rtmsg_info;
1951         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1952         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1953         cfg->fc_flags = rtmsg->rtmsg_flags;
1954
1955         cfg->fc_nlinfo.nl_net = net;
1956
1957         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1958         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1959         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1960 }
1961
1962 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1963 {
1964         struct fib6_config cfg;
1965         struct in6_rtmsg rtmsg;
1966         int err;
1967
1968         switch(cmd) {
1969         case SIOCADDRT:         /* Add a route */
1970         case SIOCDELRT:         /* Delete a route */
1971                 if (!capable(CAP_NET_ADMIN))
1972                         return -EPERM;
1973                 err = copy_from_user(&rtmsg, arg,
1974                                      sizeof(struct in6_rtmsg));
1975                 if (err)
1976                         return -EFAULT;
1977
1978                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1979
1980                 rtnl_lock();
1981                 switch (cmd) {
1982                 case SIOCADDRT:
1983                         err = ip6_route_add(&cfg);
1984                         break;
1985                 case SIOCDELRT:
1986                         err = ip6_route_del(&cfg);
1987                         break;
1988                 default:
1989                         err = -EINVAL;
1990                 }
1991                 rtnl_unlock();
1992
1993                 return err;
1994         }
1995
1996         return -EINVAL;
1997 }
1998
1999 /*
2000  *      Drop the packet on the floor
2001  */
2002
2003 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2004 {
2005         int type;
2006         struct dst_entry *dst = skb_dst(skb);
2007         switch (ipstats_mib_noroutes) {
2008         case IPSTATS_MIB_INNOROUTES:
2009                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2010                 if (type == IPV6_ADDR_ANY) {
2011                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2012                                       IPSTATS_MIB_INADDRERRORS);
2013                         break;
2014                 }
2015                 /* FALLTHROUGH */
2016         case IPSTATS_MIB_OUTNOROUTES:
2017                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2018                               ipstats_mib_noroutes);
2019                 break;
2020         }
2021         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2022         kfree_skb(skb);
2023         return 0;
2024 }
2025
2026 static int ip6_pkt_discard(struct sk_buff *skb)
2027 {
2028         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2029 }
2030
2031 static int ip6_pkt_discard_out(struct sk_buff *skb)
2032 {
2033         skb->dev = skb_dst(skb)->dev;
2034         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2035 }
2036
2037 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2038
2039 static int ip6_pkt_prohibit(struct sk_buff *skb)
2040 {
2041         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2042 }
2043
2044 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2045 {
2046         skb->dev = skb_dst(skb)->dev;
2047         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2048 }
2049
2050 #endif
2051
2052 /*
2053  *      Allocate a dst for local (unicast / anycast) address.
2054  */
2055
2056 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2057                                     const struct in6_addr *addr,
2058                                     int anycast)
2059 {
2060         struct net *net = dev_net(idev->dev);
2061         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2062                                             net->loopback_dev, DST_NOCOUNT);
2063         struct neighbour *neigh;
2064
2065         if (rt == NULL)
2066                 return ERR_PTR(-ENOMEM);
2067
2068         in6_dev_hold(idev);
2069
2070         rt->dst.flags |= DST_HOST;
2071         rt->dst.input = ip6_input;
2072         rt->dst.output = ip6_output;
2073         rt->rt6i_idev = idev;
2074         rt->dst.obsolete = -1;
2075
2076         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2077         if (anycast)
2078                 rt->rt6i_flags |= RTF_ANYCAST;
2079         else
2080                 rt->rt6i_flags |= RTF_LOCAL;
2081         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2082         if (IS_ERR(neigh)) {
2083                 dst_free(&rt->dst);
2084
2085                 return ERR_CAST(neigh);
2086         }
2087         dst_set_neighbour(&rt->dst, neigh);
2088
2089         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2090         rt->rt6i_dst.plen = 128;
2091         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2092
2093         atomic_set(&rt->dst.__refcnt, 1);
2094
2095         return rt;
2096 }
2097
2098 int ip6_route_get_saddr(struct net *net,
2099                         struct rt6_info *rt,
2100                         const struct in6_addr *daddr,
2101                         unsigned int prefs,
2102                         struct in6_addr *saddr)
2103 {
2104         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2105         int err = 0;
2106         if (rt->rt6i_prefsrc.plen)
2107                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2108         else
2109                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2110                                          daddr, prefs, saddr);
2111         return err;
2112 }
2113
2114 /* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace being cleaned */
	struct in6_addr *addr;		/* address being removed */
};
2120
2121 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2122 {
2123         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2124         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2125         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2126
2127         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2128             rt != net->ipv6.ip6_null_entry &&
2129             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2130                 /* remove prefsrc entry */
2131                 rt->rt6i_prefsrc.plen = 0;
2132         }
2133         return 0;
2134 }
2135
2136 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2137 {
2138         struct net *net = dev_net(ifp->idev->dev);
2139         struct arg_dev_net_ip adni = {
2140                 .dev = ifp->idev->dev,
2141                 .net = net,
2142                 .addr = &ifp->addr,
2143         };
2144         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2145 }
2146
/* Argument bundle handed to fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device to match; NULL matches any */
	struct net *net;		/* namespace whose null entry is spared */
};
2151
2152 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2153 {
2154         const struct arg_dev_net *adn = arg;
2155         const struct net_device *dev = adn->dev;
2156
2157         if ((rt->rt6i_dev == dev || dev == NULL) &&
2158             rt != adn->net->ipv6.ip6_null_entry) {
2159                 RT6_TRACE("deleted by ifdown %p\n", rt);
2160                 return -1;
2161         }
2162         return 0;
2163 }
2164
2165 void rt6_ifdown(struct net *net, struct net_device *dev)
2166 {
2167         struct arg_dev_net adn = {
2168                 .dev = dev,
2169                 .net = net,
2170         };
2171
2172         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2173         icmp6_clean_all(fib6_ifdown, &adn);
2174 }
2175
/* Argument bundle handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* the new MTU */
};
2181
2182 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2183 {
2184         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2185         struct inet6_dev *idev;
2186
2187         /* In IPv6 pmtu discovery is not optional,
2188            so that RTAX_MTU lock cannot disable it.
2189            We still use this lock to block changes
2190            caused by addrconf/ndisc.
2191         */
2192
2193         idev = __in6_dev_get(arg->dev);
2194         if (idev == NULL)
2195                 return 0;
2196
2197         /* For administrative MTU increase, there is no way to discover
2198            IPv6 PMTU increase, so PMTU increase should be updated here.
2199            Since RFC 1981 doesn't include administrative MTU increase
2200            update PMTU increase is a MUST. (i.e. jumbo frame)
2201          */
2202         /*
2203            If new MTU is less than route PMTU, this new MTU will be the
2204            lowest MTU in the path, update the route PMTU to reflect PMTU
2205            decreases; if new MTU is greater than route PMTU, and the
2206            old MTU is the lowest MTU in the path, update the route PMTU
2207            to reflect the increase. In this case if the other nodes' MTU
2208            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2209            PMTU discouvery.
2210          */
2211         if (rt->rt6i_dev == arg->dev &&
2212             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2213             (dst_mtu(&rt->dst) >= arg->mtu ||
2214              (dst_mtu(&rt->dst) < arg->mtu &&
2215               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2216                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2217         }
2218         return 0;
2219 }
2220
2221 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2222 {
2223         struct rt6_mtu_change_arg arg = {
2224                 .dev = dev,
2225                 .mtu = mtu,
2226         };
2227
2228         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2229 }
2230
2231 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2232         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2233         [RTA_OIF]               = { .type = NLA_U32 },
2234         [RTA_IIF]               = { .type = NLA_U32 },
2235         [RTA_PRIORITY]          = { .type = NLA_U32 },
2236         [RTA_METRICS]           = { .type = NLA_NESTED },
2237 };
2238
2239 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2240                               struct fib6_config *cfg)
2241 {
2242         struct rtmsg *rtm;
2243         struct nlattr *tb[RTA_MAX+1];
2244         int err;
2245
2246         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2247         if (err < 0)
2248                 goto errout;
2249
2250         err = -EINVAL;
2251         rtm = nlmsg_data(nlh);
2252         memset(cfg, 0, sizeof(*cfg));
2253
2254         cfg->fc_table = rtm->rtm_table;
2255         cfg->fc_dst_len = rtm->rtm_dst_len;
2256         cfg->fc_src_len = rtm->rtm_src_len;
2257         cfg->fc_flags = RTF_UP;
2258         cfg->fc_protocol = rtm->rtm_protocol;
2259
2260         if (rtm->rtm_type == RTN_UNREACHABLE)
2261                 cfg->fc_flags |= RTF_REJECT;
2262
2263         if (rtm->rtm_type == RTN_LOCAL)
2264                 cfg->fc_flags |= RTF_LOCAL;
2265
2266         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2267         cfg->fc_nlinfo.nlh = nlh;
2268         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2269
2270         if (tb[RTA_GATEWAY]) {
2271                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2272                 cfg->fc_flags |= RTF_GATEWAY;
2273         }
2274
2275         if (tb[RTA_DST]) {
2276                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2277
2278                 if (nla_len(tb[RTA_DST]) < plen)
2279                         goto errout;
2280
2281                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2282         }
2283
2284         if (tb[RTA_SRC]) {
2285                 int plen = (rtm->rtm_src_len + 7) >> 3;
2286
2287                 if (nla_len(tb[RTA_SRC]) < plen)
2288                         goto errout;
2289
2290                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2291         }
2292
2293         if (tb[RTA_PREFSRC])
2294                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2295
2296         if (tb[RTA_OIF])
2297                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2298
2299         if (tb[RTA_PRIORITY])
2300                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2301
2302         if (tb[RTA_METRICS]) {
2303                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2304                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2305         }
2306
2307         if (tb[RTA_TABLE])
2308                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2309
2310         err = 0;
2311 errout:
2312         return err;
2313 }
2314
2315 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2316 {
2317         struct fib6_config cfg;
2318         int err;
2319
2320         err = rtm_to_fib6_config(skb, nlh, &cfg);
2321         if (err < 0)
2322                 return err;
2323
2324         return ip6_route_del(&cfg);
2325 }
2326
2327 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2328 {
2329         struct fib6_config cfg;
2330         int err;
2331
2332         err = rtm_to_fib6_config(skb, nlh, &cfg);
2333         if (err < 0)
2334                 return err;
2335
2336         return ip6_route_add(&cfg);
2337 }
2338
2339 static inline size_t rt6_nlmsg_size(void)
2340 {
2341         return NLMSG_ALIGN(sizeof(struct rtmsg))
2342                + nla_total_size(16) /* RTA_SRC */
2343                + nla_total_size(16) /* RTA_DST */
2344                + nla_total_size(16) /* RTA_GATEWAY */
2345                + nla_total_size(16) /* RTA_PREFSRC */
2346                + nla_total_size(4) /* RTA_TABLE */
2347                + nla_total_size(4) /* RTA_IIF */
2348                + nla_total_size(4) /* RTA_OIF */
2349                + nla_total_size(4) /* RTA_PRIORITY */
2350                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2351                + nla_total_size(sizeof(struct rta_cacheinfo));
2352 }
2353
2354 static int rt6_fill_node(struct net *net,
2355                          struct sk_buff *skb, struct rt6_info *rt,
2356                          struct in6_addr *dst, struct in6_addr *src,
2357                          int iif, int type, u32 pid, u32 seq,
2358                          int prefix, int nowait, unsigned int flags)
2359 {
2360         struct rtmsg *rtm;
2361         struct nlmsghdr *nlh;
2362         long expires;
2363         u32 table;
2364         struct neighbour *n;
2365
2366         if (prefix) {   /* user wants prefix routes only */
2367                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2368                         /* success since this is not a prefix route */
2369                         return 1;
2370                 }
2371         }
2372
2373         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2374         if (nlh == NULL)
2375                 return -EMSGSIZE;
2376
2377         rtm = nlmsg_data(nlh);
2378         rtm->rtm_family = AF_INET6;
2379         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2380         rtm->rtm_src_len = rt->rt6i_src.plen;
2381         rtm->rtm_tos = 0;
2382         if (rt->rt6i_table)
2383                 table = rt->rt6i_table->tb6_id;
2384         else
2385                 table = RT6_TABLE_UNSPEC;
2386         rtm->rtm_table = table;
2387         NLA_PUT_U32(skb, RTA_TABLE, table);
2388         if (rt->rt6i_flags&RTF_REJECT)
2389                 rtm->rtm_type = RTN_UNREACHABLE;
2390         else if (rt->rt6i_flags&RTF_LOCAL)
2391                 rtm->rtm_type = RTN_LOCAL;
2392         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2393                 rtm->rtm_type = RTN_LOCAL;
2394         else
2395                 rtm->rtm_type = RTN_UNICAST;
2396         rtm->rtm_flags = 0;
2397         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2398         rtm->rtm_protocol = rt->rt6i_protocol;
2399         if (rt->rt6i_flags&RTF_DYNAMIC)
2400                 rtm->rtm_protocol = RTPROT_REDIRECT;
2401         else if (rt->rt6i_flags & RTF_ADDRCONF)
2402                 rtm->rtm_protocol = RTPROT_KERNEL;
2403         else if (rt->rt6i_flags&RTF_DEFAULT)
2404                 rtm->rtm_protocol = RTPROT_RA;
2405
2406         if (rt->rt6i_flags&RTF_CACHE)
2407                 rtm->rtm_flags |= RTM_F_CLONED;
2408
2409         if (dst) {
2410                 NLA_PUT(skb, RTA_DST, 16, dst);
2411                 rtm->rtm_dst_len = 128;
2412         } else if (rtm->rtm_dst_len)
2413                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2414 #ifdef CONFIG_IPV6_SUBTREES
2415         if (src) {
2416                 NLA_PUT(skb, RTA_SRC, 16, src);
2417                 rtm->rtm_src_len = 128;
2418         } else if (rtm->rtm_src_len)
2419                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2420 #endif
2421         if (iif) {
2422 #ifdef CONFIG_IPV6_MROUTE
2423                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2424                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2425                         if (err <= 0) {
2426                                 if (!nowait) {
2427                                         if (err == 0)
2428                                                 return 0;
2429                                         goto nla_put_failure;
2430                                 } else {
2431                                         if (err == -EMSGSIZE)
2432                                                 goto nla_put_failure;
2433                                 }
2434                         }
2435                 } else
2436 #endif
2437                         NLA_PUT_U32(skb, RTA_IIF, iif);
2438         } else if (dst) {
2439                 struct in6_addr saddr_buf;
2440                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2441                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2442         }
2443
2444         if (rt->rt6i_prefsrc.plen) {
2445                 struct in6_addr saddr_buf;
2446                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2447                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2448         }
2449
2450         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2451                 goto nla_put_failure;
2452
2453         rcu_read_lock();
2454         n = dst_get_neighbour(&rt->dst);
2455         if (n) {
2456                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2457                         rcu_read_unlock();
2458                         goto nla_put_failure;
2459                 }
2460         }
2461         rcu_read_unlock();
2462
2463         if (rt->dst.dev)
2464                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2465
2466         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2467
2468         if (!(rt->rt6i_flags & RTF_EXPIRES))
2469                 expires = 0;
2470         else if (rt->rt6i_expires - jiffies < INT_MAX)
2471                 expires = rt->rt6i_expires - jiffies;
2472         else
2473                 expires = INT_MAX;
2474
2475         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2476                                expires, rt->dst.error) < 0)
2477                 goto nla_put_failure;
2478
2479         return nlmsg_end(skb, nlh);
2480
2481 nla_put_failure:
2482         nlmsg_cancel(skb, nlh);
2483         return -EMSGSIZE;
2484 }
2485
2486 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2487 {
2488         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2489         int prefix;
2490
2491         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2492                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2493                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2494         } else
2495                 prefix = 0;
2496
2497         return rt6_fill_node(arg->net,
2498                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2499                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2500                      prefix, 0, NLM_F_MULTI);
2501 }
2502
2503 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2504 {
2505         struct net *net = sock_net(in_skb->sk);
2506         struct nlattr *tb[RTA_MAX+1];
2507         struct rt6_info *rt;
2508         struct sk_buff *skb;
2509         struct rtmsg *rtm;
2510         struct flowi6 fl6;
2511         int err, iif = 0;
2512
2513         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2514         if (err < 0)
2515                 goto errout;
2516
2517         err = -EINVAL;
2518         memset(&fl6, 0, sizeof(fl6));
2519
2520         if (tb[RTA_SRC]) {
2521                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2522                         goto errout;
2523
2524                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2525         }
2526
2527         if (tb[RTA_DST]) {
2528                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2529                         goto errout;
2530
2531                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2532         }
2533
2534         if (tb[RTA_IIF])
2535                 iif = nla_get_u32(tb[RTA_IIF]);
2536
2537         if (tb[RTA_OIF])
2538                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2539
2540         if (iif) {
2541                 struct net_device *dev;
2542                 dev = __dev_get_by_index(net, iif);
2543                 if (!dev) {
2544                         err = -ENODEV;
2545                         goto errout;
2546                 }
2547         }
2548
2549         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2550         if (skb == NULL) {
2551                 err = -ENOBUFS;
2552                 goto errout;
2553         }
2554
2555         /* Reserve room for dummy headers, this skb can pass
2556            through good chunk of routing engine.
2557          */
2558         skb_reset_mac_header(skb);
2559         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2560
2561         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2562         skb_dst_set(skb, &rt->dst);
2563
2564         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2565                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2566                             nlh->nlmsg_seq, 0, 0, 0);
2567         if (err < 0) {
2568                 kfree_skb(skb);
2569                 goto errout;
2570         }
2571
2572         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2573 errout:
2574         return err;
2575 }
2576
2577 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2578 {
2579         struct sk_buff *skb;
2580         struct net *net = info->nl_net;
2581         u32 seq;
2582         int err;
2583
2584         err = -ENOBUFS;
2585         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2586
2587         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2588         if (skb == NULL)
2589                 goto errout;
2590
2591         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2592                                 event, info->pid, seq, 0, 0, 0);
2593         if (err < 0) {
2594                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2595                 WARN_ON(err == -EMSGSIZE);
2596                 kfree_skb(skb);
2597                 goto errout;
2598         }
2599         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2600                     info->nlh, gfp_any());
2601         return;
2602 errout:
2603         if (err < 0)
2604                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2605 }
2606
2607 static int ip6_route_dev_notify(struct notifier_block *this,
2608                                 unsigned long event, void *data)
2609 {
2610         struct net_device *dev = (struct net_device *)data;
2611         struct net *net = dev_net(dev);
2612
2613         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2614                 net->ipv6.ip6_null_entry->dst.dev = dev;
2615                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2616 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2617                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2618                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2619                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2620                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2621 #endif
2622         }
2623
2624         return NOTIFY_OK;
2625 }
2626
2627 /*
2628  *      /proc
2629  */
2630
2631 #ifdef CONFIG_PROC_FS
2632
/*
 * Bookkeeping for a buffer-based /proc dump.
 * NOTE(review): no user of this struct is visible in this part of the
 * file (the /proc handlers below use seq_file) -- confirm whether it is
 * still referenced elsewhere before relying on the field meanings.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2641
2642 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2643 {
2644         struct seq_file *m = p_arg;
2645         struct neighbour *n;
2646
2647         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2648
2649 #ifdef CONFIG_IPV6_SUBTREES
2650         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2651 #else
2652         seq_puts(m, "00000000000000000000000000000000 00 ");
2653 #endif
2654         rcu_read_lock();
2655         n = dst_get_neighbour(&rt->dst);
2656         if (n) {
2657                 seq_printf(m, "%pi6", n->primary_key);
2658         } else {
2659                 seq_puts(m, "00000000000000000000000000000000");
2660         }
2661         rcu_read_unlock();
2662         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2663                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2664                    rt->dst.__use, rt->rt6i_flags,
2665                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2666         return 0;
2667 }
2668
2669 static int ipv6_route_show(struct seq_file *m, void *v)
2670 {
2671         struct net *net = (struct net *)m->private;
2672         fib6_clean_all(net, rt6_info_route, 0, m);
2673         return 0;
2674 }
2675
2676 static int ipv6_route_open(struct inode *inode, struct file *file)
2677 {
2678         return single_open_net(inode, file, ipv6_route_show);
2679 }
2680
2681 static const struct file_operations ipv6_route_proc_fops = {
2682         .owner          = THIS_MODULE,
2683         .open           = ipv6_route_open,
2684         .read           = seq_read,
2685         .llseek         = seq_lseek,
2686         .release        = single_release_net,
2687 };
2688
2689 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2690 {
2691         struct net *net = (struct net *)seq->private;
2692         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2693                    net->ipv6.rt6_stats->fib_nodes,
2694                    net->ipv6.rt6_stats->fib_route_nodes,
2695                    net->ipv6.rt6_stats->fib_rt_alloc,
2696                    net->ipv6.rt6_stats->fib_rt_entries,
2697                    net->ipv6.rt6_stats->fib_rt_cache,
2698                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2699                    net->ipv6.rt6_stats->fib_discarded_routes);
2700
2701         return 0;
2702 }
2703
2704 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2705 {
2706         return single_open_net(inode, file, rt6_stats_seq_show);
2707 }
2708
2709 static const struct file_operations rt6_stats_seq_fops = {
2710         .owner   = THIS_MODULE,
2711         .open    = rt6_stats_seq_open,
2712         .read    = seq_read,
2713         .llseek  = seq_lseek,
2714         .release = single_release_net,
2715 };
2716 #endif  /* CONFIG_PROC_FS */
2717
2718 #ifdef CONFIG_SYSCTL
2719
2720 static
2721 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2722                               void __user *buffer, size_t *lenp, loff_t *ppos)
2723 {
2724         struct net *net;
2725         int delay;
2726         if (!write)
2727                 return -EINVAL;
2728
2729         net = (struct net *)ctl->extra1;
2730         delay = net->ipv6.sysctl.flush_delay;
2731         proc_dointvec(ctl, write, buffer, lenp, ppos);
2732         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2733         return 0;
2734 }
2735
2736 ctl_table ipv6_route_table_template[] = {
2737         {
2738                 .procname       =       "flush",
2739                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2740                 .maxlen         =       sizeof(int),
2741                 .mode           =       0200,
2742                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2743         },
2744         {
2745                 .procname       =       "gc_thresh",
2746                 .data           =       &ip6_dst_ops_template.gc_thresh,
2747                 .maxlen         =       sizeof(int),
2748                 .mode           =       0644,
2749                 .proc_handler   =       proc_dointvec,
2750         },
2751         {
2752                 .procname       =       "max_size",
2753                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2754                 .maxlen         =       sizeof(int),
2755                 .mode           =       0644,
2756                 .proc_handler   =       proc_dointvec,
2757         },
2758         {
2759                 .procname       =       "gc_min_interval",
2760                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2761                 .maxlen         =       sizeof(int),
2762                 .mode           =       0644,
2763                 .proc_handler   =       proc_dointvec_jiffies,
2764         },
2765         {
2766                 .procname       =       "gc_timeout",
2767                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2768                 .maxlen         =       sizeof(int),
2769                 .mode           =       0644,
2770                 .proc_handler   =       proc_dointvec_jiffies,
2771         },
2772         {
2773                 .procname       =       "gc_interval",
2774                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2775                 .maxlen         =       sizeof(int),
2776                 .mode           =       0644,
2777                 .proc_handler   =       proc_dointvec_jiffies,
2778         },
2779         {
2780                 .procname       =       "gc_elasticity",
2781                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2782                 .maxlen         =       sizeof(int),
2783                 .mode           =       0644,
2784                 .proc_handler   =       proc_dointvec,
2785         },
2786         {
2787                 .procname       =       "mtu_expires",
2788                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2789                 .maxlen         =       sizeof(int),
2790                 .mode           =       0644,
2791                 .proc_handler   =       proc_dointvec_jiffies,
2792         },
2793         {
2794                 .procname       =       "min_adv_mss",
2795                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2796                 .maxlen         =       sizeof(int),
2797                 .mode           =       0644,
2798                 .proc_handler   =       proc_dointvec,
2799         },
2800         {
2801                 .procname       =       "gc_min_interval_ms",
2802                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2803                 .maxlen         =       sizeof(int),
2804                 .mode           =       0644,
2805                 .proc_handler   =       proc_dointvec_ms_jiffies,
2806         },
2807         { }
2808 };
2809
2810 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2811 {
2812         struct ctl_table *table;
2813
2814         table = kmemdup(ipv6_route_table_template,
2815                         sizeof(ipv6_route_table_template),
2816                         GFP_KERNEL);
2817
2818         if (table) {
2819                 table[0].data = &net->ipv6.sysctl.flush_delay;
2820                 table[0].extra1 = net;
2821                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2822                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2823                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2824                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2825                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2826                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2827                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2828                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2829                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2830         }
2831
2832         return table;
2833 }
2834 #endif
2835
2836 static int __net_init ip6_route_net_init(struct net *net)
2837 {
2838         int ret = -ENOMEM;
2839
2840         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2841                sizeof(net->ipv6.ip6_dst_ops));
2842
2843         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2844                 goto out_ip6_dst_ops;
2845
2846         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2847                                            sizeof(*net->ipv6.ip6_null_entry),
2848                                            GFP_KERNEL);
2849         if (!net->ipv6.ip6_null_entry)
2850                 goto out_ip6_dst_entries;
2851         net->ipv6.ip6_null_entry->dst.path =
2852                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2853         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2854         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2855                          ip6_template_metrics, true);
2856
2857 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2858         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2859                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2860                                                GFP_KERNEL);
2861         if (!net->ipv6.ip6_prohibit_entry)
2862                 goto out_ip6_null_entry;
2863         net->ipv6.ip6_prohibit_entry->dst.path =
2864                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2865         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2866         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2867                          ip6_template_metrics, true);
2868
2869         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2870                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2871                                                GFP_KERNEL);
2872         if (!net->ipv6.ip6_blk_hole_entry)
2873                 goto out_ip6_prohibit_entry;
2874         net->ipv6.ip6_blk_hole_entry->dst.path =
2875                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2876         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2877         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2878                          ip6_template_metrics, true);
2879 #endif
2880
2881         net->ipv6.sysctl.flush_delay = 0;
2882         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2883         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2884         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2885         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2886         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2887         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2888         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2889
2890         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2891
2892         ret = 0;
2893 out:
2894         return ret;
2895
2896 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2897 out_ip6_prohibit_entry:
2898         kfree(net->ipv6.ip6_prohibit_entry);
2899 out_ip6_null_entry:
2900         kfree(net->ipv6.ip6_null_entry);
2901 #endif
2902 out_ip6_dst_entries:
2903         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2904 out_ip6_dst_ops:
2905         goto out;
2906 }
2907
2908 static void __net_exit ip6_route_net_exit(struct net *net)
2909 {
2910         kfree(net->ipv6.ip6_null_entry);
2911 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2912         kfree(net->ipv6.ip6_prohibit_entry);
2913         kfree(net->ipv6.ip6_blk_hole_entry);
2914 #endif
2915         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2916 }
2917
2918 static int __net_init ip6_route_net_init_late(struct net *net)
2919 {
2920 #ifdef CONFIG_PROC_FS
2921         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2922         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2923 #endif
2924         return 0;
2925 }
2926
2927 static void __net_exit ip6_route_net_exit_late(struct net *net)
2928 {
2929 #ifdef CONFIG_PROC_FS
2930         proc_net_remove(net, "ipv6_route");
2931         proc_net_remove(net, "rt6_stats");
2932 #endif
2933 }
2934
2935 static struct pernet_operations ip6_route_net_ops = {
2936         .init = ip6_route_net_init,
2937         .exit = ip6_route_net_exit,
2938 };
2939
2940 static struct pernet_operations ip6_route_net_late_ops = {
2941         .init = ip6_route_net_init_late,
2942         .exit = ip6_route_net_exit_late,
2943 };
2944
2945 static struct notifier_block ip6_route_dev_notifier = {
2946         .notifier_call = ip6_route_dev_notify,
2947         .priority = 0,
2948 };
2949
2950 int __init ip6_route_init(void)
2951 {
2952         int ret;
2953
2954         ret = -ENOMEM;
2955         ip6_dst_ops_template.kmem_cachep =
2956                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2957                                   SLAB_HWCACHE_ALIGN, NULL);
2958         if (!ip6_dst_ops_template.kmem_cachep)
2959                 goto out;
2960
2961         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2962         if (ret)
2963                 goto out_kmem_cache;
2964
2965         ret = register_pernet_subsys(&ip6_route_net_ops);
2966         if (ret)
2967                 goto out_dst_entries;
2968
2969         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2970
2971         /* Registering of the loopback is done before this portion of code,
2972          * the loopback reference in rt6_info will not be taken, do it
2973          * manually for init_net */
2974         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2975         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2977         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2978         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2979         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2980         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2981   #endif
2982         ret = fib6_init();
2983         if (ret)
2984                 goto out_register_subsys;
2985
2986         ret = xfrm6_init();
2987         if (ret)
2988                 goto out_fib6_init;
2989
2990         ret = fib6_rules_init();
2991         if (ret)
2992                 goto xfrm6_init;
2993
2994         ret = register_pernet_subsys(&ip6_route_net_late_ops);
2995         if (ret)
2996                 goto fib6_rules_init;
2997
2998         ret = -ENOBUFS;
2999         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3000             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3001             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3002                 goto out_register_late_subsys;
3003
3004         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3005         if (ret)
3006                 goto out_register_late_subsys;
3007
3008 out:
3009         return ret;
3010
3011 out_register_late_subsys:
3012         unregister_pernet_subsys(&ip6_route_net_late_ops);
3013 fib6_rules_init:
3014         fib6_rules_cleanup();
3015 xfrm6_init:
3016         xfrm6_fini();
3017 out_fib6_init:
3018         fib6_gc_cleanup();
3019 out_register_subsys:
3020         unregister_pernet_subsys(&ip6_route_net_ops);
3021 out_dst_entries:
3022         dst_entries_destroy(&ip6_dst_blackhole_ops);
3023 out_kmem_cache:
3024         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3025         goto out;
3026 }
3027
3028 void ip6_route_cleanup(void)
3029 {
3030         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3031         unregister_pernet_subsys(&ip6_route_net_late_ops);
3032         fib6_rules_cleanup();
3033         xfrm6_fini();
3034         fib6_gc_cleanup();
3035         unregister_pernet_subsys(&ip6_route_net_ops);
3036         dst_entries_destroy(&ip6_dst_blackhole_ops);
3037         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3038 }