ipv6: restrict neighbor entry creation to output flow
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Compile-time debug tracing switches for this file.
 *
 * RT6_DEBUG selects the level.  At level 3 or above RDBG() and RT6_TRACE()
 * expand to printk() calls; below that both expand to nothing, so their
 * arguments are never evaluated.
 */
65 /* Set to 3 to get tracing. */
66 #define RT6_DEBUG 2
67
68 #if RT6_DEBUG >= 3
/* RDBG pastes its argument straight after printk, so callers must pass a
 * fully parenthesised argument list: RDBG(("fmt", args)). */
69 #define RDBG(x) printk x
/* RT6_TRACE takes a normal printf-style argument list, logged at KERN_DEBUG. */
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #else
72 #define RDBG(x)
/* Empty do/while keeps RT6_TRACE(...) usable as a single statement. */
73 #define RT6_TRACE(x...) do { ; } while (0)
74 #endif
75
/*
 * Forward declarations for file-local helpers that are referenced before
 * their definitions (several are used as callbacks in the dst_ops tables
 * and rt6_info templates that follow).
 */
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
/* Packet handlers installed as .input/.output of the reject-route template,
 * plus the link-failure and PMTU callbacks used in ip6_dst_ops_template. */
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
/* Route Information helpers; only built with CONFIG_IPV6_ROUTE_INFO and
 * used by rt6_route_rcv(). */
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 0,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Handlers used by the "prohibit" template; defined later in this file. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route whose dst reports
 * -EACCES and hands packets to the ip6_pkt_prohibit*() handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
};

/*
 * Template for the "blackhole" route entry: a reject route whose dst
 * reports -EINVAL and uses dst_discard() for both input and output.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard,
		.error		= -EINVAL,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the neighbour bound to @rt is not in a
 * NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation for it.
 *
 * @rt may be NULL (find_match() passes the previous best match, which can
 * still be unset); that case is a no-op.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute; neigh->updated is stamped before sending so that later calls are
 * throttled by the interval check.
 *
 * Locking: neigh->updated is written here, so the neighbour lock is taken
 * for writing.  Holding only the read side would let several CPUs pass the
 * interval check and update the timestamp concurrently.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	/* Cheap unlocked pre-check; re-evaluated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		/* Drop the lock before transmitting. */
		write_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received on @dev from gateway @gwaddr.
 *
 * @opt points at the option and @len is the number of bytes available.
 * The option is rejected with -EINVAL when it is shorter than struct
 * route_info, when its length field exceeds 3 or is too small for the
 * stated prefix length, when the prefix length exceeds 128, or when the
 * route preference is ICMPV6_ROUTER_PREF_INVALID.
 *
 * A zero lifetime deletes an existing matching route.  A non-zero lifetime
 * adds the route if it is missing; an already-present route gets its
 * preference flags updated instead.  In both cases the expiry is then set
 * from the lifetime, or RTF_EXPIRES is cleared if the lifetime is not
 * finite.  Returns 0 once the option has been accepted.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr masked_prefix, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}

	dst_release(&rt->dst);
	return 0;
}
#endif
622
/*
 * BACKTRACK(__net, saddr) - climb the FIB tree when the lookup produced the
 * null entry.
 *
 * This macro is not self-contained.  It reads the caller's local `rt`,
 * reads and reassigns the caller's local `fn`, and jumps to the caller's
 * `out` and `restart` labels.
 *
 * If `rt` is not __net's ip6_null_entry the macro does nothing.  Otherwise
 * it walks upwards from `fn`:
 *   - reaching a node flagged RTN_TL_ROOT ends the search (goto out);
 *   - if the parent has a subtree that is not the current node, the search
 *     continues with a lookup of `saddr` inside that subtree, else with the
 *     parent itself;
 *   - as soon as the new node carries RTN_RTINFO the caller retries the
 *     selection there (goto restart).
 */
623 #define BACKTRACK(__net, saddr)                 \
624 do { \
625         if (rt == __net->ipv6.ip6_null_entry) { \
626                 struct fib6_node *pn; \
627                 while (1) { \
628                         if (fn->fn_flags & RTN_TL_ROOT) \
629                                 goto out; \
630                         pn = fn->parent; \
631                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
632                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
633                         else \
634                                 fn = pn; \
635                         if (fn->fn_flags & RTN_RTINFO) \
636                                 goto restart; \
637                 } \
638         } \
639 } while(0)
640
/*
 * Plain FIB lookup for one table; passed to fib6_rule_lookup() by
 * rt6_lookup().
 *
 * Finds the node for the flow's destination/source under the table's read
 * lock, picks an entry with rt6_device_match(), and lets BACKTRACK() climb
 * to a less specific node when only the null entry was found.  The chosen
 * route is passed to dst_use() before the lock is dropped and is then
 * returned.  NOTE(review): dst_use() presumably takes the reference that
 * callers later drop with dst_release() - confirm in net/dst.h.
 */
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642                                              struct fib6_table *table,
643                                              struct flowi6 *fl6, int flags)
644 {
645         struct fib6_node *fn;
646         struct rt6_info *rt;
647
648         read_lock_bh(&table->tb6_lock);
649         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
/* BACKTRACK() jumps back here after moving fn to another node. */
650 restart:
651         rt = fn->leaf;
652         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653         BACKTRACK(net, &fl6->saddr);
/* Also reached from BACKTRACK() when the top-level root is hit. */
654 out:
655         dst_use(&rt->dst, jiffies);
656         read_unlock_bh(&table->tb6_lock);
657         return rt;
658
659 }
660
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662                             const struct in6_addr *saddr, int oif, int strict)
663 {
664         struct flowi6 fl6 = {
665                 .flowi6_oif = oif,
666                 .daddr = *daddr,
667         };
668         struct dst_entry *dst;
669         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
670
671         if (saddr) {
672                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673                 flags |= RT6_LOOKUP_F_HAS_SADDR;
674         }
675
676         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
677         if (dst->error == 0)
678                 return (struct rt6_info *) dst;
679
680         dst_release(dst);
681
682         return NULL;
683 }
684
685 EXPORT_SYMBOL(rt6_lookup);
686
687 /* ip6_ins_rt is called with FREE table->tb6_lock.
688    It takes new route entry, the addition fails by any reason the
689    route is freed. In any case, if caller does not hold it, it may
690    be destroyed.
691  */
692
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
694 {
695         int err;
696         struct fib6_table *table;
697
698         table = rt->rt6i_table;
699         write_lock_bh(&table->tb6_lock);
700         err = fib6_add(&table->tb6_root, rt, info);
701         write_unlock_bh(&table->tb6_lock);
702
703         return err;
704 }
705
706 int ip6_ins_rt(struct rt6_info *rt)
707 {
708         struct nl_info info = {
709                 .nl_net = dev_net(rt->rt6i_dev),
710         };
711         return __ip6_ins_rt(rt, &info);
712 }
713
/*
 * Create a cached (RTF_CACHE) copy of @ort for destination @daddr and bind
 * a neighbour entry to it.
 *
 * For non-gateway routes the destination itself becomes the next hop
 * (rt6i_gateway = @daddr); if @ort is not a /128 and @daddr equals the
 * route's prefix address, the copy is additionally flagged RTF_ANYCAST.
 * With CONFIG_IPV6_SUBTREES, a source-specific route has its source narrowed
 * to @saddr/128 when @saddr is given.
 *
 * If the neighbour lookup fails, one forced run of the route garbage
 * collector is tried (only when not in softirq context) before giving up.
 *
 * Returns the new route, or NULL if the copy could not be allocated or no
 * neighbour entry could be obtained (the copy is freed in that case).
 */
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715                                       const struct in6_addr *daddr,
716                                       const struct in6_addr *saddr)
717 {
718         struct rt6_info *rt;
719
720         /*
721          *      Clone the route.
722          */
723
724         rt = ip6_rt_copy(ort, daddr);
725
726         if (rt) {
727                 struct neighbour *neigh;
/* 1 outside softirq, 0 inside: number of GC-and-retry rounds allowed. */
728                 int attempts = !in_softirq();
729
730                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
731                         if (ort->rt6i_dst.plen != 128 &&
732                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733                                 rt->rt6i_flags |= RTF_ANYCAST;
734                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
735                 }
736
737                 rt->rt6i_flags |= RTF_CACHE;
738
739 #ifdef CONFIG_IPV6_SUBTREES
740                 if (rt->rt6i_src.plen && saddr) {
741                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
742                         rt->rt6i_src.plen = 128;
743                 }
744 #endif
745
746         retry:
747                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
748                 if (IS_ERR(neigh)) {
749                         struct net *net = dev_net(rt->rt6i_dev);
750                         int saved_rt_min_interval =
751                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752                         int saved_rt_elasticity =
753                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
754
755                         if (attempts-- > 0) {
/* Temporarily override the GC sysctls (elasticity 1, no minimum interval),
 * run the collector once, then restore the saved values and retry. */
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
758
759                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
760
761                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
762                                         saved_rt_elasticity;
763                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764                                         saved_rt_min_interval;
765                                 goto retry;
766                         }
767
/* Out of retries: warn (rate-limited), free the copy and fail. */
768                         if (net_ratelimit())
769                                 printk(KERN_WARNING
770                                        "ipv6: Neighbour table overflow.\n");
771                         dst_free(&rt->dst);
772                         return NULL;
773                 }
774                 dst_set_neighbour(&rt->dst, neigh);
775
776         }
777
778         return rt;
779 }
780
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782                                         const struct in6_addr *daddr)
783 {
784         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
785
786         if (rt) {
787                 rt->rt6i_flags |= RTF_CACHE;
788                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
789         }
790         return rt;
791 }
792
/*
 * Core policy-routing lookup shared by the input and output paths.
 *
 * Selects a route for @fl6 in @table with rt6_select().  Unless forwarding
 * is enabled in devconf_all, the first pass requires a reachable next hop
 * (RT6_LOOKUP_F_REACHABLE); if that pass ends at the null entry or at a
 * cached route, the lookup is repeated once without that requirement.
 *
 * When the selected route is neither the null entry nor already a cache
 * entry, a per-destination entry is created and inserted:
 *   - rt6_alloc_cow() when the route has no neighbour bound and carries
 *     none of the flags in `local` (RTF_NONEXTHOP, plus RTF_LOCAL on the
 *     input path, so input lookups do not create neighbours for local
 *     routes);
 *   - rt6_alloc_clone() when the route is not a DST_HOST route;
 *   - otherwise the route is used as is.
 * A failed insert is retried from the top, with `attempts` bounding the
 * number of rounds.
 *
 * The returned route has a reference held for the caller; its lastuse and
 * __use counters are updated.
 */
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794                                       struct flowi6 *fl6, int flags, bool input)
795 {
796         struct fib6_node *fn;
797         struct rt6_info *rt, *nrt;
798         int strict = 0;
799         int attempts = 3;
800         int err;
801         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
802         int local = RTF_NONEXTHOP;
803
804         strict |= flags & RT6_LOOKUP_F_IFACE;
805         if (input)
806                 local |= RTF_LOCAL;
807
808 relookup:
809         read_lock_bh(&table->tb6_lock);
810
811 restart_2:
812         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
813
814 restart:
815         rt = rt6_select(fn, oif, strict | reachable);
816
817         BACKTRACK(net, &fl6->saddr);
818         if (rt == net->ipv6.ip6_null_entry ||
819             rt->rt6i_flags & RTF_CACHE)
820                 goto out;
821
/* Pin the route, then drop the table lock before allocating the copy. */
822         dst_hold(&rt->dst);
823         read_unlock_bh(&table->tb6_lock);
824
825         if (!dst_get_neighbour_raw(&rt->dst)
826             && !(rt->rt6i_flags & local))
827                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
828         else if (!(rt->dst.flags & DST_HOST))
829                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
830         else
831                 goto out2;
832
/* Swap the reference from the original route to the copy (or to the null
 * entry if the allocation failed). */
833         dst_release(&rt->dst);
834         rt = nrt ? : net->ipv6.ip6_null_entry;
835
836         dst_hold(&rt->dst);
837         if (nrt) {
838                 err = ip6_ins_rt(nrt);
839                 if (!err)
840                         goto out2;
841         }
842
843         if (--attempts <= 0)
844                 goto out2;
845
846         /*
847          * Race condition! In the gap, when table->tb6_lock was
848          * released someone could insert this route.  Relookup.
849          */
850         dst_release(&rt->dst);
851         goto relookup;
852
853 out:
/* First pass demanded reachability and found nothing new to clone: retry
 * once with that requirement dropped (table lock is still held). */
854         if (reachable) {
855                 reachable = 0;
856                 goto restart_2;
857         }
858         dst_hold(&rt->dst);
859         read_unlock_bh(&table->tb6_lock);
860 out2:
861         rt->dst.lastuse = jiffies;
862         rt->dst.__use++;
863
864         return rt;
865 }
866
867 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
868                                             struct flowi6 *fl6, int flags)
869 {
870         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
871 }
872
873 void ip6_route_input(struct sk_buff *skb)
874 {
875         const struct ipv6hdr *iph = ipv6_hdr(skb);
876         struct net *net = dev_net(skb->dev);
877         int flags = RT6_LOOKUP_F_HAS_SADDR;
878         struct flowi6 fl6 = {
879                 .flowi6_iif = skb->dev->ifindex,
880                 .daddr = iph->daddr,
881                 .saddr = iph->saddr,
882                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
883                 .flowi6_mark = skb->mark,
884                 .flowi6_proto = iph->nexthdr,
885         };
886
887         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
888                 flags |= RT6_LOOKUP_F_IFACE;
889
890         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
891 }
892
893 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
894                                              struct flowi6 *fl6, int flags)
895 {
896         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
897 }
898
899 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
900                                     struct flowi6 *fl6)
901 {
902         int flags = 0;
903
904         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
905                 flags |= RT6_LOOKUP_F_IFACE;
906
907         if (!ipv6_addr_any(&fl6->saddr))
908                 flags |= RT6_LOOKUP_F_HAS_SADDR;
909         else if (sk)
910                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
911
912         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
913 }
914
915 EXPORT_SYMBOL(ip6_route_output);
916
917 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
918 {
919         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
920         struct dst_entry *new = NULL;
921
922         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
923         if (rt) {
924                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
925
926                 new = &rt->dst;
927
928                 new->__use = 1;
929                 new->input = dst_discard;
930                 new->output = dst_discard;
931
932                 if (dst_metrics_read_only(&ort->dst))
933                         new->_metrics = ort->dst._metrics;
934                 else
935                         dst_copy_metrics(new, &ort->dst);
936                 rt->rt6i_idev = ort->rt6i_idev;
937                 if (rt->rt6i_idev)
938                         in6_dev_hold(rt->rt6i_idev);
939                 rt->rt6i_expires = 0;
940
941                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
942                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
943                 rt->rt6i_metric = 0;
944
945                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
946 #ifdef CONFIG_IPV6_SUBTREES
947                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
948 #endif
949
950                 dst_free(new);
951         }
952
953         dst_release(dst_orig);
954         return new ? new : ERR_PTR(-ENOMEM);
955 }
956
957 /*
958  *      Destination cache support functions
959  */
960
961 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
962 {
963         struct rt6_info *rt;
964
965         rt = (struct rt6_info *) dst;
966
967         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
968                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
969                         if (!rt->rt6i_peer)
970                                 rt6_bind_peer(rt, 0);
971                         rt->rt6i_peer_genid = rt6_peer_genid();
972                 }
973                 return dst;
974         }
975         return NULL;
976 }
977
978 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
979 {
980         struct rt6_info *rt = (struct rt6_info *) dst;
981
982         if (rt) {
983                 if (rt->rt6i_flags & RTF_CACHE) {
984                         if (rt6_check_expired(rt)) {
985                                 ip6_del_rt(rt);
986                                 dst = NULL;
987                         }
988                 } else {
989                         dst_release(dst);
990                         dst = NULL;
991                 }
992         }
993         return dst;
994 }
995
996 static void ip6_link_failure(struct sk_buff *skb)
997 {
998         struct rt6_info *rt;
999
1000         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1001
1002         rt = (struct rt6_info *) skb_dst(skb);
1003         if (rt) {
1004                 if (rt->rt6i_flags&RTF_CACHE) {
1005                         dst_set_expires(&rt->dst, 0);
1006                         rt->rt6i_flags |= RTF_EXPIRES;
1007                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1008                         rt->rt6i_node->fn_sernum = -1;
1009         }
1010 }
1011
1012 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1013 {
1014         struct rt6_info *rt6 = (struct rt6_info*)dst;
1015
1016         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1017                 rt6->rt6i_flags |= RTF_MODIFIED;
1018                 if (mtu < IPV6_MIN_MTU) {
1019                         u32 features = dst_metric(dst, RTAX_FEATURES);
1020                         mtu = IPV6_MIN_MTU;
1021                         features |= RTAX_FEATURE_ALLFRAG;
1022                         dst_metric_set(dst, RTAX_FEATURES, features);
1023                 }
1024                 dst_metric_set(dst, RTAX_MTU, mtu);
1025         }
1026 }
1027
1028 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1029 {
1030         struct net_device *dev = dst->dev;
1031         unsigned int mtu = dst_mtu(dst);
1032         struct net *net = dev_net(dev);
1033
1034         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1035
1036         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1037                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1038
1039         /*
1040          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1041          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1042          * IPV6_MAXPLEN is also valid and means: "any MSS,
1043          * rely only on pmtu discovery"
1044          */
1045         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1046                 mtu = IPV6_MAXPLEN;
1047         return mtu;
1048 }
1049
1050 static unsigned int ip6_mtu(const struct dst_entry *dst)
1051 {
1052         struct inet6_dev *idev;
1053         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1054
1055         if (mtu)
1056                 return mtu;
1057
1058         mtu = IPV6_MIN_MTU;
1059
1060         rcu_read_lock();
1061         idev = __in6_dev_get(dst->dev);
1062         if (idev)
1063                 mtu = idev->cnf.mtu6;
1064         rcu_read_unlock();
1065
1066         return mtu;
1067 }
1068
/*
 * Head of the list of dst entries created by icmp6_dst_alloc(), chained
 * through dst->next.  icmp6_dst_gc() and icmp6_clean_all() walk it and
 * unlink/free entries.
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Serialises every access to icmp6_dst_gc_list (taken with the _bh variants). */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1071
1072 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1073                                   struct neighbour *neigh,
1074                                   const struct in6_addr *addr)
1075 {
1076         struct rt6_info *rt;
1077         struct inet6_dev *idev = in6_dev_get(dev);
1078         struct net *net = dev_net(dev);
1079
1080         if (unlikely(idev == NULL))
1081                 return NULL;
1082
1083         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1084         if (unlikely(rt == NULL)) {
1085                 in6_dev_put(idev);
1086                 goto out;
1087         }
1088
1089         if (neigh)
1090                 neigh_hold(neigh);
1091         else {
1092                 neigh = ndisc_get_neigh(dev, addr);
1093                 if (IS_ERR(neigh))
1094                         neigh = NULL;
1095         }
1096
1097         rt->dst.flags |= DST_HOST;
1098         rt->dst.output  = ip6_output;
1099         dst_set_neighbour(&rt->dst, neigh);
1100         atomic_set(&rt->dst.__refcnt, 1);
1101         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1102         rt->rt6i_dst.plen = 128;
1103         rt->rt6i_idev     = idev;
1104         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1105
1106         spin_lock_bh(&icmp6_dst_lock);
1107         rt->dst.next = icmp6_dst_gc_list;
1108         icmp6_dst_gc_list = &rt->dst;
1109         spin_unlock_bh(&icmp6_dst_lock);
1110
1111         fib6_force_start_gc(net);
1112
1113 out:
1114         return &rt->dst;
1115 }
1116
1117 int icmp6_dst_gc(void)
1118 {
1119         struct dst_entry *dst, **pprev;
1120         int more = 0;
1121
1122         spin_lock_bh(&icmp6_dst_lock);
1123         pprev = &icmp6_dst_gc_list;
1124
1125         while ((dst = *pprev) != NULL) {
1126                 if (!atomic_read(&dst->__refcnt)) {
1127                         *pprev = dst->next;
1128                         dst_free(dst);
1129                 } else {
1130                         pprev = &dst->next;
1131                         ++more;
1132                 }
1133         }
1134
1135         spin_unlock_bh(&icmp6_dst_lock);
1136
1137         return more;
1138 }
1139
1140 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1141                             void *arg)
1142 {
1143         struct dst_entry *dst, **pprev;
1144
1145         spin_lock_bh(&icmp6_dst_lock);
1146         pprev = &icmp6_dst_gc_list;
1147         while ((dst = *pprev) != NULL) {
1148                 struct rt6_info *rt = (struct rt6_info *) dst;
1149                 if (func(rt, arg)) {
1150                         *pprev = dst->next;
1151                         dst_free(dst);
1152                 } else {
1153                         pprev = &dst->next;
1154                 }
1155         }
1156         spin_unlock_bh(&icmp6_dst_lock);
1157 }
1158
1159 static int ip6_dst_gc(struct dst_ops *ops)
1160 {
1161         unsigned long now = jiffies;
1162         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1163         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1164         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1165         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1166         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1167         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1168         int entries;
1169
1170         entries = dst_entries_get_fast(ops);
1171         if (time_after(rt_last_gc + rt_min_interval, now) &&
1172             entries <= rt_max_size)
1173                 goto out;
1174
1175         net->ipv6.ip6_rt_gc_expire++;
1176         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1177         net->ipv6.ip6_rt_last_gc = now;
1178         entries = dst_entries_get_slow(ops);
1179         if (entries < ops->gc_thresh)
1180                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1181 out:
1182         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1183         return entries > rt_max_size;
1184 }
1185
1186 /* Clean host part of a prefix. Not necessary in radix tree,
1187    but results in cleaner routing tables.
1188
1189    Remove it only when all the things will work!
1190  */
1191
1192 int ip6_dst_hoplimit(struct dst_entry *dst)
1193 {
1194         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1195         if (hoplimit == 0) {
1196                 struct net_device *dev = dst->dev;
1197                 struct inet6_dev *idev;
1198
1199                 rcu_read_lock();
1200                 idev = __in6_dev_get(dev);
1201                 if (idev)
1202                         hoplimit = idev->cnf.hop_limit;
1203                 else
1204                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1205                 rcu_read_unlock();
1206         }
1207         return hoplimit;
1208 }
1209 EXPORT_SYMBOL(ip6_dst_hoplimit);
1210
1211 /*
1212  *
1213  */
1214
/*
 * Build a route from the configuration in @cfg and insert it into the
 * FIB table selected by cfg->fc_table.
 *
 * @cfg is modified: a zero fc_metric becomes IP6_RT_PRIO_USER, an
 * RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT, and fc_nlinfo.nl_net
 * is overwritten with the namespace of the chosen device just before
 * insertion.
 *
 * Reference handling: device and inet6_dev references taken along the
 * way are stored in the route on the success path (rt->dst.dev,
 * rt->rt6i_idev); every failure path funnels through "out", which drops
 * them and frees the half-built route.
 *
 * Returns the result of __ip6_ins_rt() when the route gets as far as
 * insertion, otherwise a negative errno: -EINVAL, -ENODEV, -ENOBUFS,
 * -ENOMEM, -EHOSTUNREACH, or the error from __neigh_lookup_errno().
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;

	/* prefix lengths cannot exceed the 128-bit address width */
	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* source prefixes are rejected when subtrees are not compiled in */
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	/* an explicit interface must exist and have IPv6 state */
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	table = fib6_new_table(net, cfg->fc_table);
	if (table == NULL) {
		err = -ENOBUFS;
		goto out;
	}

	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);

	if (rt == NULL) {
		err = -ENOMEM;
		goto out;
	}

	rt->dst.obsolete = -1;
	/* fc_expires is converted from clock_t units; 0 when RTF_EXPIRES is not set */
	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
				0;

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	/* pick the input handler: multicast, local delivery, or forwarding */
	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

	/* non-host routes with user-supplied metrics get their own zeroed metrics array */
	if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
		u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!metrics) {
			err = -ENOMEM;
			goto out;
		}
		dst_init_metrics(&rt->dst, metrics, 0);
	}
#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
					      && !(cfg->fc_flags&RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			/* swap any previously held device for the loopback device */
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		rt->dst.error = -ENETUNREACH;
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		/* reject routes skip gateway, prefsrc and neighbour setup */
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
			if (!(gwa_type&IPV6_ADDR_UNICAST))
				goto out;

			/* the gateway itself must be reachable through an existing route */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (grt == NULL)
				goto out;
			if (dev) {
				/* requested device must match the route to the gateway */
				if (dev != grt->rt6i_dev) {
					dst_release(&grt->dst);
					goto out;
				}
			} else {
				/* no device given: inherit dev/idev from the gateway's route */
				dev = grt->rt6i_dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/* accepted only if the route to the gateway is not itself gatewayed;
			   otherwise err stays -EHOSTUNREACH */
			if (!(grt->rt6i_flags&RTF_GATEWAY))
				err = 0;
			dst_release(&grt->dst);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (dev == NULL)
		goto out;

	/* a preferred source address must pass ipv6_chk_addr() for this device */
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	/* gatewayed and NONEXTHOP routes get a neighbour entry bound up front */
	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
		struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(n)) {
			err = PTR_ERR(n);
			goto out;
		}
		dst_set_neighbour(&rt->dst, n);
	}

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	/* apply user-supplied metrics; attribute type 0 is skipped */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX) {
					err = -EINVAL;
					goto out;
				}

				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
			}
		}
	}

	/* the route takes over the dev/idev references from here on */
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
	/* failure: drop whatever references were taken and free the route */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}
1435
1436 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1437 {
1438         int err;
1439         struct fib6_table *table;
1440         struct net *net = dev_net(rt->rt6i_dev);
1441
1442         if (rt == net->ipv6.ip6_null_entry) {
1443                 err = -ENOENT;
1444                 goto out;
1445         }
1446
1447         table = rt->rt6i_table;
1448         write_lock_bh(&table->tb6_lock);
1449         err = fib6_del(rt, info);
1450         write_unlock_bh(&table->tb6_lock);
1451
1452 out:
1453         dst_release(&rt->dst);
1454         return err;
1455 }
1456
1457 int ip6_del_rt(struct rt6_info *rt)
1458 {
1459         struct nl_info info = {
1460                 .nl_net = dev_net(rt->rt6i_dev),
1461         };
1462         return __ip6_del_rt(rt, &info);
1463 }
1464
1465 static int ip6_route_del(struct fib6_config *cfg)
1466 {
1467         struct fib6_table *table;
1468         struct fib6_node *fn;
1469         struct rt6_info *rt;
1470         int err = -ESRCH;
1471
1472         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1473         if (table == NULL)
1474                 return err;
1475
1476         read_lock_bh(&table->tb6_lock);
1477
1478         fn = fib6_locate(&table->tb6_root,
1479                          &cfg->fc_dst, cfg->fc_dst_len,
1480                          &cfg->fc_src, cfg->fc_src_len);
1481
1482         if (fn) {
1483                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1484                         if (cfg->fc_ifindex &&
1485                             (rt->rt6i_dev == NULL ||
1486                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1487                                 continue;
1488                         if (cfg->fc_flags & RTF_GATEWAY &&
1489                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1490                                 continue;
1491                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1492                                 continue;
1493                         dst_hold(&rt->dst);
1494                         read_unlock_bh(&table->tb6_lock);
1495
1496                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1497                 }
1498         }
1499         read_unlock_bh(&table->tb6_lock);
1500
1501         return err;
1502 }
1503
1504 /*
1505  *      Handle redirects
1506  */
/*
 * Lookup key used while validating a redirect: an ordinary flow plus
 * the gateway address the candidate route must have.
 *
 * fl6 must stay the first member: __ip6_route_redirect() receives a
 * struct flowi6 pointer and casts it back to struct ip6rd_flowi.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};
1511
1512 static struct rt6_info *__ip6_route_redirect(struct net *net,
1513                                              struct fib6_table *table,
1514                                              struct flowi6 *fl6,
1515                                              int flags)
1516 {
1517         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1518         struct rt6_info *rt;
1519         struct fib6_node *fn;
1520
1521         /*
1522          * Get the "current" route for this destination and
1523          * check if the redirect has come from approriate router.
1524          *
1525          * RFC 2461 specifies that redirects should only be
1526          * accepted if they come from the nexthop to the target.
1527          * Due to the way the routes are chosen, this notion
1528          * is a bit fuzzy and one might need to check all possible
1529          * routes.
1530          */
1531
1532         read_lock_bh(&table->tb6_lock);
1533         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1534 restart:
1535         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1536                 /*
1537                  * Current route is on-link; redirect is always invalid.
1538                  *
1539                  * Seems, previous statement is not true. It could
1540                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1541                  * But then router serving it might decide, that we should
1542                  * know truth 8)8) --ANK (980726).
1543                  */
1544                 if (rt6_check_expired(rt))
1545                         continue;
1546                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1547                         continue;
1548                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1549                         continue;
1550                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1551                         continue;
1552                 break;
1553         }
1554
1555         if (!rt)
1556                 rt = net->ipv6.ip6_null_entry;
1557         BACKTRACK(net, &fl6->saddr);
1558 out:
1559         dst_hold(&rt->dst);
1560
1561         read_unlock_bh(&table->tb6_lock);
1562
1563         return rt;
1564 };
1565
1566 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1567                                            const struct in6_addr *src,
1568                                            const struct in6_addr *gateway,
1569                                            struct net_device *dev)
1570 {
1571         int flags = RT6_LOOKUP_F_HAS_SADDR;
1572         struct net *net = dev_net(dev);
1573         struct ip6rd_flowi rdfl = {
1574                 .fl6 = {
1575                         .flowi6_oif = dev->ifindex,
1576                         .daddr = *dest,
1577                         .saddr = *src,
1578                 },
1579         };
1580
1581         ipv6_addr_copy(&rdfl.gateway, gateway);
1582
1583         if (rt6_need_strict(dest))
1584                 flags |= RT6_LOOKUP_F_IFACE;
1585
1586         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1587                                                    flags, __ip6_route_redirect);
1588 }
1589
1590 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1591                   const struct in6_addr *saddr,
1592                   struct neighbour *neigh, u8 *lladdr, int on_link)
1593 {
1594         struct rt6_info *rt, *nrt = NULL;
1595         struct netevent_redirect netevent;
1596         struct net *net = dev_net(neigh->dev);
1597
1598         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1599
1600         if (rt == net->ipv6.ip6_null_entry) {
1601                 if (net_ratelimit())
1602                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1603                                "for redirect target\n");
1604                 goto out;
1605         }
1606
1607         /*
1608          *      We have finally decided to accept it.
1609          */
1610
1611         neigh_update(neigh, lladdr, NUD_STALE,
1612                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1613                      NEIGH_UPDATE_F_OVERRIDE|
1614                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1615                                      NEIGH_UPDATE_F_ISROUTER))
1616                      );
1617
1618         /*
1619          * Redirect received -> path was valid.
1620          * Look, redirects are sent only in response to data packets,
1621          * so that this nexthop apparently is reachable. --ANK
1622          */
1623         dst_confirm(&rt->dst);
1624
1625         /* Duplicate redirect: silently ignore. */
1626         if (neigh == dst_get_neighbour_raw(&rt->dst))
1627                 goto out;
1628
1629         nrt = ip6_rt_copy(rt, dest);
1630         if (nrt == NULL)
1631                 goto out;
1632
1633         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1634         if (on_link)
1635                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1636
1637         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1638         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1639
1640         if (ip6_ins_rt(nrt))
1641                 goto out;
1642
1643         netevent.old = &rt->dst;
1644         netevent.new = &nrt->dst;
1645         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1646
1647         if (rt->rt6i_flags&RTF_CACHE) {
1648                 ip6_del_rt(rt);
1649                 return;
1650         }
1651
1652 out:
1653         dst_release(&rt->dst);
1654 }
1655
1656 /*
1657  *      Handle ICMP "packet too big" messages
1658  *      i.e. Path MTU discovery
1659  */
1660
1661 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1662                              struct net *net, u32 pmtu, int ifindex)
1663 {
1664         struct rt6_info *rt, *nrt;
1665         int allfrag = 0;
1666 again:
1667         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1668         if (rt == NULL)
1669                 return;
1670
1671         if (rt6_check_expired(rt)) {
1672                 ip6_del_rt(rt);
1673                 goto again;
1674         }
1675
1676         if (pmtu >= dst_mtu(&rt->dst))
1677                 goto out;
1678
1679         if (pmtu < IPV6_MIN_MTU) {
1680                 /*
1681                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1682                  * MTU (1280) and a fragment header should always be included
1683                  * after a node receiving Too Big message reporting PMTU is
1684                  * less than the IPv6 Minimum Link MTU.
1685                  */
1686                 pmtu = IPV6_MIN_MTU;
1687                 allfrag = 1;
1688         }
1689
1690         /* New mtu received -> path was valid.
1691            They are sent only in response to data packets,
1692            so that this nexthop apparently is reachable. --ANK
1693          */
1694         dst_confirm(&rt->dst);
1695
1696         /* Host route. If it is static, it would be better
1697            not to override it, but add new one, so that
1698            when cache entry will expire old pmtu
1699            would return automatically.
1700          */
1701         if (rt->rt6i_flags & RTF_CACHE) {
1702                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1703                 if (allfrag) {
1704                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1705                         features |= RTAX_FEATURE_ALLFRAG;
1706                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1707                 }
1708                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1709                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1710                 goto out;
1711         }
1712
1713         /* Network route.
1714            Two cases are possible:
1715            1. It is connected route. Action: COW
1716            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1717          */
1718         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1719                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1720         else
1721                 nrt = rt6_alloc_clone(rt, daddr);
1722
1723         if (nrt) {
1724                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1725                 if (allfrag) {
1726                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1727                         features |= RTAX_FEATURE_ALLFRAG;
1728                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1729                 }
1730
1731                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1732                  * happened within 5 mins, the recommended timer is 10 mins.
1733                  * Here this route expiration time is set to ip6_rt_mtu_expires
1734                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1735                  * and detecting PMTU increase will be automatically happened.
1736                  */
1737                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1738                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1739
1740                 ip6_ins_rt(nrt);
1741         }
1742 out:
1743         dst_release(&rt->dst);
1744 }
1745
1746 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1747                         struct net_device *dev, u32 pmtu)
1748 {
1749         struct net *net = dev_net(dev);
1750
1751         /*
1752          * RFC 1981 states that a node "MUST reduce the size of the packets it
1753          * is sending along the path" that caused the Packet Too Big message.
1754          * Since it's not possible in the general case to determine which
1755          * interface was used to send the original packet, we update the MTU
1756          * on the interface that will be used to send future packets. We also
1757          * update the MTU on the interface that received the Packet Too Big in
1758          * case the original packet was forced out that interface with
1759          * SO_BINDTODEVICE or similar. This is the next best thing to the
1760          * correct behaviour, which would be to update the MTU on all
1761          * interfaces.
1762          */
1763         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1764         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1765 }
1766
1767 /*
1768  *      Misc support functions
1769  */
1770
1771 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1772                                     const struct in6_addr *dest)
1773 {
1774         struct net *net = dev_net(ort->rt6i_dev);
1775         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1776                                             ort->dst.dev, 0);
1777
1778         if (rt) {
1779                 rt->dst.input = ort->dst.input;
1780                 rt->dst.output = ort->dst.output;
1781                 rt->dst.flags |= DST_HOST;
1782
1783                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1784                 rt->rt6i_dst.plen = 128;
1785                 dst_copy_metrics(&rt->dst, &ort->dst);
1786                 rt->dst.error = ort->dst.error;
1787                 rt->rt6i_idev = ort->rt6i_idev;
1788                 if (rt->rt6i_idev)
1789                         in6_dev_hold(rt->rt6i_idev);
1790                 rt->dst.lastuse = jiffies;
1791                 rt->rt6i_expires = 0;
1792
1793                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1794                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1795                 rt->rt6i_metric = 0;
1796
1797 #ifdef CONFIG_IPV6_SUBTREES
1798                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1799 #endif
1800                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1801                 rt->rt6i_table = ort->rt6i_table;
1802         }
1803         return rt;
1804 }
1805
1806 #ifdef CONFIG_IPV6_ROUTE_INFO
1807 static struct rt6_info *rt6_get_route_info(struct net *net,
1808                                            const struct in6_addr *prefix, int prefixlen,
1809                                            const struct in6_addr *gwaddr, int ifindex)
1810 {
1811         struct fib6_node *fn;
1812         struct rt6_info *rt = NULL;
1813         struct fib6_table *table;
1814
1815         table = fib6_get_table(net, RT6_TABLE_INFO);
1816         if (table == NULL)
1817                 return NULL;
1818
1819         write_lock_bh(&table->tb6_lock);
1820         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1821         if (!fn)
1822                 goto out;
1823
1824         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1825                 if (rt->rt6i_dev->ifindex != ifindex)
1826                         continue;
1827                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1828                         continue;
1829                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1830                         continue;
1831                 dst_hold(&rt->dst);
1832                 break;
1833         }
1834 out:
1835         write_unlock_bh(&table->tb6_lock);
1836         return rt;
1837 }
1838
1839 static struct rt6_info *rt6_add_route_info(struct net *net,
1840                                            const struct in6_addr *prefix, int prefixlen,
1841                                            const struct in6_addr *gwaddr, int ifindex,
1842                                            unsigned pref)
1843 {
1844         struct fib6_config cfg = {
1845                 .fc_table       = RT6_TABLE_INFO,
1846                 .fc_metric      = IP6_RT_PRIO_USER,
1847                 .fc_ifindex     = ifindex,
1848                 .fc_dst_len     = prefixlen,
1849                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1850                                   RTF_UP | RTF_PREF(pref),
1851                 .fc_nlinfo.pid = 0,
1852                 .fc_nlinfo.nlh = NULL,
1853                 .fc_nlinfo.nl_net = net,
1854         };
1855
1856         ipv6_addr_copy(&cfg.fc_dst, prefix);
1857         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1858
1859         /* We should treat it as a default route if prefix length is 0. */
1860         if (!prefixlen)
1861                 cfg.fc_flags |= RTF_DEFAULT;
1862
1863         ip6_route_add(&cfg);
1864
1865         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1866 }
1867 #endif
1868
1869 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1870 {
1871         struct rt6_info *rt;
1872         struct fib6_table *table;
1873
1874         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1875         if (table == NULL)
1876                 return NULL;
1877
1878         write_lock_bh(&table->tb6_lock);
1879         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1880                 if (dev == rt->rt6i_dev &&
1881                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1882                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1883                         break;
1884         }
1885         if (rt)
1886                 dst_hold(&rt->dst);
1887         write_unlock_bh(&table->tb6_lock);
1888         return rt;
1889 }
1890
1891 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1892                                      struct net_device *dev,
1893                                      unsigned int pref)
1894 {
1895         struct fib6_config cfg = {
1896                 .fc_table       = RT6_TABLE_DFLT,
1897                 .fc_metric      = IP6_RT_PRIO_USER,
1898                 .fc_ifindex     = dev->ifindex,
1899                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1900                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1901                 .fc_nlinfo.pid = 0,
1902                 .fc_nlinfo.nlh = NULL,
1903                 .fc_nlinfo.nl_net = dev_net(dev),
1904         };
1905
1906         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1907
1908         ip6_route_add(&cfg);
1909
1910         return rt6_get_dflt_router(gwaddr, dev);
1911 }
1912
1913 void rt6_purge_dflt_routers(struct net *net)
1914 {
1915         struct rt6_info *rt;
1916         struct fib6_table *table;
1917
1918         /* NOTE: Keep consistent with rt6_get_dflt_router */
1919         table = fib6_get_table(net, RT6_TABLE_DFLT);
1920         if (table == NULL)
1921                 return;
1922
1923 restart:
1924         read_lock_bh(&table->tb6_lock);
1925         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1926                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1927                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1928                         dst_hold(&rt->dst);
1929                         read_unlock_bh(&table->tb6_lock);
1930                         ip6_del_rt(rt);
1931                         goto restart;
1932                 }
1933         }
1934         read_unlock_bh(&table->tb6_lock);
1935 }
1936
1937 static void rtmsg_to_fib6_config(struct net *net,
1938                                  struct in6_rtmsg *rtmsg,
1939                                  struct fib6_config *cfg)
1940 {
1941         memset(cfg, 0, sizeof(*cfg));
1942
1943         cfg->fc_table = RT6_TABLE_MAIN;
1944         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1945         cfg->fc_metric = rtmsg->rtmsg_metric;
1946         cfg->fc_expires = rtmsg->rtmsg_info;
1947         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1948         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1949         cfg->fc_flags = rtmsg->rtmsg_flags;
1950
1951         cfg->fc_nlinfo.nl_net = net;
1952
1953         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1954         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1955         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1956 }
1957
1958 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1959 {
1960         struct fib6_config cfg;
1961         struct in6_rtmsg rtmsg;
1962         int err;
1963
1964         switch(cmd) {
1965         case SIOCADDRT:         /* Add a route */
1966         case SIOCDELRT:         /* Delete a route */
1967                 if (!capable(CAP_NET_ADMIN))
1968                         return -EPERM;
1969                 err = copy_from_user(&rtmsg, arg,
1970                                      sizeof(struct in6_rtmsg));
1971                 if (err)
1972                         return -EFAULT;
1973
1974                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1975
1976                 rtnl_lock();
1977                 switch (cmd) {
1978                 case SIOCADDRT:
1979                         err = ip6_route_add(&cfg);
1980                         break;
1981                 case SIOCDELRT:
1982                         err = ip6_route_del(&cfg);
1983                         break;
1984                 default:
1985                         err = -EINVAL;
1986                 }
1987                 rtnl_unlock();
1988
1989                 return err;
1990         }
1991
1992         return -EINVAL;
1993 }
1994
1995 /*
1996  *      Drop the packet on the floor
1997  */
1998
1999 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2000 {
2001         int type;
2002         struct dst_entry *dst = skb_dst(skb);
2003         switch (ipstats_mib_noroutes) {
2004         case IPSTATS_MIB_INNOROUTES:
2005                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2006                 if (type == IPV6_ADDR_ANY) {
2007                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2008                                       IPSTATS_MIB_INADDRERRORS);
2009                         break;
2010                 }
2011                 /* FALLTHROUGH */
2012         case IPSTATS_MIB_OUTNOROUTES:
2013                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2014                               ipstats_mib_noroutes);
2015                 break;
2016         }
2017         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2018         kfree_skb(skb);
2019         return 0;
2020 }
2021
2022 static int ip6_pkt_discard(struct sk_buff *skb)
2023 {
2024         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2025 }
2026
2027 static int ip6_pkt_discard_out(struct sk_buff *skb)
2028 {
2029         skb->dev = skb_dst(skb)->dev;
2030         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2031 }
2032
2033 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2034
2035 static int ip6_pkt_prohibit(struct sk_buff *skb)
2036 {
2037         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2038 }
2039
2040 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2041 {
2042         skb->dev = skb_dst(skb)->dev;
2043         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2044 }
2045
2046 #endif
2047
2048 /*
2049  *      Allocate a dst for local (unicast / anycast) address.
2050  */
2051
2052 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2053                                     const struct in6_addr *addr,
2054                                     int anycast)
2055 {
2056         struct net *net = dev_net(idev->dev);
2057         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2058                                             net->loopback_dev, 0);
2059         struct neighbour *neigh;
2060
2061         if (rt == NULL) {
2062                 if (net_ratelimit())
2063                         pr_warning("IPv6:  Maximum number of routes reached,"
2064                                    " consider increasing route/max_size.\n");
2065                 return ERR_PTR(-ENOMEM);
2066         }
2067
2068         in6_dev_hold(idev);
2069
2070         rt->dst.flags |= DST_HOST;
2071         rt->dst.input = ip6_input;
2072         rt->dst.output = ip6_output;
2073         rt->rt6i_idev = idev;
2074         rt->dst.obsolete = -1;
2075
2076         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2077         if (anycast)
2078                 rt->rt6i_flags |= RTF_ANYCAST;
2079         else
2080                 rt->rt6i_flags |= RTF_LOCAL;
2081         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2082         if (IS_ERR(neigh)) {
2083                 dst_free(&rt->dst);
2084
2085                 return ERR_CAST(neigh);
2086         }
2087         dst_set_neighbour(&rt->dst, neigh);
2088
2089         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2090         rt->rt6i_dst.plen = 128;
2091         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2092
2093         atomic_set(&rt->dst.__refcnt, 1);
2094
2095         return rt;
2096 }
2097
2098 int ip6_route_get_saddr(struct net *net,
2099                         struct rt6_info *rt,
2100                         const struct in6_addr *daddr,
2101                         unsigned int prefs,
2102                         struct in6_addr *saddr)
2103 {
2104         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2105         int err = 0;
2106         if (rt->rt6i_prefsrc.plen)
2107                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2108         else
2109                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2110                                          daddr, prefs, saddr);
2111         return err;
2112 }
2113
2114 /* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
2115 struct arg_dev_net_ip {
2116         struct net_device *dev;	/* only routes on this device are touched; NULL matches any device */
2117         struct net *net;	/* namespace; its ip6_null_entry is never modified */
2118         struct in6_addr *addr;	/* address compared against each route's rt6i_prefsrc.addr */
2119 };
2120
2121 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2122 {
2123         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2124         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2125         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2126
2127         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2128             rt != net->ipv6.ip6_null_entry &&
2129             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2130                 /* remove prefsrc entry */
2131                 rt->rt6i_prefsrc.plen = 0;
2132         }
2133         return 0;
2134 }
2135
2136 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2137 {
2138         struct net *net = dev_net(ifp->idev->dev);
2139         struct arg_dev_net_ip adni = {
2140                 .dev = ifp->idev->dev,
2141                 .net = net,
2142                 .addr = &ifp->addr,
2143         };
2144         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2145 }
2146
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
2147 struct arg_dev_net {
2148         struct net_device *dev;	/* device whose routes are selected; NULL selects every device */
2149         struct net *net;	/* namespace; its ip6_null_entry is never selected */
2150 };
2151
2152 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2153 {
2154         const struct arg_dev_net *adn = arg;
2155         const struct net_device *dev = adn->dev;
2156
2157         if ((rt->rt6i_dev == dev || dev == NULL) &&
2158             rt != adn->net->ipv6.ip6_null_entry) {
2159                 RT6_TRACE("deleted by ifdown %p\n", rt);
2160                 return -1;
2161         }
2162         return 0;
2163 }
2164
2165 void rt6_ifdown(struct net *net, struct net_device *dev)
2166 {
2167         struct arg_dev_net adn = {
2168                 .dev = dev,
2169                 .net = net,
2170         };
2171
2172         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2173         icmp6_clean_all(fib6_ifdown, &adn);
2174 }
2175
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
2176 struct rt6_mtu_change_arg
2177 {
2178         struct net_device *dev;	/* device whose MTU changed */
2179         unsigned mtu;	/* the new MTU */
2180 };
2181
2182 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2183 {
2184         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2185         struct inet6_dev *idev;
2186
2187         /* In IPv6 pmtu discovery is not optional,
2188            so that RTAX_MTU lock cannot disable it.
2189            We still use this lock to block changes
2190            caused by addrconf/ndisc.
2191         */
2192
2193         idev = __in6_dev_get(arg->dev);
2194         if (idev == NULL)
2195                 return 0;
2196
2197         /* For administrative MTU increase, there is no way to discover
2198            IPv6 PMTU increase, so PMTU increase should be updated here.
2199            Since RFC 1981 doesn't include administrative MTU increase
2200            update PMTU increase is a MUST. (i.e. jumbo frame)
2201          */
2202         /*
2203            If new MTU is less than route PMTU, this new MTU will be the
2204            lowest MTU in the path, update the route PMTU to reflect PMTU
2205            decreases; if new MTU is greater than route PMTU, and the
2206            old MTU is the lowest MTU in the path, update the route PMTU
2207            to reflect the increase. In this case if the other nodes' MTU
2208            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2209            PMTU discouvery.
2210          */
2211         if (rt->rt6i_dev == arg->dev &&
2212             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2213             (dst_mtu(&rt->dst) >= arg->mtu ||
2214              (dst_mtu(&rt->dst) < arg->mtu &&
2215               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2216                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2217         }
2218         return 0;
2219 }
2220
2221 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2222 {
2223         struct rt6_mtu_change_arg arg = {
2224                 .dev = dev,
2225                 .mtu = mtu,
2226         };
2227
2228         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2229 }
2230
2231 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2232         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2233         [RTA_OIF]               = { .type = NLA_U32 },
2234         [RTA_IIF]               = { .type = NLA_U32 },
2235         [RTA_PRIORITY]          = { .type = NLA_U32 },
2236         [RTA_METRICS]           = { .type = NLA_NESTED },
2237 };
2238
2239 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2240                               struct fib6_config *cfg)
2241 {
2242         struct rtmsg *rtm;
2243         struct nlattr *tb[RTA_MAX+1];
2244         int err;
2245
2246         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2247         if (err < 0)
2248                 goto errout;
2249
2250         err = -EINVAL;
2251         rtm = nlmsg_data(nlh);
2252         memset(cfg, 0, sizeof(*cfg));
2253
2254         cfg->fc_table = rtm->rtm_table;
2255         cfg->fc_dst_len = rtm->rtm_dst_len;
2256         cfg->fc_src_len = rtm->rtm_src_len;
2257         cfg->fc_flags = RTF_UP;
2258         cfg->fc_protocol = rtm->rtm_protocol;
2259
2260         if (rtm->rtm_type == RTN_UNREACHABLE)
2261                 cfg->fc_flags |= RTF_REJECT;
2262
2263         if (rtm->rtm_type == RTN_LOCAL)
2264                 cfg->fc_flags |= RTF_LOCAL;
2265
2266         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2267         cfg->fc_nlinfo.nlh = nlh;
2268         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2269
2270         if (tb[RTA_GATEWAY]) {
2271                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2272                 cfg->fc_flags |= RTF_GATEWAY;
2273         }
2274
2275         if (tb[RTA_DST]) {
2276                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2277
2278                 if (nla_len(tb[RTA_DST]) < plen)
2279                         goto errout;
2280
2281                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2282         }
2283
2284         if (tb[RTA_SRC]) {
2285                 int plen = (rtm->rtm_src_len + 7) >> 3;
2286
2287                 if (nla_len(tb[RTA_SRC]) < plen)
2288                         goto errout;
2289
2290                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2291         }
2292
2293         if (tb[RTA_PREFSRC])
2294                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2295
2296         if (tb[RTA_OIF])
2297                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2298
2299         if (tb[RTA_PRIORITY])
2300                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2301
2302         if (tb[RTA_METRICS]) {
2303                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2304                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2305         }
2306
2307         if (tb[RTA_TABLE])
2308                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2309
2310         err = 0;
2311 errout:
2312         return err;
2313 }
2314
2315 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2316 {
2317         struct fib6_config cfg;
2318         int err;
2319
2320         err = rtm_to_fib6_config(skb, nlh, &cfg);
2321         if (err < 0)
2322                 return err;
2323
2324         return ip6_route_del(&cfg);
2325 }
2326
2327 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2328 {
2329         struct fib6_config cfg;
2330         int err;
2331
2332         err = rtm_to_fib6_config(skb, nlh, &cfg);
2333         if (err < 0)
2334                 return err;
2335
2336         return ip6_route_add(&cfg);
2337 }
2338
2339 static inline size_t rt6_nlmsg_size(void)
2340 {
2341         return NLMSG_ALIGN(sizeof(struct rtmsg))
2342                + nla_total_size(16) /* RTA_SRC */
2343                + nla_total_size(16) /* RTA_DST */
2344                + nla_total_size(16) /* RTA_GATEWAY */
2345                + nla_total_size(16) /* RTA_PREFSRC */
2346                + nla_total_size(4) /* RTA_TABLE */
2347                + nla_total_size(4) /* RTA_IIF */
2348                + nla_total_size(4) /* RTA_OIF */
2349                + nla_total_size(4) /* RTA_PRIORITY */
2350                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2351                + nla_total_size(sizeof(struct rta_cacheinfo));
2352 }
2353
/*
 * Append a netlink route message describing @rt to @skb.
 *
 * @dst, @src:  when non-NULL they are reported as RTA_DST / RTA_SRC with a
 *              prefix length of 128 instead of the route's own keys
 *              (RTA_SRC is only emitted with CONFIG_IPV6_SUBTREES).
 * @iif:        when non-zero, RTA_IIF is reported; for multicast
 *              destinations with CONFIG_IPV6_MROUTE, ip6mr_get_route() is
 *              called instead.
 * @type, @pid, @seq, @flags: passed to nlmsg_put() for the message header.
 * @prefix:     when non-zero, only routes flagged RTF_PREFIX_RT are emitted.
 * @nowait:     passed to ip6mr_get_route().
 *
 * Returns 1 when the route is skipped by the @prefix filter, 0 when
 * ip6mr_get_route() returned 0 with @nowait clear, -EMSGSIZE when the
 * message does not fit (the partial message is cancelled), and the value
 * of nlmsg_end() otherwise.
 *
 * NOTE(review): the NLA_PUT*() macros are defined outside this file; they
 * presumably jump to nla_put_failure when @skb lacks room -- confirm.
 */
2354 static int rt6_fill_node(struct net *net,
2355                          struct sk_buff *skb, struct rt6_info *rt,
2356                          struct in6_addr *dst, struct in6_addr *src,
2357                          int iif, int type, u32 pid, u32 seq,
2358                          int prefix, int nowait, unsigned int flags)
2359 {
2360         struct rtmsg *rtm;
2361         struct nlmsghdr *nlh;
2362         long expires;
2363         u32 table;
2364         struct neighbour *n;
2365
2366         if (prefix) {   /* user wants prefix routes only */
2367                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2368                         /* success since this is not a prefix route */
2369                         return 1;
2370                 }
2371         }
2372
2373         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2374         if (nlh == NULL)
2375                 return -EMSGSIZE;
2376
             /* Fixed rtmsg header. */
2377         rtm = nlmsg_data(nlh);
2378         rtm->rtm_family = AF_INET6;
2379         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2380         rtm->rtm_src_len = rt->rt6i_src.plen;
2381         rtm->rtm_tos = 0;
2382         if (rt->rt6i_table)
2383                 table = rt->rt6i_table->tb6_id;
2384         else
2385                 table = RT6_TABLE_UNSPEC;
             /* The table id goes into both the header field and RTA_TABLE. */
2386         rtm->rtm_table = table;
2387         NLA_PUT_U32(skb, RTA_TABLE, table);
             /* Route type: reject, then local (by flag or loopback device), else unicast. */
2388         if (rt->rt6i_flags&RTF_REJECT)
2389                 rtm->rtm_type = RTN_UNREACHABLE;
2390         else if (rt->rt6i_flags&RTF_LOCAL)
2391                 rtm->rtm_type = RTN_LOCAL;
2392         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2393                 rtm->rtm_type = RTN_LOCAL;
2394         else
2395                 rtm->rtm_type = RTN_UNICAST;
2396         rtm->rtm_flags = 0;
2397         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
             /* Stored protocol, overridden by the first matching flag below. */
2398         rtm->rtm_protocol = rt->rt6i_protocol;
2399         if (rt->rt6i_flags&RTF_DYNAMIC)
2400                 rtm->rtm_protocol = RTPROT_REDIRECT;
2401         else if (rt->rt6i_flags & RTF_ADDRCONF)
2402                 rtm->rtm_protocol = RTPROT_KERNEL;
2403         else if (rt->rt6i_flags&RTF_DEFAULT)
2404                 rtm->rtm_protocol = RTPROT_RA;
2405
2406         if (rt->rt6i_flags&RTF_CACHE)
2407                 rtm->rtm_flags |= RTM_F_CLONED;
2408
             /* An explicit @dst is reported as a /128; otherwise the route's key, if any. */
2409         if (dst) {
2410                 NLA_PUT(skb, RTA_DST, 16, dst);
2411                 rtm->rtm_dst_len = 128;
2412         } else if (rtm->rtm_dst_len)
2413                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2414 #ifdef CONFIG_IPV6_SUBTREES
2415         if (src) {
2416                 NLA_PUT(skb, RTA_SRC, 16, src);
2417                 rtm->rtm_src_len = 128;
2418         } else if (rtm->rtm_src_len)
2419                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2420 #endif
2421         if (iif) {
2422 #ifdef CONFIG_IPV6_MROUTE
2423                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2424                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2425                         if (err <= 0) {
2426                                 if (!nowait) {
2427                                         if (err == 0)
2428                                                 return 0;
2429                                         goto nla_put_failure;
2430                                 } else {
2431                                         if (err == -EMSGSIZE)
2432                                                 goto nla_put_failure;
2433                                 }
2434                         }
2435                 } else
2436 #endif
                             /* With CONFIG_IPV6_MROUTE this is the else-branch of the multicast test above. */
2437                         NLA_PUT_U32(skb, RTA_IIF, iif);
2438         } else if (dst) {
                     /* No input interface: report the source address selected for @dst. */
2439                 struct in6_addr saddr_buf;
2440                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2441                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2442         }
2443
             /* The route's configured preferred source is reported as well (in addition to any RTA_PREFSRC above). */
2444         if (rt->rt6i_prefsrc.plen) {
2445                 struct in6_addr saddr_buf;
2446                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2447                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2448         }
2449
2450         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2451                 goto nla_put_failure;
2452
             /* RTA_GATEWAY is taken from the neighbour's key; the neighbour is only accessed inside the RCU read section. */
2453         rcu_read_lock();
2454         n = dst_get_neighbour(&rt->dst);
2455         if (n) {
2456                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2457                         rcu_read_unlock();
2458                         goto nla_put_failure;
2459                 }
2460         }
2461         rcu_read_unlock();
2462
2463         if (rt->dst.dev)
2464                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2465
2466         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2467
             /* Remaining lifetime in jiffies: 0 without RTF_EXPIRES, capped at INT_MAX. */
2468         if (!(rt->rt6i_flags & RTF_EXPIRES))
2469                 expires = 0;
2470         else if (rt->rt6i_expires - jiffies < INT_MAX)
2471                 expires = rt->rt6i_expires - jiffies;
2472         else
2473                 expires = INT_MAX;
2474
2475         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2476                                expires, rt->dst.error) < 0)
2477                 goto nla_put_failure;
2478
2479         return nlmsg_end(skb, nlh);
2480
             /* Undo the partially built message. */
2481 nla_put_failure:
2482         nlmsg_cancel(skb, nlh);
2483         return -EMSGSIZE;
2484 }
2485
2486 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2487 {
2488         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2489         int prefix;
2490
2491         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2492                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2493                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2494         } else
2495                 prefix = 0;
2496
2497         return rt6_fill_node(arg->net,
2498                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2499                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2500                      prefix, 0, NLM_F_MULTI);
2501 }
2502
2503 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2504 {
2505         struct net *net = sock_net(in_skb->sk);
2506         struct nlattr *tb[RTA_MAX+1];
2507         struct rt6_info *rt;
2508         struct sk_buff *skb;
2509         struct rtmsg *rtm;
2510         struct flowi6 fl6;
2511         int err, iif = 0;
2512
2513         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2514         if (err < 0)
2515                 goto errout;
2516
2517         err = -EINVAL;
2518         memset(&fl6, 0, sizeof(fl6));
2519
2520         if (tb[RTA_SRC]) {
2521                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2522                         goto errout;
2523
2524                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2525         }
2526
2527         if (tb[RTA_DST]) {
2528                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2529                         goto errout;
2530
2531                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2532         }
2533
2534         if (tb[RTA_IIF])
2535                 iif = nla_get_u32(tb[RTA_IIF]);
2536
2537         if (tb[RTA_OIF])
2538                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2539
2540         if (iif) {
2541                 struct net_device *dev;
2542                 dev = __dev_get_by_index(net, iif);
2543                 if (!dev) {
2544                         err = -ENODEV;
2545                         goto errout;
2546                 }
2547         }
2548
2549         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2550         if (skb == NULL) {
2551                 err = -ENOBUFS;
2552                 goto errout;
2553         }
2554
2555         /* Reserve room for dummy headers, this skb can pass
2556            through good chunk of routing engine.
2557          */
2558         skb_reset_mac_header(skb);
2559         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2560
2561         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2562         skb_dst_set(skb, &rt->dst);
2563
2564         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2565                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2566                             nlh->nlmsg_seq, 0, 0, 0);
2567         if (err < 0) {
2568                 kfree_skb(skb);
2569                 goto errout;
2570         }
2571
2572         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2573 errout:
2574         return err;
2575 }
2576
2577 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2578 {
2579         struct sk_buff *skb;
2580         struct net *net = info->nl_net;
2581         u32 seq;
2582         int err;
2583
2584         err = -ENOBUFS;
2585         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2586
2587         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2588         if (skb == NULL)
2589                 goto errout;
2590
2591         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2592                                 event, info->pid, seq, 0, 0, 0);
2593         if (err < 0) {
2594                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2595                 WARN_ON(err == -EMSGSIZE);
2596                 kfree_skb(skb);
2597                 goto errout;
2598         }
2599         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2600                     info->nlh, gfp_any());
2601         return;
2602 errout:
2603         if (err < 0)
2604                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2605 }
2606
2607 static int ip6_route_dev_notify(struct notifier_block *this,
2608                                 unsigned long event, void *data)
2609 {
2610         struct net_device *dev = (struct net_device *)data;
2611         struct net *net = dev_net(dev);
2612
2613         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2614                 net->ipv6.ip6_null_entry->dst.dev = dev;
2615                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2616 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2617                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2618                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2619                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2620                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2621 #endif
2622         }
2623
2624         return NOTIFY_OK;
2625 }
2626
2627 /*
2628  *      /proc
2629  */
2630
2631 #ifdef CONFIG_PROC_FS
2632
/*
 * Bookkeeping record for emitting route text into a caller-supplied buffer.
 * NOTE(review): no function in the visible part of this file references
 * this type (the /proc handlers below use seq_file), so the field meanings
 * are presumed from their names only - confirm before relying on them.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2641
2642 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2643 {
2644         struct seq_file *m = p_arg;
2645         struct neighbour *n;
2646
2647         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2648
2649 #ifdef CONFIG_IPV6_SUBTREES
2650         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2651 #else
2652         seq_puts(m, "00000000000000000000000000000000 00 ");
2653 #endif
2654         rcu_read_lock();
2655         n = dst_get_neighbour(&rt->dst);
2656         if (n) {
2657                 seq_printf(m, "%pi6", n->primary_key);
2658         } else {
2659                 seq_puts(m, "00000000000000000000000000000000");
2660         }
2661         rcu_read_unlock();
2662         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2663                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2664                    rt->dst.__use, rt->rt6i_flags,
2665                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2666         return 0;
2667 }
2668
2669 static int ipv6_route_show(struct seq_file *m, void *v)
2670 {
2671         struct net *net = (struct net *)m->private;
2672         fib6_clean_all(net, rt6_info_route, 0, m);
2673         return 0;
2674 }
2675
/*
 * open() handler for the route listing: sets up a single-shot,
 * namespace-aware seq_file whose show routine is ipv6_route_show().
 * Returns whatever single_open_net() returns.
 */
static int ipv6_route_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, ipv6_route_show);
}
2680
/*
 * File operations for the "ipv6_route" proc entry (registered in
 * ip6_route_net_init_late()): opened through ipv6_route_open(), read and
 * seek via the generic seq_file helpers, released with single_release_net().
 */
static const struct file_operations ipv6_route_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = ipv6_route_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release_net,
};
2688
2689 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2690 {
2691         struct net *net = (struct net *)seq->private;
2692         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2693                    net->ipv6.rt6_stats->fib_nodes,
2694                    net->ipv6.rt6_stats->fib_route_nodes,
2695                    net->ipv6.rt6_stats->fib_rt_alloc,
2696                    net->ipv6.rt6_stats->fib_rt_entries,
2697                    net->ipv6.rt6_stats->fib_rt_cache,
2698                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2699                    net->ipv6.rt6_stats->fib_discarded_routes);
2700
2701         return 0;
2702 }
2703
/*
 * open() handler for the statistics file: sets up a single-shot,
 * namespace-aware seq_file whose show routine is rt6_stats_seq_show().
 * Returns whatever single_open_net() returns.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
        return single_open_net(inode, file, rt6_stats_seq_show);
}
2708
/*
 * File operations for the "rt6_stats" proc entry (registered in
 * ip6_route_net_init_late()): opened through rt6_stats_seq_open(), read and
 * seek via the generic seq_file helpers, released with single_release_net().
 */
static const struct file_operations rt6_stats_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt6_stats_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release_net,
};
2716 #endif  /* CONFIG_PROC_FS */
2717
2718 #ifdef CONFIG_SYSCTL
2719
2720 static
2721 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2722                               void __user *buffer, size_t *lenp, loff_t *ppos)
2723 {
2724         struct net *net;
2725         int delay;
2726         if (!write)
2727                 return -EINVAL;
2728
2729         net = (struct net *)ctl->extra1;
2730         delay = net->ipv6.sysctl.flush_delay;
2731         proc_dointvec(ctl, write, buffer, lenp, ppos);
2732         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2733         return 0;
2734 }
2735
/*
 * Template for the per-namespace route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and re-points each
 * entry's .data at the new namespace by numeric index (table[0] ..
 * table[9]), so the order of the entries here must stay in sync with
 * that function.  The .data pointers below refer to init_net (or to the
 * dst ops template for gc_thresh).
 */
ctl_table ipv6_route_table_template[] = {
        {
                /* [0] write-only (0200); handled by ipv6_sysctl_rtcache_flush() */
                .procname       =       "flush",
                .data           =       &init_net.ipv6.sysctl.flush_delay,
                .maxlen         =       sizeof(int),
                .mode           =       0200,
                .proc_handler   =       ipv6_sysctl_rtcache_flush
        },
        {
                /* [1] */
                .procname       =       "gc_thresh",
                .data           =       &ip6_dst_ops_template.gc_thresh,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [2] */
                .procname       =       "max_size",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [3] same variable as gc_min_interval_ms ([9]), jiffies handler */
                .procname       =       "gc_min_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [4] */
                .procname       =       "gc_timeout",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [5] */
                .procname       =       "gc_interval",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [6] */
                .procname       =       "gc_elasticity",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [7] */
                .procname       =       "mtu_expires",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_jiffies,
        },
        {
                /* [8] */
                .procname       =       "min_adv_mss",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec,
        },
        {
                /* [9] same variable as gc_min_interval ([3]), ms-jiffies handler */
                .procname       =       "gc_min_interval_ms",
                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen         =       sizeof(int),
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
        /* empty terminating entry */
        { }
};
2809
2810 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2811 {
2812         struct ctl_table *table;
2813
2814         table = kmemdup(ipv6_route_table_template,
2815                         sizeof(ipv6_route_table_template),
2816                         GFP_KERNEL);
2817
2818         if (table) {
2819                 table[0].data = &net->ipv6.sysctl.flush_delay;
2820                 table[0].extra1 = net;
2821                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2822                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2823                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2824                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2825                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2826                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2827                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2828                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2829                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2830         }
2831
2832         return table;
2833 }
2834 #endif
2835
/*
 * Per-namespace initialisation of the IPv6 routing state.
 *
 * Copies the dst ops template into @net, initialises its dst entry
 * accounting, duplicates the template null route (and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole routes) for
 * this namespace, and sets the default values of the route sysctls.
 *
 * Returns 0 on success.  On failure everything allocated so far is
 * released and the initial -ENOMEM is returned.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        int ret = -ENOMEM;

        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                goto out_ip6_dst_ops;

        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_ip6_dst_entries;
        /* The copy's path points at itself and uses this namespace's ops. */
        net->ipv6.ip6_null_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        /* Same treatment for the prohibit entry ... */
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);

        /* ... and for the blackhole entry. */
        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.path =
                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
#endif

        /* Default values of the per-namespace route sysctls. */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = 4096;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        /* NOTE(review): 20 and 40 are presumably the TCP and IPv6 header sizes - confirm */
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

        net->ipv6.ip6_rt_gc_expire = 30*HZ;

        ret = 0;
out:
        return ret;

        /*
         * Error unwinding: each label releases what was set up before the
         * failing step, then falls through to the earlier ones and finally
         * jumps back to "out" with ret still -ENOMEM.
         */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
        goto out;
}
2907
/*
 * Per-namespace teardown matching ip6_route_net_init(): frees the
 * namespace's copies of the null (and, with CONFIG_IPV6_MULTIPLE_TABLES,
 * prohibit and blackhole) route entries and destroys the dst entry
 * accounting of the namespace's dst ops.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
        kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_prohibit_entry);
        kfree(net->ipv6.ip6_blk_hole_entry);
#endif
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
2917
/*
 * Late per-namespace initialisation: with CONFIG_PROC_FS, creates the
 * "ipv6_route" and "rt6_stats" proc entries for @net.
 *
 * Always returns 0; the results of proc_net_fops_create() are not checked.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        /* NOTE(review): mode 0 presumably selects the procfs default mode - confirm */
        proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
        proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
        return 0;
}
2926
/*
 * Late per-namespace teardown: with CONFIG_PROC_FS, removes the
 * "ipv6_route" and "rt6_stats" proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        proc_net_remove(net, "ipv6_route");
        proc_net_remove(net, "rt6_stats");
#endif
}
2934
/*
 * Per-namespace hooks for the core routing state; registered first in
 * ip6_route_init() and unregistered in ip6_route_cleanup().
 */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};
2939
/*
 * Per-namespace hooks for the proc entries; registered in ip6_route_init()
 * after fib6, xfrm6 and the fib6 rules have been initialised.
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};
2944
/*
 * Netdevice notifier block that routes device events to
 * ip6_route_dev_notify(); registered as the last step of ip6_route_init().
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = 0,
};
2949
/*
 * One-time initialisation of the IPv6 routing layer.
 *
 * Steps, in order: create the "ip6_dst_cache" slab cache, initialise the
 * blackhole dst ops accounting, register the per-namespace ops, attach the
 * loopback device to init_net's special route entries, initialise fib6,
 * xfrm6 and the fib6 rules, register the late per-namespace ops, register
 * the PF_INET6 route rtnetlink handlers and finally the netdevice notifier.
 *
 * Returns 0 on success or a negative errno; on failure the steps that
 * already succeeded are undone in reverse order through the labels at the
 * end of the function.
 */
int __init ip6_route_init(void)
{
        int ret;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_dst_entries;

        /* The blackhole ops allocate from the same slab cache. */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        /* The loopback device is registered before this code runs, so the
         * notifier (registered at the end of this function) never sees its
         * NETDEV_REGISTER event for init_net and the loopback reference in
         * rt6_info would not be taken; do it manually for init_net. */
        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        /*
         * Note: "xfrm6_init" and "fib6_rules_init" below are goto labels
         * that happen to share their names with the functions; each label
         * undoes the step named after it.
         */
        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        /* __rtnl_register() failures are all reported as -ENOBUFS. */
        ret = -ENOBUFS;
        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
                goto out_register_late_subsys;

        /*
         * NOTE(review): if this (or a later __rtnl_register() above) fails,
         * the unwind path contains no call that unregisters rtnetlink
         * handlers already registered - confirm whether that is intended.
         */
        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

out:
        return ret;

        /* Error unwinding, in reverse order of initialisation. */
out_register_late_subsys:
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}
3027
/*
 * Tear down what ip6_route_init() set up, in reverse order: the netdevice
 * notifier, the late per-namespace ops, fib6 rules, xfrm6, fib6, the core
 * per-namespace ops, the blackhole dst accounting and finally the dst slab
 * cache.
 * NOTE(review): no rtnetlink handler is unregistered here - confirm that
 * this is handled elsewhere.
 */
void ip6_route_cleanup(void)
{
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}