net/ipv6: Correct PIM6 mrt_lock handling
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug level for this file.  With RT6_DEBUG at 3 or above, RDBG() and
 * RT6_TRACE() expand to printk calls; below that they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return dst_cow_metrics_generic(dst, old);
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 0,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Reject-route template carrying -EACCES, with the ip6_pkt_prohibit
 * handlers on input and output.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
		.error		= -EACCES,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/*
 * Reject-route template carrying -EINVAL, with dst_discard on both input
 * and output.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.input		= dst_discard,
		.output		= dst_discard,
		.error		= -EINVAL,
		.obsolete	= -1,
		.__use		= 1,
		.__refcnt	= ATOMIC_INIT(1),
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a neighbour solicitation towards the
 * neighbour of @rt when that neighbour is not in a NUD_VALID state and the
 * last update is older than the interface's rtr_probe_interval.
 *
 * @rt may be NULL, in which case nothing happens.
 *
 * The rate-limit test and the store to neigh->updated are done under the
 * write side of neigh->lock.  Holding only the read side would let several
 * CPUs pass the time check at once, each writing neigh->updated and each
 * sending a probe.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	if ((neigh->nud_state & NUD_VALID) ||
	    !time_after(jiffies,
			neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		write_unlock_bh(&neigh->lock);
		goto out;
	}
	/* Claim this probe slot before dropping the lock. */
	neigh->updated = jiffies;
	write_unlock_bh(&neigh->lock);

	/* The solicitation itself is sent without neigh->lock held. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no reachability probes are sent. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle a route information option @opt of @len bytes received on @dev
 * from the router @gwaddr.
 *
 * After validating the option, the matching route is looked up (the
 * default router when prefix_len is 0, a route-info route otherwise).
 * A zero lifetime deletes an existing route; a non-zero lifetime adds the
 * route if missing or refreshes the preference bits of an existing one.
 * The expiry is then set from the lifetime, or cleared when the lifetime
 * is not finite.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	struct rt6_info *rt;
	unsigned long lifetime;
	unsigned int pref;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length != 3) {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	} else {
		prefix = (struct in6_addr *)rinfo->prefix;
	}

	if (rinfo->prefix_len != 0)
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);
	else
		rt = rt6_get_dflt_router(gwaddr, dev);

	/* A zero lifetime withdraws a route we already have. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) |
				 RTF_PREF(pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
625
/*
 * Walk back up the fib6 tree when the current lookup produced the null
 * entry.  Expects the enclosing function to provide the variables `rt`
 * and `fn` and the labels `out` and `restart`: reaching a node flagged
 * RTN_TL_ROOT jumps to `out`, reaching a node flagged RTN_RTINFO jumps to
 * `restart`.  When the parent has a subtree other than the current node,
 * the walk continues from a fib6_lookup() of saddr in that subtree.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == (__net)->ipv6.ip6_null_entry) {			\
		struct fib6_node *up;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			up = fn->parent;				\
			if (FIB6_SUBTREE(up) && FIB6_SUBTREE(up) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(up), NULL, saddr); \
			else						\
				fn = up;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
643
644 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
645                                              struct fib6_table *table,
646                                              struct flowi6 *fl6, int flags)
647 {
648         struct fib6_node *fn;
649         struct rt6_info *rt;
650
651         read_lock_bh(&table->tb6_lock);
652         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
653 restart:
654         rt = fn->leaf;
655         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
656         BACKTRACK(net, &fl6->saddr);
657 out:
658         dst_use(&rt->dst, jiffies);
659         read_unlock_bh(&table->tb6_lock);
660         return rt;
661
662 }
663
664 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
665                             const struct in6_addr *saddr, int oif, int strict)
666 {
667         struct flowi6 fl6 = {
668                 .flowi6_oif = oif,
669                 .daddr = *daddr,
670         };
671         struct dst_entry *dst;
672         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
673
674         if (saddr) {
675                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
676                 flags |= RT6_LOOKUP_F_HAS_SADDR;
677         }
678
679         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
680         if (dst->error == 0)
681                 return (struct rt6_info *) dst;
682
683         dst_release(dst);
684
685         return NULL;
686 }
687
688 EXPORT_SYMBOL(rt6_lookup);
689
690 /* ip6_ins_rt is called with FREE table->tb6_lock.
691    It takes new route entry, the addition fails by any reason the
692    route is freed. In any case, if caller does not hold it, it may
693    be destroyed.
694  */
695
696 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
697 {
698         int err;
699         struct fib6_table *table;
700
701         table = rt->rt6i_table;
702         write_lock_bh(&table->tb6_lock);
703         err = fib6_add(&table->tb6_root, rt, info);
704         write_unlock_bh(&table->tb6_lock);
705
706         return err;
707 }
708
709 int ip6_ins_rt(struct rt6_info *rt)
710 {
711         struct nl_info info = {
712                 .nl_net = dev_net(rt->rt6i_dev),
713         };
714         return __ip6_ins_rt(rt, &info);
715 }
716
717 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
718                                       const struct in6_addr *daddr,
719                                       const struct in6_addr *saddr)
720 {
721         struct rt6_info *rt;
722
723         /*
724          *      Clone the route.
725          */
726
727         rt = ip6_rt_copy(ort, daddr);
728
729         if (rt) {
730                 struct neighbour *neigh;
731                 int attempts = !in_softirq();
732
733                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
734                         if (ort->rt6i_dst.plen != 128 &&
735                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
736                                 rt->rt6i_flags |= RTF_ANYCAST;
737                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
738                 }
739
740                 rt->rt6i_flags |= RTF_CACHE;
741
742 #ifdef CONFIG_IPV6_SUBTREES
743                 if (rt->rt6i_src.plen && saddr) {
744                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
745                         rt->rt6i_src.plen = 128;
746                 }
747 #endif
748
749         retry:
750                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
751                 if (IS_ERR(neigh)) {
752                         struct net *net = dev_net(rt->rt6i_dev);
753                         int saved_rt_min_interval =
754                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
755                         int saved_rt_elasticity =
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
757
758                         if (attempts-- > 0) {
759                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
760                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
761
762                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
763
764                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
765                                         saved_rt_elasticity;
766                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
767                                         saved_rt_min_interval;
768                                 goto retry;
769                         }
770
771                         if (net_ratelimit())
772                                 printk(KERN_WARNING
773                                        "ipv6: Neighbour table overflow.\n");
774                         dst_free(&rt->dst);
775                         return NULL;
776                 }
777                 dst_set_neighbour(&rt->dst, neigh);
778
779         }
780
781         return rt;
782 }
783
784 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
785                                         const struct in6_addr *daddr)
786 {
787         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
788
789         if (rt) {
790                 rt->rt6i_flags |= RTF_CACHE;
791                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
792         }
793         return rt;
794 }
795
796 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
797                                       struct flowi6 *fl6, int flags, bool input)
798 {
799         struct fib6_node *fn;
800         struct rt6_info *rt, *nrt;
801         int strict = 0;
802         int attempts = 3;
803         int err;
804         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805         int local = RTF_NONEXTHOP;
806
807         strict |= flags & RT6_LOOKUP_F_IFACE;
808         if (input)
809                 local |= RTF_LOCAL;
810
811 relookup:
812         read_lock_bh(&table->tb6_lock);
813
814 restart_2:
815         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
816
817 restart:
818         rt = rt6_select(fn, oif, strict | reachable);
819
820         BACKTRACK(net, &fl6->saddr);
821         if (rt == net->ipv6.ip6_null_entry ||
822             rt->rt6i_flags & RTF_CACHE)
823                 goto out;
824
825         dst_hold(&rt->dst);
826         read_unlock_bh(&table->tb6_lock);
827
828         if (!dst_get_neighbour_raw(&rt->dst)
829             && !(rt->rt6i_flags & local))
830                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
831         else if (!(rt->dst.flags & DST_HOST))
832                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
833         else
834                 goto out2;
835
836         dst_release(&rt->dst);
837         rt = nrt ? : net->ipv6.ip6_null_entry;
838
839         dst_hold(&rt->dst);
840         if (nrt) {
841                 err = ip6_ins_rt(nrt);
842                 if (!err)
843                         goto out2;
844         }
845
846         if (--attempts <= 0)
847                 goto out2;
848
849         /*
850          * Race condition! In the gap, when table->tb6_lock was
851          * released someone could insert this route.  Relookup.
852          */
853         dst_release(&rt->dst);
854         goto relookup;
855
856 out:
857         if (reachable) {
858                 reachable = 0;
859                 goto restart_2;
860         }
861         dst_hold(&rt->dst);
862         read_unlock_bh(&table->tb6_lock);
863 out2:
864         rt->dst.lastuse = jiffies;
865         rt->dst.__use++;
866
867         return rt;
868 }
869
870 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
871                                             struct flowi6 *fl6, int flags)
872 {
873         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
874 }
875
876 void ip6_route_input(struct sk_buff *skb)
877 {
878         const struct ipv6hdr *iph = ipv6_hdr(skb);
879         struct net *net = dev_net(skb->dev);
880         int flags = RT6_LOOKUP_F_HAS_SADDR;
881         struct flowi6 fl6 = {
882                 .flowi6_iif = skb->dev->ifindex,
883                 .daddr = iph->daddr,
884                 .saddr = iph->saddr,
885                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
886                 .flowi6_mark = skb->mark,
887                 .flowi6_proto = iph->nexthdr,
888         };
889
890         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
891                 flags |= RT6_LOOKUP_F_IFACE;
892
893         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
894 }
895
896 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
897                                              struct flowi6 *fl6, int flags)
898 {
899         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
900 }
901
902 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
903                                     struct flowi6 *fl6)
904 {
905         int flags = 0;
906
907         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
908                 flags |= RT6_LOOKUP_F_IFACE;
909
910         if (!ipv6_addr_any(&fl6->saddr))
911                 flags |= RT6_LOOKUP_F_HAS_SADDR;
912         else if (sk)
913                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
914
915         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
916 }
917
918 EXPORT_SYMBOL(ip6_route_output);
919
920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
921 {
922         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
923         struct dst_entry *new = NULL;
924
925         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
926         if (rt) {
927                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
928
929                 new = &rt->dst;
930
931                 new->__use = 1;
932                 new->input = dst_discard;
933                 new->output = dst_discard;
934
935                 if (dst_metrics_read_only(&ort->dst))
936                         new->_metrics = ort->dst._metrics;
937                 else
938                         dst_copy_metrics(new, &ort->dst);
939                 rt->rt6i_idev = ort->rt6i_idev;
940                 if (rt->rt6i_idev)
941                         in6_dev_hold(rt->rt6i_idev);
942                 rt->rt6i_expires = 0;
943
944                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
945                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
946                 rt->rt6i_metric = 0;
947
948                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
949 #ifdef CONFIG_IPV6_SUBTREES
950                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
951 #endif
952
953                 dst_free(new);
954         }
955
956         dst_release(dst_orig);
957         return new ? new : ERR_PTR(-ENOMEM);
958 }
959
960 /*
961  *      Destination cache support functions
962  */
963
964 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
965 {
966         struct rt6_info *rt;
967
968         rt = (struct rt6_info *) dst;
969
970         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
971                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
972                         if (!rt->rt6i_peer)
973                                 rt6_bind_peer(rt, 0);
974                         rt->rt6i_peer_genid = rt6_peer_genid();
975                 }
976                 return dst;
977         }
978         return NULL;
979 }
980
981 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
982 {
983         struct rt6_info *rt = (struct rt6_info *) dst;
984
985         if (rt) {
986                 if (rt->rt6i_flags & RTF_CACHE) {
987                         if (rt6_check_expired(rt)) {
988                                 ip6_del_rt(rt);
989                                 dst = NULL;
990                         }
991                 } else {
992                         dst_release(dst);
993                         dst = NULL;
994                 }
995         }
996         return dst;
997 }
998
999 static void ip6_link_failure(struct sk_buff *skb)
1000 {
1001         struct rt6_info *rt;
1002
1003         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1004
1005         rt = (struct rt6_info *) skb_dst(skb);
1006         if (rt) {
1007                 if (rt->rt6i_flags&RTF_CACHE) {
1008                         dst_set_expires(&rt->dst, 0);
1009                         rt->rt6i_flags |= RTF_EXPIRES;
1010                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1011                         rt->rt6i_node->fn_sernum = -1;
1012         }
1013 }
1014
1015 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1016 {
1017         struct rt6_info *rt6 = (struct rt6_info*)dst;
1018
1019         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1020                 rt6->rt6i_flags |= RTF_MODIFIED;
1021                 if (mtu < IPV6_MIN_MTU)
1022                         mtu = IPV6_MIN_MTU;
1023
1024                 dst_metric_set(dst, RTAX_MTU, mtu);
1025         }
1026 }
1027
1028 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1029 {
1030         struct net_device *dev = dst->dev;
1031         unsigned int mtu = dst_mtu(dst);
1032         struct net *net = dev_net(dev);
1033
1034         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1035
1036         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1037                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1038
1039         /*
1040          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1041          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1042          * IPV6_MAXPLEN is also valid and means: "any MSS,
1043          * rely only on pmtu discovery"
1044          */
1045         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1046                 mtu = IPV6_MAXPLEN;
1047         return mtu;
1048 }
1049
1050 static unsigned int ip6_mtu(const struct dst_entry *dst)
1051 {
1052         struct inet6_dev *idev;
1053         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1054
1055         if (mtu)
1056                 goto out;
1057
1058         mtu = IPV6_MIN_MTU;
1059
1060         rcu_read_lock();
1061         idev = __in6_dev_get(dst->dev);
1062         if (idev)
1063                 mtu = idev->cnf.mtu6;
1064         rcu_read_unlock();
1065
1066 out:
1067         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1068 }
1069
/*
 * Head of the list of dst entries created by icmp6_dst_alloc(), chained
 * through dst.next; icmp6_dst_gc() and icmp6_clean_all() prune it.
 */
1070 static struct dst_entry *icmp6_dst_gc_list;
/* Taken (BH-safe) around every walk or update of icmp6_dst_gc_list in the code shown here. */
1071 static DEFINE_SPINLOCK(icmp6_dst_lock);
1072
1073 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1074                                   struct neighbour *neigh,
1075                                   const struct in6_addr *addr)
1076 {
1077         struct rt6_info *rt;
1078         struct inet6_dev *idev = in6_dev_get(dev);
1079         struct net *net = dev_net(dev);
1080
1081         if (unlikely(idev == NULL))
1082                 return NULL;
1083
1084         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1085         if (unlikely(rt == NULL)) {
1086                 in6_dev_put(idev);
1087                 goto out;
1088         }
1089
1090         if (neigh)
1091                 neigh_hold(neigh);
1092         else {
1093                 neigh = ndisc_get_neigh(dev, addr);
1094                 if (IS_ERR(neigh))
1095                         neigh = NULL;
1096         }
1097
1098         rt->dst.flags |= DST_HOST;
1099         rt->dst.output  = ip6_output;
1100         dst_set_neighbour(&rt->dst, neigh);
1101         atomic_set(&rt->dst.__refcnt, 1);
1102         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1103         rt->rt6i_dst.plen = 128;
1104         rt->rt6i_idev     = idev;
1105         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1106
1107         spin_lock_bh(&icmp6_dst_lock);
1108         rt->dst.next = icmp6_dst_gc_list;
1109         icmp6_dst_gc_list = &rt->dst;
1110         spin_unlock_bh(&icmp6_dst_lock);
1111
1112         fib6_force_start_gc(net);
1113
1114 out:
1115         return &rt->dst;
1116 }
1117
1118 int icmp6_dst_gc(void)
1119 {
1120         struct dst_entry *dst, **pprev;
1121         int more = 0;
1122
1123         spin_lock_bh(&icmp6_dst_lock);
1124         pprev = &icmp6_dst_gc_list;
1125
1126         while ((dst = *pprev) != NULL) {
1127                 if (!atomic_read(&dst->__refcnt)) {
1128                         *pprev = dst->next;
1129                         dst_free(dst);
1130                 } else {
1131                         pprev = &dst->next;
1132                         ++more;
1133                 }
1134         }
1135
1136         spin_unlock_bh(&icmp6_dst_lock);
1137
1138         return more;
1139 }
1140
1141 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1142                             void *arg)
1143 {
1144         struct dst_entry *dst, **pprev;
1145
1146         spin_lock_bh(&icmp6_dst_lock);
1147         pprev = &icmp6_dst_gc_list;
1148         while ((dst = *pprev) != NULL) {
1149                 struct rt6_info *rt = (struct rt6_info *) dst;
1150                 if (func(rt, arg)) {
1151                         *pprev = dst->next;
1152                         dst_free(dst);
1153                 } else {
1154                         pprev = &dst->next;
1155                 }
1156         }
1157         spin_unlock_bh(&icmp6_dst_lock);
1158 }
1159
1160 static int ip6_dst_gc(struct dst_ops *ops)
1161 {
1162         unsigned long now = jiffies;
1163         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1164         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1165         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1166         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1167         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1168         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1169         int entries;
1170
1171         entries = dst_entries_get_fast(ops);
1172         if (time_after(rt_last_gc + rt_min_interval, now) &&
1173             entries <= rt_max_size)
1174                 goto out;
1175
1176         net->ipv6.ip6_rt_gc_expire++;
1177         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1178         net->ipv6.ip6_rt_last_gc = now;
1179         entries = dst_entries_get_slow(ops);
1180         if (entries < ops->gc_thresh)
1181                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1182 out:
1183         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1184         return entries > rt_max_size;
1185 }
1186
1187 /* Clean host part of a prefix. Not necessary in radix tree,
1188    but results in cleaner routing tables.
1189
1190    Remove it only when all the things will work!
1191  */
1192
1193 int ip6_dst_hoplimit(struct dst_entry *dst)
1194 {
1195         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1196         if (hoplimit == 0) {
1197                 struct net_device *dev = dst->dev;
1198                 struct inet6_dev *idev;
1199
1200                 rcu_read_lock();
1201                 idev = __in6_dev_get(dev);
1202                 if (idev)
1203                         hoplimit = idev->cnf.hop_limit;
1204                 else
1205                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1206                 rcu_read_unlock();
1207         }
1208         return hoplimit;
1209 }
1210 EXPORT_SYMBOL(ip6_dst_hoplimit);
1211
1212 /*
1213  *
1214  */
1215
/*
 * Create a route from the configuration in @cfg and insert it into the
 * FIB table selected by cfg->fc_table.
 *
 * Note that @cfg is modified: a zero fc_metric is replaced by
 * IP6_RT_PRIO_USER, an unspecified fc_protocol by RTPROT_BOOT, and
 * fc_nlinfo.nl_net is overwritten with the namespace of the device that
 * was finally chosen.
 *
 * Returns the result of __ip6_ins_rt() once the route has been fully
 * built, or a negative errno if validation or allocation fails first
 * (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM, -EHOSTUNREACH, or the error of
 * the neighbour lookup).  On those early failures the references taken
 * on the device and its inet6_dev are dropped and the half-built route
 * is freed.
 */
1216 int ip6_route_add(struct fib6_config *cfg)
1217 {
1218         int err;
1219         struct net *net = cfg->fc_nlinfo.nl_net;
1220         struct rt6_info *rt = NULL;
1221         struct net_device *dev = NULL;
1222         struct inet6_dev *idev = NULL;
1223         struct fib6_table *table;
1224         int addr_type;
1225
1226         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1227                 return -EINVAL;
1228 #ifndef CONFIG_IPV6_SUBTREES
             /* Source-specific routes are refused when subtrees are not built in. */
1229         if (cfg->fc_src_len)
1230                 return -EINVAL;
1231 #endif
             /* An explicit interface: take references on the device and its inet6_dev. */
1232         if (cfg->fc_ifindex) {
1233                 err = -ENODEV;
1234                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1235                 if (!dev)
1236                         goto out;
1237                 idev = in6_dev_get(dev);
1238                 if (!idev)
1239                         goto out;
1240         }
1241
1242         if (cfg->fc_metric == 0)
1243                 cfg->fc_metric = IP6_RT_PRIO_USER;
1244
1245         table = fib6_new_table(net, cfg->fc_table);
1246         if (table == NULL) {
1247                 err = -ENOBUFS;
1248                 goto out;
1249         }
1250
             /* Routes without RTF_ADDRCONF are allocated with DST_NOCOUNT. */
1251         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1252
1253         if (rt == NULL) {
1254                 err = -ENOMEM;
1255                 goto out;
1256         }
1257
1258         rt->dst.obsolete = -1;
1259         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1260                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1261                                 0;
1262
1263         if (cfg->fc_protocol == RTPROT_UNSPEC)
1264                 cfg->fc_protocol = RTPROT_BOOT;
1265         rt->rt6i_protocol = cfg->fc_protocol;
1266
1267         addr_type = ipv6_addr_type(&cfg->fc_dst);
1268
             /* Pick the input handler from the kind of destination. */
1269         if (addr_type & IPV6_ADDR_MULTICAST)
1270                 rt->dst.input = ip6_mc_input;
1271         else if (cfg->fc_flags & RTF_LOCAL)
1272                 rt->dst.input = ip6_input;
1273         else
1274                 rt->dst.input = ip6_forward;
1275
1276         rt->dst.output = ip6_output;
1277
1278         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1279         rt->rt6i_dst.plen = cfg->fc_dst_len;
1280         if (rt->rt6i_dst.plen == 128)
1281                rt->dst.flags |= DST_HOST;
1282
             /* Non-host routes that carry metrics get their own zeroed metrics array. */
1283         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1284                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1285                 if (!metrics) {
1286                         err = -ENOMEM;
1287                         goto out;
1288                 }
1289                 dst_init_metrics(&rt->dst, metrics, 0);
1290         }
1291 #ifdef CONFIG_IPV6_SUBTREES
1292         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1293         rt->rt6i_src.plen = cfg->fc_src_len;
1294 #endif
1295
1296         rt->rt6i_metric = cfg->fc_metric;
1297
1298         /* We cannot add true routes via loopback here,
1299            they would result in kernel looping; promote them to reject routes
1300          */
1301         if ((cfg->fc_flags & RTF_REJECT) ||
1302             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1303                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1304                 /* hold loopback dev/idev if we haven't done so. */
1305                 if (dev != net->loopback_dev) {
1306                         if (dev) {
1307                                 dev_put(dev);
1308                                 in6_dev_put(idev);
1309                         }
1310                         dev = net->loopback_dev;
1311                         dev_hold(dev);
1312                         idev = in6_dev_get(dev);
1313                         if (!idev) {
1314                                 err = -ENODEV;
1315                                 goto out;
1316                         }
1317                 }
1318                 rt->dst.output = ip6_pkt_discard_out;
1319                 rt->dst.input = ip6_pkt_discard;
1320                 rt->dst.error = -ENETUNREACH;
1321                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1322                 goto install_route;
1323         }
1324
1325         if (cfg->fc_flags & RTF_GATEWAY) {
1326                 const struct in6_addr *gw_addr;
1327                 int gwa_type;
1328
1329                 gw_addr = &cfg->fc_gateway;
1330                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1331                 gwa_type = ipv6_addr_type(gw_addr);
1332
1333                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1334                         struct rt6_info *grt;
1335
1336                         /* IPv6 strictly inhibits using not link-local
1337                            addresses as nexthop address.
1338                            Otherwise, router will not able to send redirects.
1339                            It is very good, but in some (rare!) circumstances
1340                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1341                            some exceptions. --ANK
1342                          */
1343                         err = -EINVAL;
1344                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1345                                 goto out;
1346
                             /* The gateway itself must be reachable through an existing route. */
1347                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1348
1349                         err = -EHOSTUNREACH;
1350                         if (grt == NULL)
1351                                 goto out;
1352                         if (dev) {
1353                                 if (dev != grt->rt6i_dev) {
1354                                         dst_release(&grt->dst);
1355                                         goto out;
1356                                 }
1357                         } else {
                                     /* No interface given: adopt the one the gateway route uses. */
1358                                 dev = grt->rt6i_dev;
1359                                 idev = grt->rt6i_idev;
1360                                 dev_hold(dev);
1361                                 in6_dev_hold(grt->rt6i_idev);
1362                         }
                             /* Accepted only if the route to the gateway is not itself gatewayed. */
1363                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1364                                 err = 0;
1365                         dst_release(&grt->dst);
1366
1367                         if (err)
1368                                 goto out;
1369                 }
1370                 err = -EINVAL;
1371                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1372                         goto out;
1373         }
1374
1375         err = -ENODEV;
1376         if (dev == NULL)
1377                 goto out;
1378
             /* A preferred source address must be configured on the chosen device. */
1379         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1380                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1381                         err = -EINVAL;
1382                         goto out;
1383                 }
1384                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1385                 rt->rt6i_prefsrc.plen = 128;
1386         } else
1387                 rt->rt6i_prefsrc.plen = 0;
1388
             /* Gateway and NONEXTHOP routes get a neighbour entry for rt6i_gateway attached. */
1389         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1390                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1391                 if (IS_ERR(n)) {
1392                         err = PTR_ERR(n);
1393                         goto out;
1394                 }
1395                 dst_set_neighbour(&rt->dst, n);
1396         }
1397
1398         rt->rt6i_flags = cfg->fc_flags;
1399
1400 install_route:
             /* Apply the metrics supplied as netlink attributes; attribute type 0 is skipped. */
1401         if (cfg->fc_mx) {
1402                 struct nlattr *nla;
1403                 int remaining;
1404
1405                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1406                         int type = nla_type(nla);
1407
1408                         if (type) {
1409                                 if (type > RTAX_MAX) {
1410                                         err = -EINVAL;
1411                                         goto out;
1412                                 }
1413
1414                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1415                         }
1416                 }
1417         }
1418
             /* The device and inet6_dev references held above are handed over to the route. */
1419         rt->dst.dev = dev;
1420         rt->rt6i_idev = idev;
1421         rt->rt6i_table = table;
1422
1423         cfg->fc_nlinfo.nl_net = dev_net(dev);
1424
1425         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1426
1427 out:
             /* Error path: undo whatever was acquired before the failure. */
1428         if (dev)
1429                 dev_put(dev);
1430         if (idev)
1431                 in6_dev_put(idev);
1432         if (rt)
1433                 dst_free(&rt->dst);
1434         return err;
1435 }
1436
1437 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1438 {
1439         int err;
1440         struct fib6_table *table;
1441         struct net *net = dev_net(rt->rt6i_dev);
1442
1443         if (rt == net->ipv6.ip6_null_entry) {
1444                 err = -ENOENT;
1445                 goto out;
1446         }
1447
1448         table = rt->rt6i_table;
1449         write_lock_bh(&table->tb6_lock);
1450         err = fib6_del(rt, info);
1451         write_unlock_bh(&table->tb6_lock);
1452
1453 out:
1454         dst_release(&rt->dst);
1455         return err;
1456 }
1457
1458 int ip6_del_rt(struct rt6_info *rt)
1459 {
1460         struct nl_info info = {
1461                 .nl_net = dev_net(rt->rt6i_dev),
1462         };
1463         return __ip6_del_rt(rt, &info);
1464 }
1465
1466 static int ip6_route_del(struct fib6_config *cfg)
1467 {
1468         struct fib6_table *table;
1469         struct fib6_node *fn;
1470         struct rt6_info *rt;
1471         int err = -ESRCH;
1472
1473         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1474         if (table == NULL)
1475                 return err;
1476
1477         read_lock_bh(&table->tb6_lock);
1478
1479         fn = fib6_locate(&table->tb6_root,
1480                          &cfg->fc_dst, cfg->fc_dst_len,
1481                          &cfg->fc_src, cfg->fc_src_len);
1482
1483         if (fn) {
1484                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1485                         if (cfg->fc_ifindex &&
1486                             (rt->rt6i_dev == NULL ||
1487                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1488                                 continue;
1489                         if (cfg->fc_flags & RTF_GATEWAY &&
1490                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1491                                 continue;
1492                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1493                                 continue;
1494                         dst_hold(&rt->dst);
1495                         read_unlock_bh(&table->tb6_lock);
1496
1497                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1498                 }
1499         }
1500         read_unlock_bh(&table->tb6_lock);
1501
1502         return err;
1503 }
1504
1505 /*
1506  *      Handle redirects
1507  */
/*
 * Flow key for redirect lookups: a regular flowi6 followed by the gateway
 * address that candidate routes are compared against.  fl6 has to remain
 * the first member, because __ip6_route_redirect() casts the flowi6
 * pointer it receives back to this type.
 */
1508 struct ip6rd_flowi {
1509         struct flowi6 fl6;
1510         struct in6_addr gateway;
1511 };
1512
/*
 * Lookup callback for redirect handling.  @fl6 is really the fl6 member
 * of a struct ip6rd_flowi; the routes of the matching fib node are
 * searched, under the table's read lock, for an unexpired gateway route
 * on the flow's outgoing interface whose gateway equals the one stored
 * in that structure.
 *
 * Returns the matching route, or the namespace's null entry when none is
 * found, with a reference held in either case.
 *
 * NOTE(review): BACKTRACK is a macro defined outside this view; it
 * presumably uses the restart and out labels and the rt/fn locals below
 * to retry the search at a parent node -- confirm against its definition.
 */
1513 static struct rt6_info *__ip6_route_redirect(struct net *net,
1514                                              struct fib6_table *table,
1515                                              struct flowi6 *fl6,
1516                                              int flags)
1517 {
1518         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1519         struct rt6_info *rt;
1520         struct fib6_node *fn;
1521
1522         /*
1523          * Get the "current" route for this destination and
1524          * check if the redirect has come from appropriate router.
1525          *
1526          * RFC 2461 specifies that redirects should only be
1527          * accepted if they come from the nexthop to the target.
1528          * Due to the way the routes are chosen, this notion
1529          * is a bit fuzzy and one might need to check all possible
1530          * routes.
1531          */
1532
1533         read_lock_bh(&table->tb6_lock);
1534         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1535 restart:
1536         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1537                 /*
1538                  * Current route is on-link; redirect is always invalid.
1539                  *
1540                  * Seems, previous statement is not true. It could
1541                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1542                  * But then router serving it might decide, that we should
1543                  * know truth 8)8) --ANK (980726).
1544                  */
1545                 if (rt6_check_expired(rt))
1546                         continue;
1547                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1548                         continue;
1549                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1550                         continue;
1551                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1552                         continue;
1553                 break;
1554         }
1555
             /* No candidate on this node: fall back to the null entry. */
1556         if (!rt)
1557                 rt = net->ipv6.ip6_null_entry;
1558         BACKTRACK(net, &fl6->saddr);
1559 out:
1560         dst_hold(&rt->dst);
1561
1562         read_unlock_bh(&table->tb6_lock);
1563
1564         return rt;
1565 };
1566
1567 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1568                                            const struct in6_addr *src,
1569                                            const struct in6_addr *gateway,
1570                                            struct net_device *dev)
1571 {
1572         int flags = RT6_LOOKUP_F_HAS_SADDR;
1573         struct net *net = dev_net(dev);
1574         struct ip6rd_flowi rdfl = {
1575                 .fl6 = {
1576                         .flowi6_oif = dev->ifindex,
1577                         .daddr = *dest,
1578                         .saddr = *src,
1579                 },
1580         };
1581
1582         ipv6_addr_copy(&rdfl.gateway, gateway);
1583
1584         if (rt6_need_strict(dest))
1585                 flags |= RT6_LOOKUP_F_IFACE;
1586
1587         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1588                                                    flags, __ip6_route_redirect);
1589 }
1590
1591 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1592                   const struct in6_addr *saddr,
1593                   struct neighbour *neigh, u8 *lladdr, int on_link)
1594 {
1595         struct rt6_info *rt, *nrt = NULL;
1596         struct netevent_redirect netevent;
1597         struct net *net = dev_net(neigh->dev);
1598
1599         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1600
1601         if (rt == net->ipv6.ip6_null_entry) {
1602                 if (net_ratelimit())
1603                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1604                                "for redirect target\n");
1605                 goto out;
1606         }
1607
1608         /*
1609          *      We have finally decided to accept it.
1610          */
1611
1612         neigh_update(neigh, lladdr, NUD_STALE,
1613                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1614                      NEIGH_UPDATE_F_OVERRIDE|
1615                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1616                                      NEIGH_UPDATE_F_ISROUTER))
1617                      );
1618
1619         /*
1620          * Redirect received -> path was valid.
1621          * Look, redirects are sent only in response to data packets,
1622          * so that this nexthop apparently is reachable. --ANK
1623          */
1624         dst_confirm(&rt->dst);
1625
1626         /* Duplicate redirect: silently ignore. */
1627         if (neigh == dst_get_neighbour_raw(&rt->dst))
1628                 goto out;
1629
1630         nrt = ip6_rt_copy(rt, dest);
1631         if (nrt == NULL)
1632                 goto out;
1633
1634         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1635         if (on_link)
1636                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1637
1638         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1639         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1640
1641         if (ip6_ins_rt(nrt))
1642                 goto out;
1643
1644         netevent.old = &rt->dst;
1645         netevent.new = &nrt->dst;
1646         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1647
1648         if (rt->rt6i_flags&RTF_CACHE) {
1649                 ip6_del_rt(rt);
1650                 return;
1651         }
1652
1653 out:
1654         dst_release(&rt->dst);
1655 }
1656
1657 /*
1658  *      Handle ICMP "packet too big" messages
1659  *      i.e. Path MTU discovery
1660  */
1661
1662 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1663                              struct net *net, u32 pmtu, int ifindex)
1664 {
1665         struct rt6_info *rt, *nrt;
1666         int allfrag = 0;
1667 again:
1668         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1669         if (rt == NULL)
1670                 return;
1671
1672         if (rt6_check_expired(rt)) {
1673                 ip6_del_rt(rt);
1674                 goto again;
1675         }
1676
1677         if (pmtu >= dst_mtu(&rt->dst))
1678                 goto out;
1679
1680         if (pmtu < IPV6_MIN_MTU) {
1681                 /*
1682                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1683                  * MTU (1280) and a fragment header should always be included
1684                  * after a node receiving Too Big message reporting PMTU is
1685                  * less than the IPv6 Minimum Link MTU.
1686                  */
1687                 pmtu = IPV6_MIN_MTU;
1688                 allfrag = 1;
1689         }
1690
1691         /* New mtu received -> path was valid.
1692            They are sent only in response to data packets,
1693            so that this nexthop apparently is reachable. --ANK
1694          */
1695         dst_confirm(&rt->dst);
1696
1697         /* Host route. If it is static, it would be better
1698            not to override it, but add new one, so that
1699            when cache entry will expire old pmtu
1700            would return automatically.
1701          */
1702         if (rt->rt6i_flags & RTF_CACHE) {
1703                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1704                 if (allfrag) {
1705                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1706                         features |= RTAX_FEATURE_ALLFRAG;
1707                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1708                 }
1709                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1710                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1711                 goto out;
1712         }
1713
1714         /* Network route.
1715            Two cases are possible:
1716            1. It is connected route. Action: COW
1717            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1718          */
1719         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1720                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1721         else
1722                 nrt = rt6_alloc_clone(rt, daddr);
1723
1724         if (nrt) {
1725                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1726                 if (allfrag) {
1727                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1728                         features |= RTAX_FEATURE_ALLFRAG;
1729                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1730                 }
1731
1732                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1733                  * happened within 5 mins, the recommended timer is 10 mins.
1734                  * Here this route expiration time is set to ip6_rt_mtu_expires
1735                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1736                  * and detecting PMTU increase will be automatically happened.
1737                  */
1738                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1739                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1740
1741                 ip6_ins_rt(nrt);
1742         }
1743 out:
1744         dst_release(&rt->dst);
1745 }
1746
1747 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1748                         struct net_device *dev, u32 pmtu)
1749 {
1750         struct net *net = dev_net(dev);
1751
1752         /*
1753          * RFC 1981 states that a node "MUST reduce the size of the packets it
1754          * is sending along the path" that caused the Packet Too Big message.
1755          * Since it's not possible in the general case to determine which
1756          * interface was used to send the original packet, we update the MTU
1757          * on the interface that will be used to send future packets. We also
1758          * update the MTU on the interface that received the Packet Too Big in
1759          * case the original packet was forced out that interface with
1760          * SO_BINDTODEVICE or similar. This is the next best thing to the
1761          * correct behaviour, which would be to update the MTU on all
1762          * interfaces.
1763          */
1764         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1765         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1766 }
1767
1768 /*
1769  *      Misc support functions
1770  */
1771
1772 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1773                                     const struct in6_addr *dest)
1774 {
1775         struct net *net = dev_net(ort->rt6i_dev);
1776         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1777                                             ort->dst.dev, 0);
1778
1779         if (rt) {
1780                 rt->dst.input = ort->dst.input;
1781                 rt->dst.output = ort->dst.output;
1782                 rt->dst.flags |= DST_HOST;
1783
1784                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1785                 rt->rt6i_dst.plen = 128;
1786                 dst_copy_metrics(&rt->dst, &ort->dst);
1787                 rt->dst.error = ort->dst.error;
1788                 rt->rt6i_idev = ort->rt6i_idev;
1789                 if (rt->rt6i_idev)
1790                         in6_dev_hold(rt->rt6i_idev);
1791                 rt->dst.lastuse = jiffies;
1792                 rt->rt6i_expires = 0;
1793
1794                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1795                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1796                 rt->rt6i_metric = 0;
1797
1798 #ifdef CONFIG_IPV6_SUBTREES
1799                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1800 #endif
1801                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1802                 rt->rt6i_table = ort->rt6i_table;
1803         }
1804         return rt;
1805 }
1806
1807 #ifdef CONFIG_IPV6_ROUTE_INFO
1808 static struct rt6_info *rt6_get_route_info(struct net *net,
1809                                            const struct in6_addr *prefix, int prefixlen,
1810                                            const struct in6_addr *gwaddr, int ifindex)
1811 {
1812         struct fib6_node *fn;
1813         struct rt6_info *rt = NULL;
1814         struct fib6_table *table;
1815
1816         table = fib6_get_table(net, RT6_TABLE_INFO);
1817         if (table == NULL)
1818                 return NULL;
1819
1820         write_lock_bh(&table->tb6_lock);
1821         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1822         if (!fn)
1823                 goto out;
1824
1825         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1826                 if (rt->rt6i_dev->ifindex != ifindex)
1827                         continue;
1828                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1829                         continue;
1830                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1831                         continue;
1832                 dst_hold(&rt->dst);
1833                 break;
1834         }
1835 out:
1836         write_unlock_bh(&table->tb6_lock);
1837         return rt;
1838 }
1839
1840 static struct rt6_info *rt6_add_route_info(struct net *net,
1841                                            const struct in6_addr *prefix, int prefixlen,
1842                                            const struct in6_addr *gwaddr, int ifindex,
1843                                            unsigned pref)
1844 {
1845         struct fib6_config cfg = {
1846                 .fc_table       = RT6_TABLE_INFO,
1847                 .fc_metric      = IP6_RT_PRIO_USER,
1848                 .fc_ifindex     = ifindex,
1849                 .fc_dst_len     = prefixlen,
1850                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1851                                   RTF_UP | RTF_PREF(pref),
1852                 .fc_nlinfo.pid = 0,
1853                 .fc_nlinfo.nlh = NULL,
1854                 .fc_nlinfo.nl_net = net,
1855         };
1856
1857         ipv6_addr_copy(&cfg.fc_dst, prefix);
1858         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1859
1860         /* We should treat it as a default route if prefix length is 0. */
1861         if (!prefixlen)
1862                 cfg.fc_flags |= RTF_DEFAULT;
1863
1864         ip6_route_add(&cfg);
1865
1866         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1867 }
1868 #endif
1869
1870 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1871 {
1872         struct rt6_info *rt;
1873         struct fib6_table *table;
1874
1875         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1876         if (table == NULL)
1877                 return NULL;
1878
1879         write_lock_bh(&table->tb6_lock);
1880         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1881                 if (dev == rt->rt6i_dev &&
1882                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1883                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1884                         break;
1885         }
1886         if (rt)
1887                 dst_hold(&rt->dst);
1888         write_unlock_bh(&table->tb6_lock);
1889         return rt;
1890 }
1891
1892 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1893                                      struct net_device *dev,
1894                                      unsigned int pref)
1895 {
1896         struct fib6_config cfg = {
1897                 .fc_table       = RT6_TABLE_DFLT,
1898                 .fc_metric      = IP6_RT_PRIO_USER,
1899                 .fc_ifindex     = dev->ifindex,
1900                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1901                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1902                 .fc_nlinfo.pid = 0,
1903                 .fc_nlinfo.nlh = NULL,
1904                 .fc_nlinfo.nl_net = dev_net(dev),
1905         };
1906
1907         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1908
1909         ip6_route_add(&cfg);
1910
1911         return rt6_get_dflt_router(gwaddr, dev);
1912 }
1913
1914 void rt6_purge_dflt_routers(struct net *net)
1915 {
1916         struct rt6_info *rt;
1917         struct fib6_table *table;
1918
1919         /* NOTE: Keep consistent with rt6_get_dflt_router */
1920         table = fib6_get_table(net, RT6_TABLE_DFLT);
1921         if (table == NULL)
1922                 return;
1923
1924 restart:
1925         read_lock_bh(&table->tb6_lock);
1926         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1927                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1928                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1929                         dst_hold(&rt->dst);
1930                         read_unlock_bh(&table->tb6_lock);
1931                         ip6_del_rt(rt);
1932                         goto restart;
1933                 }
1934         }
1935         read_unlock_bh(&table->tb6_lock);
1936 }
1937
1938 static void rtmsg_to_fib6_config(struct net *net,
1939                                  struct in6_rtmsg *rtmsg,
1940                                  struct fib6_config *cfg)
1941 {
1942         memset(cfg, 0, sizeof(*cfg));
1943
1944         cfg->fc_table = RT6_TABLE_MAIN;
1945         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1946         cfg->fc_metric = rtmsg->rtmsg_metric;
1947         cfg->fc_expires = rtmsg->rtmsg_info;
1948         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1949         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1950         cfg->fc_flags = rtmsg->rtmsg_flags;
1951
1952         cfg->fc_nlinfo.nl_net = net;
1953
1954         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1955         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1956         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1957 }
1958
1959 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1960 {
1961         struct fib6_config cfg;
1962         struct in6_rtmsg rtmsg;
1963         int err;
1964
1965         switch(cmd) {
1966         case SIOCADDRT:         /* Add a route */
1967         case SIOCDELRT:         /* Delete a route */
1968                 if (!capable(CAP_NET_ADMIN))
1969                         return -EPERM;
1970                 err = copy_from_user(&rtmsg, arg,
1971                                      sizeof(struct in6_rtmsg));
1972                 if (err)
1973                         return -EFAULT;
1974
1975                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1976
1977                 rtnl_lock();
1978                 switch (cmd) {
1979                 case SIOCADDRT:
1980                         err = ip6_route_add(&cfg);
1981                         break;
1982                 case SIOCDELRT:
1983                         err = ip6_route_del(&cfg);
1984                         break;
1985                 default:
1986                         err = -EINVAL;
1987                 }
1988                 rtnl_unlock();
1989
1990                 return err;
1991         }
1992
1993         return -EINVAL;
1994 }
1995
1996 /*
1997  *      Drop the packet on the floor
1998  */
1999
2000 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2001 {
2002         int type;
2003         struct dst_entry *dst = skb_dst(skb);
2004         switch (ipstats_mib_noroutes) {
2005         case IPSTATS_MIB_INNOROUTES:
2006                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2007                 if (type == IPV6_ADDR_ANY) {
2008                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2009                                       IPSTATS_MIB_INADDRERRORS);
2010                         break;
2011                 }
2012                 /* FALLTHROUGH */
2013         case IPSTATS_MIB_OUTNOROUTES:
2014                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2015                               ipstats_mib_noroutes);
2016                 break;
2017         }
2018         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2019         kfree_skb(skb);
2020         return 0;
2021 }
2022
2023 static int ip6_pkt_discard(struct sk_buff *skb)
2024 {
2025         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2026 }
2027
2028 static int ip6_pkt_discard_out(struct sk_buff *skb)
2029 {
2030         skb->dev = skb_dst(skb)->dev;
2031         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2032 }
2033
2034 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2035
2036 static int ip6_pkt_prohibit(struct sk_buff *skb)
2037 {
2038         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2039 }
2040
2041 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2042 {
2043         skb->dev = skb_dst(skb)->dev;
2044         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2045 }
2046
2047 #endif
2048
2049 /*
2050  *      Allocate a dst for local (unicast / anycast) address.
2051  */
2052
2053 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2054                                     const struct in6_addr *addr,
2055                                     int anycast)
2056 {
2057         struct net *net = dev_net(idev->dev);
2058         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2059                                             net->loopback_dev, DST_NOCOUNT);
2060         struct neighbour *neigh;
2061
2062         if (rt == NULL)
2063                 return ERR_PTR(-ENOMEM);
2064
2065         in6_dev_hold(idev);
2066
2067         rt->dst.flags |= DST_HOST;
2068         rt->dst.input = ip6_input;
2069         rt->dst.output = ip6_output;
2070         rt->rt6i_idev = idev;
2071         rt->dst.obsolete = -1;
2072
2073         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2074         if (anycast)
2075                 rt->rt6i_flags |= RTF_ANYCAST;
2076         else
2077                 rt->rt6i_flags |= RTF_LOCAL;
2078         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2079         if (IS_ERR(neigh)) {
2080                 dst_free(&rt->dst);
2081
2082                 return ERR_CAST(neigh);
2083         }
2084         dst_set_neighbour(&rt->dst, neigh);
2085
2086         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2087         rt->rt6i_dst.plen = 128;
2088         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2089
2090         atomic_set(&rt->dst.__refcnt, 1);
2091
2092         return rt;
2093 }
2094
2095 int ip6_route_get_saddr(struct net *net,
2096                         struct rt6_info *rt,
2097                         const struct in6_addr *daddr,
2098                         unsigned int prefs,
2099                         struct in6_addr *saddr)
2100 {
2101         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2102         int err = 0;
2103         if (rt->rt6i_prefsrc.plen)
2104                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2105         else
2106                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2107                                          daddr, prefs, saddr);
2108         return err;
2109 }
2110
/*
 * Argument bundle for fib6_remove_prefsrc(): used to strip a deleted
 * address from the prefsrc of matching routes.
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL = any */
	struct net *net;		/* namespace (for its null entry) */
	struct in6_addr *addr;		/* the address being removed */
};
2117
2118 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2119 {
2120         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2121         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2122         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2123
2124         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2125             rt != net->ipv6.ip6_null_entry &&
2126             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2127                 /* remove prefsrc entry */
2128                 rt->rt6i_prefsrc.plen = 0;
2129         }
2130         return 0;
2131 }
2132
2133 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2134 {
2135         struct net *net = dev_net(ifp->idev->dev);
2136         struct arg_dev_net_ip adni = {
2137                 .dev = ifp->idev->dev,
2138                 .net = net,
2139                 .addr = &ifp->addr,
2140         };
2141         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2142 }
2143
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL = any */
	struct net *net;		/* namespace (for its null entry) */
};
2148
2149 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2150 {
2151         const struct arg_dev_net *adn = arg;
2152         const struct net_device *dev = adn->dev;
2153
2154         if ((rt->rt6i_dev == dev || dev == NULL) &&
2155             rt != adn->net->ipv6.ip6_null_entry) {
2156                 RT6_TRACE("deleted by ifdown %p\n", rt);
2157                 return -1;
2158         }
2159         return 0;
2160 }
2161
2162 void rt6_ifdown(struct net *net, struct net_device *dev)
2163 {
2164         struct arg_dev_net adn = {
2165                 .dev = dev,
2166                 .net = net,
2167         };
2168
2169         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2170         icmp6_clean_all(fib6_ifdown, &adn);
2171 }
2172
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* the new MTU */
};
2178
2179 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2180 {
2181         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2182         struct inet6_dev *idev;
2183
2184         /* In IPv6 pmtu discovery is not optional,
2185            so that RTAX_MTU lock cannot disable it.
2186            We still use this lock to block changes
2187            caused by addrconf/ndisc.
2188         */
2189
2190         idev = __in6_dev_get(arg->dev);
2191         if (idev == NULL)
2192                 return 0;
2193
2194         /* For administrative MTU increase, there is no way to discover
2195            IPv6 PMTU increase, so PMTU increase should be updated here.
2196            Since RFC 1981 doesn't include administrative MTU increase
2197            update PMTU increase is a MUST. (i.e. jumbo frame)
2198          */
2199         /*
2200            If new MTU is less than route PMTU, this new MTU will be the
2201            lowest MTU in the path, update the route PMTU to reflect PMTU
2202            decreases; if new MTU is greater than route PMTU, and the
2203            old MTU is the lowest MTU in the path, update the route PMTU
2204            to reflect the increase. In this case if the other nodes' MTU
2205            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2206            PMTU discouvery.
2207          */
2208         if (rt->rt6i_dev == arg->dev &&
2209             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2210             (dst_mtu(&rt->dst) >= arg->mtu ||
2211              (dst_mtu(&rt->dst) < arg->mtu &&
2212               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2213                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2214         }
2215         return 0;
2216 }
2217
2218 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2219 {
2220         struct rt6_mtu_change_arg arg = {
2221                 .dev = dev,
2222                 .mtu = mtu,
2223         };
2224
2225         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2226 }
2227
2228 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2229         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2230         [RTA_OIF]               = { .type = NLA_U32 },
2231         [RTA_IIF]               = { .type = NLA_U32 },
2232         [RTA_PRIORITY]          = { .type = NLA_U32 },
2233         [RTA_METRICS]           = { .type = NLA_NESTED },
2234 };
2235
2236 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2237                               struct fib6_config *cfg)
2238 {
2239         struct rtmsg *rtm;
2240         struct nlattr *tb[RTA_MAX+1];
2241         int err;
2242
2243         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2244         if (err < 0)
2245                 goto errout;
2246
2247         err = -EINVAL;
2248         rtm = nlmsg_data(nlh);
2249         memset(cfg, 0, sizeof(*cfg));
2250
2251         cfg->fc_table = rtm->rtm_table;
2252         cfg->fc_dst_len = rtm->rtm_dst_len;
2253         cfg->fc_src_len = rtm->rtm_src_len;
2254         cfg->fc_flags = RTF_UP;
2255         cfg->fc_protocol = rtm->rtm_protocol;
2256
2257         if (rtm->rtm_type == RTN_UNREACHABLE)
2258                 cfg->fc_flags |= RTF_REJECT;
2259
2260         if (rtm->rtm_type == RTN_LOCAL)
2261                 cfg->fc_flags |= RTF_LOCAL;
2262
2263         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2264         cfg->fc_nlinfo.nlh = nlh;
2265         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2266
2267         if (tb[RTA_GATEWAY]) {
2268                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2269                 cfg->fc_flags |= RTF_GATEWAY;
2270         }
2271
2272         if (tb[RTA_DST]) {
2273                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2274
2275                 if (nla_len(tb[RTA_DST]) < plen)
2276                         goto errout;
2277
2278                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2279         }
2280
2281         if (tb[RTA_SRC]) {
2282                 int plen = (rtm->rtm_src_len + 7) >> 3;
2283
2284                 if (nla_len(tb[RTA_SRC]) < plen)
2285                         goto errout;
2286
2287                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2288         }
2289
2290         if (tb[RTA_PREFSRC])
2291                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2292
2293         if (tb[RTA_OIF])
2294                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2295
2296         if (tb[RTA_PRIORITY])
2297                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2298
2299         if (tb[RTA_METRICS]) {
2300                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2301                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2302         }
2303
2304         if (tb[RTA_TABLE])
2305                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2306
2307         err = 0;
2308 errout:
2309         return err;
2310 }
2311
2312 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2313 {
2314         struct fib6_config cfg;
2315         int err;
2316
2317         err = rtm_to_fib6_config(skb, nlh, &cfg);
2318         if (err < 0)
2319                 return err;
2320
2321         return ip6_route_del(&cfg);
2322 }
2323
2324 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2325 {
2326         struct fib6_config cfg;
2327         int err;
2328
2329         err = rtm_to_fib6_config(skb, nlh, &cfg);
2330         if (err < 0)
2331                 return err;
2332
2333         return ip6_route_add(&cfg);
2334 }
2335
2336 static inline size_t rt6_nlmsg_size(void)
2337 {
2338         return NLMSG_ALIGN(sizeof(struct rtmsg))
2339                + nla_total_size(16) /* RTA_SRC */
2340                + nla_total_size(16) /* RTA_DST */
2341                + nla_total_size(16) /* RTA_GATEWAY */
2342                + nla_total_size(16) /* RTA_PREFSRC */
2343                + nla_total_size(4) /* RTA_TABLE */
2344                + nla_total_size(4) /* RTA_IIF */
2345                + nla_total_size(4) /* RTA_OIF */
2346                + nla_total_size(4) /* RTA_PRIORITY */
2347                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2348                + nla_total_size(sizeof(struct rta_cacheinfo));
2349 }
2350
2351 static int rt6_fill_node(struct net *net,
2352                          struct sk_buff *skb, struct rt6_info *rt,
2353                          struct in6_addr *dst, struct in6_addr *src,
2354                          int iif, int type, u32 pid, u32 seq,
2355                          int prefix, int nowait, unsigned int flags)
2356 {
2357         struct rtmsg *rtm;
2358         struct nlmsghdr *nlh;
2359         long expires;
2360         u32 table;
2361         struct neighbour *n;
2362
2363         if (prefix) {   /* user wants prefix routes only */
2364                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2365                         /* success since this is not a prefix route */
2366                         return 1;
2367                 }
2368         }
2369
2370         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2371         if (nlh == NULL)
2372                 return -EMSGSIZE;
2373
2374         rtm = nlmsg_data(nlh);
2375         rtm->rtm_family = AF_INET6;
2376         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2377         rtm->rtm_src_len = rt->rt6i_src.plen;
2378         rtm->rtm_tos = 0;
2379         if (rt->rt6i_table)
2380                 table = rt->rt6i_table->tb6_id;
2381         else
2382                 table = RT6_TABLE_UNSPEC;
2383         rtm->rtm_table = table;
2384         NLA_PUT_U32(skb, RTA_TABLE, table);
2385         if (rt->rt6i_flags&RTF_REJECT)
2386                 rtm->rtm_type = RTN_UNREACHABLE;
2387         else if (rt->rt6i_flags&RTF_LOCAL)
2388                 rtm->rtm_type = RTN_LOCAL;
2389         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2390                 rtm->rtm_type = RTN_LOCAL;
2391         else
2392                 rtm->rtm_type = RTN_UNICAST;
2393         rtm->rtm_flags = 0;
2394         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2395         rtm->rtm_protocol = rt->rt6i_protocol;
2396         if (rt->rt6i_flags&RTF_DYNAMIC)
2397                 rtm->rtm_protocol = RTPROT_REDIRECT;
2398         else if (rt->rt6i_flags & RTF_ADDRCONF)
2399                 rtm->rtm_protocol = RTPROT_KERNEL;
2400         else if (rt->rt6i_flags&RTF_DEFAULT)
2401                 rtm->rtm_protocol = RTPROT_RA;
2402
2403         if (rt->rt6i_flags&RTF_CACHE)
2404                 rtm->rtm_flags |= RTM_F_CLONED;
2405
2406         if (dst) {
2407                 NLA_PUT(skb, RTA_DST, 16, dst);
2408                 rtm->rtm_dst_len = 128;
2409         } else if (rtm->rtm_dst_len)
2410                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2411 #ifdef CONFIG_IPV6_SUBTREES
2412         if (src) {
2413                 NLA_PUT(skb, RTA_SRC, 16, src);
2414                 rtm->rtm_src_len = 128;
2415         } else if (rtm->rtm_src_len)
2416                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2417 #endif
2418         if (iif) {
2419 #ifdef CONFIG_IPV6_MROUTE
2420                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2421                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2422                         if (err <= 0) {
2423                                 if (!nowait) {
2424                                         if (err == 0)
2425                                                 return 0;
2426                                         goto nla_put_failure;
2427                                 } else {
2428                                         if (err == -EMSGSIZE)
2429                                                 goto nla_put_failure;
2430                                 }
2431                         }
2432                 } else
2433 #endif
2434                         NLA_PUT_U32(skb, RTA_IIF, iif);
2435         } else if (dst) {
2436                 struct in6_addr saddr_buf;
2437                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2438                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2439         }
2440
2441         if (rt->rt6i_prefsrc.plen) {
2442                 struct in6_addr saddr_buf;
2443                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2444                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2445         }
2446
2447         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2448                 goto nla_put_failure;
2449
2450         rcu_read_lock();
2451         n = dst_get_neighbour(&rt->dst);
2452         if (n) {
2453                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2454                         rcu_read_unlock();
2455                         goto nla_put_failure;
2456                 }
2457         }
2458         rcu_read_unlock();
2459
2460         if (rt->dst.dev)
2461                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2462
2463         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2464
2465         if (!(rt->rt6i_flags & RTF_EXPIRES))
2466                 expires = 0;
2467         else if (rt->rt6i_expires - jiffies < INT_MAX)
2468                 expires = rt->rt6i_expires - jiffies;
2469         else
2470                 expires = INT_MAX;
2471
2472         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2473                                expires, rt->dst.error) < 0)
2474                 goto nla_put_failure;
2475
2476         return nlmsg_end(skb, nlh);
2477
2478 nla_put_failure:
2479         nlmsg_cancel(skb, nlh);
2480         return -EMSGSIZE;
2481 }
2482
2483 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2484 {
2485         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2486         int prefix;
2487
2488         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2489                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2490                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2491         } else
2492                 prefix = 0;
2493
2494         return rt6_fill_node(arg->net,
2495                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2496                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2497                      prefix, 0, NLM_F_MULTI);
2498 }
2499
2500 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2501 {
2502         struct net *net = sock_net(in_skb->sk);
2503         struct nlattr *tb[RTA_MAX+1];
2504         struct rt6_info *rt;
2505         struct sk_buff *skb;
2506         struct rtmsg *rtm;
2507         struct flowi6 fl6;
2508         int err, iif = 0;
2509
2510         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2511         if (err < 0)
2512                 goto errout;
2513
2514         err = -EINVAL;
2515         memset(&fl6, 0, sizeof(fl6));
2516
2517         if (tb[RTA_SRC]) {
2518                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2519                         goto errout;
2520
2521                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2522         }
2523
2524         if (tb[RTA_DST]) {
2525                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2526                         goto errout;
2527
2528                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2529         }
2530
2531         if (tb[RTA_IIF])
2532                 iif = nla_get_u32(tb[RTA_IIF]);
2533
2534         if (tb[RTA_OIF])
2535                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2536
2537         if (iif) {
2538                 struct net_device *dev;
2539                 dev = __dev_get_by_index(net, iif);
2540                 if (!dev) {
2541                         err = -ENODEV;
2542                         goto errout;
2543                 }
2544         }
2545
2546         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2547         if (skb == NULL) {
2548                 err = -ENOBUFS;
2549                 goto errout;
2550         }
2551
2552         /* Reserve room for dummy headers, this skb can pass
2553            through good chunk of routing engine.
2554          */
2555         skb_reset_mac_header(skb);
2556         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2557
2558         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2559         skb_dst_set(skb, &rt->dst);
2560
2561         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2562                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2563                             nlh->nlmsg_seq, 0, 0, 0);
2564         if (err < 0) {
2565                 kfree_skb(skb);
2566                 goto errout;
2567         }
2568
2569         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2570 errout:
2571         return err;
2572 }
2573
2574 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2575 {
2576         struct sk_buff *skb;
2577         struct net *net = info->nl_net;
2578         u32 seq;
2579         int err;
2580
2581         err = -ENOBUFS;
2582         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2583
2584         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2585         if (skb == NULL)
2586                 goto errout;
2587
2588         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2589                                 event, info->pid, seq, 0, 0, 0);
2590         if (err < 0) {
2591                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2592                 WARN_ON(err == -EMSGSIZE);
2593                 kfree_skb(skb);
2594                 goto errout;
2595         }
2596         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2597                     info->nlh, gfp_any());
2598         return;
2599 errout:
2600         if (err < 0)
2601                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2602 }
2603
2604 static int ip6_route_dev_notify(struct notifier_block *this,
2605                                 unsigned long event, void *data)
2606 {
2607         struct net_device *dev = (struct net_device *)data;
2608         struct net *net = dev_net(dev);
2609
2610         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2611                 net->ipv6.ip6_null_entry->dst.dev = dev;
2612                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2613 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2614                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2615                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2616                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2617                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2618 #endif
2619         }
2620
2621         return NOTIFY_OK;
2622 }
2623
2624 /*
2625  *      /proc
2626  */
2627
2628 #ifdef CONFIG_PROC_FS
2629
/*
 * Buffer/offset bookkeeping record for /proc output.
 * NOTE(review): nothing in the visible part of this file references this
 * structure -- confirm whether it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2638
2639 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2640 {
2641         struct seq_file *m = p_arg;
2642         struct neighbour *n;
2643
2644         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2645
2646 #ifdef CONFIG_IPV6_SUBTREES
2647         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2648 #else
2649         seq_puts(m, "00000000000000000000000000000000 00 ");
2650 #endif
2651         rcu_read_lock();
2652         n = dst_get_neighbour(&rt->dst);
2653         if (n) {
2654                 seq_printf(m, "%pi6", n->primary_key);
2655         } else {
2656                 seq_puts(m, "00000000000000000000000000000000");
2657         }
2658         rcu_read_unlock();
2659         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2660                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2661                    rt->dst.__use, rt->rt6i_flags,
2662                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2663         return 0;
2664 }
2665
2666 static int ipv6_route_show(struct seq_file *m, void *v)
2667 {
2668         struct net *net = (struct net *)m->private;
2669         fib6_clean_all(net, rt6_info_route, 0, m);
2670         return 0;
2671 }
2672
2673 static int ipv6_route_open(struct inode *inode, struct file *file)
2674 {
2675         return single_open_net(inode, file, ipv6_route_show);
2676 }
2677
2678 static const struct file_operations ipv6_route_proc_fops = {
2679         .owner          = THIS_MODULE,
2680         .open           = ipv6_route_open,
2681         .read           = seq_read,
2682         .llseek         = seq_lseek,
2683         .release        = single_release_net,
2684 };
2685
2686 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2687 {
2688         struct net *net = (struct net *)seq->private;
2689         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2690                    net->ipv6.rt6_stats->fib_nodes,
2691                    net->ipv6.rt6_stats->fib_route_nodes,
2692                    net->ipv6.rt6_stats->fib_rt_alloc,
2693                    net->ipv6.rt6_stats->fib_rt_entries,
2694                    net->ipv6.rt6_stats->fib_rt_cache,
2695                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2696                    net->ipv6.rt6_stats->fib_discarded_routes);
2697
2698         return 0;
2699 }
2700
2701 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2702 {
2703         return single_open_net(inode, file, rt6_stats_seq_show);
2704 }
2705
2706 static const struct file_operations rt6_stats_seq_fops = {
2707         .owner   = THIS_MODULE,
2708         .open    = rt6_stats_seq_open,
2709         .read    = seq_read,
2710         .llseek  = seq_lseek,
2711         .release = single_release_net,
2712 };
2713 #endif  /* CONFIG_PROC_FS */
2714
2715 #ifdef CONFIG_SYSCTL
2716
2717 static
2718 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2719                               void __user *buffer, size_t *lenp, loff_t *ppos)
2720 {
2721         struct net *net;
2722         int delay;
2723         if (!write)
2724                 return -EINVAL;
2725
2726         net = (struct net *)ctl->extra1;
2727         delay = net->ipv6.sysctl.flush_delay;
2728         proc_dointvec(ctl, write, buffer, lenp, ppos);
2729         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2730         return 0;
2731 }
2732
2733 ctl_table ipv6_route_table_template[] = {
2734         {
2735                 .procname       =       "flush",
2736                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2737                 .maxlen         =       sizeof(int),
2738                 .mode           =       0200,
2739                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2740         },
2741         {
2742                 .procname       =       "gc_thresh",
2743                 .data           =       &ip6_dst_ops_template.gc_thresh,
2744                 .maxlen         =       sizeof(int),
2745                 .mode           =       0644,
2746                 .proc_handler   =       proc_dointvec,
2747         },
2748         {
2749                 .procname       =       "max_size",
2750                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2751                 .maxlen         =       sizeof(int),
2752                 .mode           =       0644,
2753                 .proc_handler   =       proc_dointvec,
2754         },
2755         {
2756                 .procname       =       "gc_min_interval",
2757                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2758                 .maxlen         =       sizeof(int),
2759                 .mode           =       0644,
2760                 .proc_handler   =       proc_dointvec_jiffies,
2761         },
2762         {
2763                 .procname       =       "gc_timeout",
2764                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2765                 .maxlen         =       sizeof(int),
2766                 .mode           =       0644,
2767                 .proc_handler   =       proc_dointvec_jiffies,
2768         },
2769         {
2770                 .procname       =       "gc_interval",
2771                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2772                 .maxlen         =       sizeof(int),
2773                 .mode           =       0644,
2774                 .proc_handler   =       proc_dointvec_jiffies,
2775         },
2776         {
2777                 .procname       =       "gc_elasticity",
2778                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2779                 .maxlen         =       sizeof(int),
2780                 .mode           =       0644,
2781                 .proc_handler   =       proc_dointvec,
2782         },
2783         {
2784                 .procname       =       "mtu_expires",
2785                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2786                 .maxlen         =       sizeof(int),
2787                 .mode           =       0644,
2788                 .proc_handler   =       proc_dointvec_jiffies,
2789         },
2790         {
2791                 .procname       =       "min_adv_mss",
2792                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2793                 .maxlen         =       sizeof(int),
2794                 .mode           =       0644,
2795                 .proc_handler   =       proc_dointvec,
2796         },
2797         {
2798                 .procname       =       "gc_min_interval_ms",
2799                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2800                 .maxlen         =       sizeof(int),
2801                 .mode           =       0644,
2802                 .proc_handler   =       proc_dointvec_ms_jiffies,
2803         },
2804         { }
2805 };
2806
2807 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2808 {
2809         struct ctl_table *table;
2810
2811         table = kmemdup(ipv6_route_table_template,
2812                         sizeof(ipv6_route_table_template),
2813                         GFP_KERNEL);
2814
2815         if (table) {
2816                 table[0].data = &net->ipv6.sysctl.flush_delay;
2817                 table[0].extra1 = net;
2818                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2819                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2820                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2821                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2822                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2823                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2824                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2825                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2826                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2827         }
2828
2829         return table;
2830 }
2831 #endif
2832
2833 static int __net_init ip6_route_net_init(struct net *net)
2834 {
2835         int ret = -ENOMEM;
2836
2837         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2838                sizeof(net->ipv6.ip6_dst_ops));
2839
2840         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2841                 goto out_ip6_dst_ops;
2842
2843         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2844                                            sizeof(*net->ipv6.ip6_null_entry),
2845                                            GFP_KERNEL);
2846         if (!net->ipv6.ip6_null_entry)
2847                 goto out_ip6_dst_entries;
2848         net->ipv6.ip6_null_entry->dst.path =
2849                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2850         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2851         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2852                          ip6_template_metrics, true);
2853
2854 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2855         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2856                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2857                                                GFP_KERNEL);
2858         if (!net->ipv6.ip6_prohibit_entry)
2859                 goto out_ip6_null_entry;
2860         net->ipv6.ip6_prohibit_entry->dst.path =
2861                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2862         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2863         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2864                          ip6_template_metrics, true);
2865
2866         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2867                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2868                                                GFP_KERNEL);
2869         if (!net->ipv6.ip6_blk_hole_entry)
2870                 goto out_ip6_prohibit_entry;
2871         net->ipv6.ip6_blk_hole_entry->dst.path =
2872                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2873         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2874         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2875                          ip6_template_metrics, true);
2876 #endif
2877
2878         net->ipv6.sysctl.flush_delay = 0;
2879         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2880         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2881         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2882         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2883         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2884         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2885         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2886
2887         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2888
2889         ret = 0;
2890 out:
2891         return ret;
2892
2893 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2894 out_ip6_prohibit_entry:
2895         kfree(net->ipv6.ip6_prohibit_entry);
2896 out_ip6_null_entry:
2897         kfree(net->ipv6.ip6_null_entry);
2898 #endif
2899 out_ip6_dst_entries:
2900         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2901 out_ip6_dst_ops:
2902         goto out;
2903 }
2904
2905 static void __net_exit ip6_route_net_exit(struct net *net)
2906 {
2907         kfree(net->ipv6.ip6_null_entry);
2908 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2909         kfree(net->ipv6.ip6_prohibit_entry);
2910         kfree(net->ipv6.ip6_blk_hole_entry);
2911 #endif
2912         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2913 }
2914
2915 static int __net_init ip6_route_net_init_late(struct net *net)
2916 {
2917 #ifdef CONFIG_PROC_FS
2918         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2919         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2920 #endif
2921         return 0;
2922 }
2923
2924 static void __net_exit ip6_route_net_exit_late(struct net *net)
2925 {
2926 #ifdef CONFIG_PROC_FS
2927         proc_net_remove(net, "ipv6_route");
2928         proc_net_remove(net, "rt6_stats");
2929 #endif
2930 }
2931
2932 static struct pernet_operations ip6_route_net_ops = {
2933         .init = ip6_route_net_init,
2934         .exit = ip6_route_net_exit,
2935 };
2936
2937 static struct pernet_operations ip6_route_net_late_ops = {
2938         .init = ip6_route_net_init_late,
2939         .exit = ip6_route_net_exit_late,
2940 };
2941
2942 static struct notifier_block ip6_route_dev_notifier = {
2943         .notifier_call = ip6_route_dev_notify,
2944         .priority = 0,
2945 };
2946
2947 int __init ip6_route_init(void)
2948 {
2949         int ret;
2950
2951         ret = -ENOMEM;
2952         ip6_dst_ops_template.kmem_cachep =
2953                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2954                                   SLAB_HWCACHE_ALIGN, NULL);
2955         if (!ip6_dst_ops_template.kmem_cachep)
2956                 goto out;
2957
2958         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2959         if (ret)
2960                 goto out_kmem_cache;
2961
2962         ret = register_pernet_subsys(&ip6_route_net_ops);
2963         if (ret)
2964                 goto out_dst_entries;
2965
2966         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2967
2968         /* Registering of the loopback is done before this portion of code,
2969          * the loopback reference in rt6_info will not be taken, do it
2970          * manually for init_net */
2971         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2972         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2973   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2974         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2975         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2977         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2978   #endif
2979         ret = fib6_init();
2980         if (ret)
2981                 goto out_register_subsys;
2982
2983         ret = xfrm6_init();
2984         if (ret)
2985                 goto out_fib6_init;
2986
2987         ret = fib6_rules_init();
2988         if (ret)
2989                 goto xfrm6_init;
2990
2991         ret = register_pernet_subsys(&ip6_route_net_late_ops);
2992         if (ret)
2993                 goto fib6_rules_init;
2994
2995         ret = -ENOBUFS;
2996         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2997             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2998             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2999                 goto out_register_late_subsys;
3000
3001         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3002         if (ret)
3003                 goto out_register_late_subsys;
3004
3005 out:
3006         return ret;
3007
3008 out_register_late_subsys:
3009         unregister_pernet_subsys(&ip6_route_net_late_ops);
3010 fib6_rules_init:
3011         fib6_rules_cleanup();
3012 xfrm6_init:
3013         xfrm6_fini();
3014 out_fib6_init:
3015         fib6_gc_cleanup();
3016 out_register_subsys:
3017         unregister_pernet_subsys(&ip6_route_net_ops);
3018 out_dst_entries:
3019         dst_entries_destroy(&ip6_dst_blackhole_ops);
3020 out_kmem_cache:
3021         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3022         goto out;
3023 }
3024
3025 void ip6_route_cleanup(void)
3026 {
3027         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3028         unregister_pernet_subsys(&ip6_route_net_late_ops);
3029         fib6_rules_cleanup();
3030         xfrm6_fini();
3031         fib6_gc_cleanup();
3032         unregister_pernet_subsys(&ip6_route_net_ops);
3033         dst_entries_destroy(&ip6_dst_blackhole_ops);
3034         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3035 }