gro: Disable frag0 optimization on IPv6 ext headers
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Compile-time debug level for this file.  RDBG() and RT6_TRACE() only
 * produce output when RT6_DEBUG is at least 3.
 */
65 /* Set to 3 to get tracing. */
66 #define RT6_DEBUG 2
67
68 #if RT6_DEBUG >= 3
/* Tracing on: RDBG(x) pastes x directly after printk, so x must be a
 * parenthesised argument list; RT6_TRACE() prefixes KERN_DEBUG. */
69 #define RDBG(x) printk x
70 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
71 #else
/* Tracing off: RDBG() expands to nothing, RT6_TRACE() to an empty statement. */
72 #define RDBG(x)
73 #define RT6_TRACE(x...) do { ; } while (0)
74 #endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return dst_cow_metrics_generic(dst, old);
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 0,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
206 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
207
208 static int ip6_pkt_prohibit(struct sk_buff *skb);
209 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
210
211 static struct rt6_info ip6_prohibit_entry_template = {
212         .dst = {
213                 .__refcnt       = ATOMIC_INIT(1),
214                 .__use          = 1,
215                 .obsolete       = -1,
216                 .error          = -EACCES,
217                 .input          = ip6_pkt_prohibit,
218                 .output         = ip6_pkt_prohibit_out,
219         },
220         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
221         .rt6i_protocol  = RTPROT_KERNEL,
222         .rt6i_metric    = ~(u32) 0,
223         .rt6i_ref       = ATOMIC_INIT(1),
224 };
225
226 static struct rt6_info ip6_blk_hole_entry_template = {
227         .dst = {
228                 .__refcnt       = ATOMIC_INIT(1),
229                 .__use          = 1,
230                 .obsolete       = -1,
231                 .error          = -EINVAL,
232                 .input          = dst_discard,
233                 .output         = dst_discard,
234         },
235         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
236         .rt6i_protocol  = RTPROT_KERNEL,
237         .rt6i_metric    = ~(u32) 0,
238         .rt6i_ref       = ATOMIC_INIT(1),
239 };
240
241 #endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
/*
 * rt6_device_match - choose the entry of a route list that fits the
 * requested interface or source address.
 *
 * Walks @rt and its dst.rt6_next chain:
 *  - with @oif set, the first entry whose device has ifindex @oif is
 *    returned; loopback-device entries are remembered in 'local' as a
 *    fallback;
 *  - with @oif unset, the first entry for which ipv6_chk_addr() accepts
 *    @saddr on the entry's device is returned.
 *
 * When neither @oif nor a non-any @saddr is supplied, @rt is returned
 * untouched.  When nothing matched: with @oif set, 'local' is returned
 * if one was found, otherwise net->ipv6.ip6_null_entry when
 * RT6_LOOKUP_F_IFACE is in @flags; in all remaining cases @rt.
 */
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
            /* Nothing to match against: keep the head of the list. */
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
                                    /* Loopback entry whose inet6_dev is missing or
                                     * belongs to a different interface than oif. */
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
                                            /* "&& oif" is always true here: we are
                                             * inside the if (oif) branch. */
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
                                            /* "!oif" is always false here, so this keeps
                                             * an earlier 'local' only when its inet6_dev
                                             * is on oif. */
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
/*
 * rt6_probe - send a Neighbour Solicitation towards a route's neighbour
 * when that neighbour is not in a NUD_VALID state and the per-device
 * rtr_probe_interval has elapsed since neigh->updated.
 *
 * @rt may be NULL, in which case nothing is done.  Without
 * CONFIG_IPV6_ROUTER_PREF the function is an empty inline stub.
 */
375 #ifdef CONFIG_IPV6_ROUTER_PREF
376 static void rt6_probe(struct rt6_info *rt)
377 {
378         struct neighbour *neigh;
379         /*
380          * Okay, this does not seem to be appropriate
381          * for now, however, we need to check if it
382          * is really so; aka Router Reachability Probing.
383          *
384          * Router Reachability Probe MUST be rate-limited
385          * to no more than one per minute.
386          */
387         rcu_read_lock();
388         neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
            /* Unlocked fast path: no neighbour, or already valid. */
389         if (!neigh || (neigh->nud_state & NUD_VALID))
390                 goto out;
391         read_lock_bh(&neigh->lock);
            /* Re-check the state under the lock and apply the rate limit. */
392         if (!(neigh->nud_state & NUD_VALID) &&
393             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394                 struct in6_addr mcaddr;
395                 struct in6_addr *target;
396
                    /* NOTE(review): neigh->updated is written while only the
                     * read side of neigh->lock is held -- confirm whether a
                     * write lock is required here. */
397                 neigh->updated = jiffies;
398                 read_unlock_bh(&neigh->lock);
399
                    /* Solicit the neighbour's key address via its
                     * solicited-node multicast address. */
400                 target = (struct in6_addr *)&neigh->primary_key;
401                 addrconf_addr_solict_mult(target, &mcaddr);
402                 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
403         } else {
404                 read_unlock_bh(&neigh->lock);
405         }
406 out:
407         rcu_read_unlock();
408 }
409 #else
410 static inline void rt6_probe(struct rt6_info *rt)
411 {
412 }
413 #endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
/*
 * rt6_route_rcv - process a route information option.
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address to associate with the route
 *
 * Validates the option, then adds, refreshes or deletes the matching
 * route depending on the advertised lifetime.  Returns 0 on success (a
 * failed rt6_add_route_info() is not reported) and -EINVAL for a
 * malformed option.
 */
549 #ifdef CONFIG_IPV6_ROUTE_INFO
550 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
551                   const struct in6_addr *gwaddr)
552 {
553         struct net *net = dev_net(dev);
554         struct route_info *rinfo = (struct route_info *) opt;
555         struct in6_addr prefix_buf, *prefix;
556         unsigned int pref;
557         unsigned long lifetime;
558         struct rt6_info *rt;
559
560         if (len < sizeof(struct route_info)) {
561                 return -EINVAL;
562         }
563
564         /* Sanity check for prefix_len and length */
            /* length must be at most 3; prefixes longer than 64 bits need
             * length >= 2 and non-empty prefixes need length >= 1. */
565         if (rinfo->length > 3) {
566                 return -EINVAL;
567         } else if (rinfo->prefix_len > 128) {
568                 return -EINVAL;
569         } else if (rinfo->prefix_len > 64) {
570                 if (rinfo->length < 2) {
571                         return -EINVAL;
572                 }
573         } else if (rinfo->prefix_len > 0) {
574                 if (rinfo->length < 1) {
575                         return -EINVAL;
576                 }
577         }
578
579         pref = rinfo->route_pref;
580         if (pref == ICMPV6_ROUTER_PREF_INVALID)
581                 return -EINVAL;
582
583         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
584
            /* With length 3 the prefix field is used in place; otherwise
             * a copy truncated to prefix_len bits is built in prefix_buf. */
585         if (rinfo->length == 3)
586                 prefix = (struct in6_addr *)rinfo->prefix;
587         else {
588                 /* this function is safe */
589                 ipv6_addr_prefix(&prefix_buf,
590                                  (struct in6_addr *)rinfo->prefix,
591                                  rinfo->prefix_len);
592                 prefix = &prefix_buf;
593         }
594
            /* A zero-length prefix refers to the default router entry. */
595         if (rinfo->prefix_len == 0)
596                 rt = rt6_get_dflt_router(gwaddr, dev);
597         else
598                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
599                                         gwaddr, dev->ifindex);
600
            /* Zero lifetime: remove an existing route. */
601         if (rt && !lifetime) {
602                 ip6_del_rt(rt);
603                 rt = NULL;
604         }
605
            /* Otherwise create the route if missing, or refresh the
             * preference bits of the existing one. */
606         if (!rt && lifetime)
607                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
608                                         pref);
609         else if (rt)
610                 rt->rt6i_flags = RTF_ROUTEINFO |
611                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
612
613         if (rt) {
                    /* Non-finite lifetime clears RTF_EXPIRES; a finite one
                     * sets the expiry lifetime seconds from now. */
614                 if (!addrconf_finite_timeout(lifetime)) {
615                         rt->rt6i_flags &= ~RTF_EXPIRES;
616                 } else {
617                         rt->rt6i_expires = jiffies + HZ * lifetime;
618                         rt->rt6i_flags |= RTF_EXPIRES;
619                 }
620                 dst_release(&rt->dst);
621         }
622         return 0;
623 }
624 #endif
625
/*
 * BACKTRACK - climb the fib6 tree when a lookup yielded the null entry.
 *
 * Not hygienic: expects local variables 'rt' and 'fn' plus labels 'out'
 * and 'restart' in the calling function.  If rt is __net's
 * ip6_null_entry, fn is moved to its parent -- or, when the parent has
 * a subtree that is not fn itself, to the result of fib6_lookup() in
 * that subtree using saddr -- until a node with RTN_RTINFO is found
 * (goto restart) or a node flagged RTN_TL_ROOT is reached (goto out).
 */
626 #define BACKTRACK(__net, saddr)                 \
627 do { \
628         if (rt == __net->ipv6.ip6_null_entry) { \
629                 struct fib6_node *pn; \
630                 while (1) { \
631                         if (fn->fn_flags & RTN_TL_ROOT) \
632                                 goto out; \
633                         pn = fn->parent; \
634                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
635                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
636                         else \
637                                 fn = pn; \
638                         if (fn->fn_flags & RTN_RTINFO) \
639                                 goto restart; \
640                 } \
641         } \
642 } while(0)
643
644 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
645                                              struct fib6_table *table,
646                                              struct flowi6 *fl6, int flags)
647 {
648         struct fib6_node *fn;
649         struct rt6_info *rt;
650
651         read_lock_bh(&table->tb6_lock);
652         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
653 restart:
654         rt = fn->leaf;
655         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
656         BACKTRACK(net, &fl6->saddr);
657 out:
658         dst_use(&rt->dst, jiffies);
659         read_unlock_bh(&table->tb6_lock);
660         return rt;
661
662 }
663
664 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
665                             const struct in6_addr *saddr, int oif, int strict)
666 {
667         struct flowi6 fl6 = {
668                 .flowi6_oif = oif,
669                 .daddr = *daddr,
670         };
671         struct dst_entry *dst;
672         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
673
674         if (saddr) {
675                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
676                 flags |= RT6_LOOKUP_F_HAS_SADDR;
677         }
678
679         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
680         if (dst->error == 0)
681                 return (struct rt6_info *) dst;
682
683         dst_release(dst);
684
685         return NULL;
686 }
687
688 EXPORT_SYMBOL(rt6_lookup);
689
690 /* ip6_ins_rt is called with FREE table->tb6_lock.
691    It takes new route entry, the addition fails by any reason the
692    route is freed. In any case, if caller does not hold it, it may
693    be destroyed.
694  */
695
696 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
697 {
698         int err;
699         struct fib6_table *table;
700
701         table = rt->rt6i_table;
702         write_lock_bh(&table->tb6_lock);
703         err = fib6_add(&table->tb6_root, rt, info);
704         write_unlock_bh(&table->tb6_lock);
705
706         return err;
707 }
708
709 int ip6_ins_rt(struct rt6_info *rt)
710 {
711         struct nl_info info = {
712                 .nl_net = dev_net(rt->rt6i_dev),
713         };
714         return __ip6_ins_rt(rt, &info);
715 }
716
/*
 * rt6_alloc_cow - clone @ort into an RTF_CACHE entry for @daddr and bind
 * a neighbour to the clone.
 *
 * Returns the clone, or NULL when ip6_rt_copy() fails or no neighbour
 * entry could be obtained (the clone is freed in the latter case).
 */
717 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
718                                       const struct in6_addr *daddr,
719                                       const struct in6_addr *saddr)
720 {
721         struct rt6_info *rt;
722
723         /*
724          *      Clone the route.
725          */
726
727         rt = ip6_rt_copy(ort, daddr);
728
729         if (rt) {
730                 struct neighbour *neigh;
                    /* One forced-GC retry is allowed, but only outside
                     * softirq context. */
731                 int attempts = !in_softirq();
732
                    /* Non-gateway route: the destination itself becomes the
                     * next hop; it is marked anycast when it equals the
                     * address of a non-/128 original route. */
733                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
734                         if (ort->rt6i_dst.plen != 128 &&
735                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
736                                 rt->rt6i_flags |= RTF_ANYCAST;
737                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
738                 }
739
740                 rt->rt6i_flags |= RTF_CACHE;
741
742 #ifdef CONFIG_IPV6_SUBTREES
                    /* Source-specific route: pin the clone to the exact
                     * source address. */
743                 if (rt->rt6i_src.plen && saddr) {
744                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
745                         rt->rt6i_src.plen = 128;
746                 }
747 #endif
748
749         retry:
750                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
751                 if (IS_ERR(neigh)) {
752                         struct net *net = dev_net(rt->rt6i_dev);
753                         int saved_rt_min_interval =
754                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
755                         int saved_rt_elasticity =
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
757
758                         if (attempts-- > 0) {
                                    /* Temporarily override the GC sysctls, run
                                     * the dst garbage collector once, restore
                                     * the saved values and try again. */
759                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
760                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
761
762                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
763
764                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
765                                         saved_rt_elasticity;
766                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
767                                         saved_rt_min_interval;
768                                 goto retry;
769                         }
770
771                         if (net_ratelimit())
772                                 printk(KERN_WARNING
773                                        "ipv6: Neighbour table overflow.\n");
774                         dst_free(&rt->dst);
775                         return NULL;
776                 }
777                 dst_set_neighbour(&rt->dst, neigh);
778
779         }
780
781         return rt;
782 }
783
784 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
785                                         const struct in6_addr *daddr)
786 {
787         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
788
789         if (rt) {
790                 rt->rt6i_flags |= RTF_CACHE;
791                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
792         }
793         return rt;
794 }
795
/*
 * ip6_pol_route - common lookup for the input and output paths.
 * @oif:   interface index used for route selection
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is used here
 * @input: true for the input path (adds RTF_LOCAL to the set of flags
 *         that prevent a copy-on-write clone)
 *
 * Selects a route under the table read lock.  If the result is neither
 * the null entry nor an RTF_CACHE entry, a per-destination clone is
 * created (rt6_alloc_cow() or rt6_alloc_clone()) and inserted; when the
 * insertion fails the lookup is redone, up to three attempts in total.
 *
 * Returns a route with a reference held and lastuse/__use updated; this
 * may be the namespace's ip6_null_entry.
 */
796 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
797                                       struct flowi6 *fl6, int flags, bool input)
798 {
799         struct fib6_node *fn;
800         struct rt6_info *rt, *nrt;
801         int strict = 0;
802         int attempts = 3;
803         int err;
            /* Reachability is only demanded on the first pass, and only when
             * devconf_all->forwarding is off. */
804         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
805         int local = RTF_NONEXTHOP;
806
807         strict |= flags & RT6_LOOKUP_F_IFACE;
808         if (input)
809                 local |= RTF_LOCAL;
810
811 relookup:
812         read_lock_bh(&table->tb6_lock);
813
814 restart_2:
815         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
816
817 restart:
818         rt = rt6_select(fn, oif, strict | reachable);
819
            /* BACKTRACK() may jump to 'restart' or 'out'. */
820         BACKTRACK(net, &fl6->saddr);
821         if (rt == net->ipv6.ip6_null_entry ||
822             rt->rt6i_flags & RTF_CACHE)
823                 goto out;
824
            /* Pin the route, then drop the lock for the clone allocation. */
825         dst_hold(&rt->dst);
826         read_unlock_bh(&table->tb6_lock);
827
            /* No neighbour and none of the 'local' flags: full COW clone.
             * Otherwise a plain clone for non-host routes; host routes are
             * returned as they are. */
828         if (!dst_get_neighbour_raw(&rt->dst)
829             && !(rt->rt6i_flags & local))
830                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
831         else if (!(rt->dst.flags & DST_HOST))
832                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
833         else
834                 goto out2;
835
            /* Swap the reference over to the clone, or to the null entry
             * when cloning failed. */
836         dst_release(&rt->dst);
837         rt = nrt ? : net->ipv6.ip6_null_entry;
838
839         dst_hold(&rt->dst);
840         if (nrt) {
841                 err = ip6_ins_rt(nrt);
842                 if (!err)
843                         goto out2;
844         }
845
846         if (--attempts <= 0)
847                 goto out2;
848
849         /*
850          * Race condition! In the gap, when table->tb6_lock was
851          * released someone could insert this route.  Relookup.
852          */
853         dst_release(&rt->dst);
854         goto relookup;
855
856 out:
            /* Reached with the read lock still held.  If the reachable
             * requirement was active, the lookup is redone once without it --
             * this happens for RTF_CACHE hits as well as for the null entry. */
857         if (reachable) {
858                 reachable = 0;
859                 goto restart_2;
860         }
861         dst_hold(&rt->dst);
862         read_unlock_bh(&table->tb6_lock);
863 out2:
864         rt->dst.lastuse = jiffies;
865         rt->dst.__use++;
866
867         return rt;
868 }
869
870 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
871                                             struct flowi6 *fl6, int flags)
872 {
873         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags, true);
874 }
875
876 void ip6_route_input(struct sk_buff *skb)
877 {
878         const struct ipv6hdr *iph = ipv6_hdr(skb);
879         struct net *net = dev_net(skb->dev);
880         int flags = RT6_LOOKUP_F_HAS_SADDR;
881         struct flowi6 fl6 = {
882                 .flowi6_iif = skb->dev->ifindex,
883                 .daddr = iph->daddr,
884                 .saddr = iph->saddr,
885                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
886                 .flowi6_mark = skb->mark,
887                 .flowi6_proto = iph->nexthdr,
888         };
889
890         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
891                 flags |= RT6_LOOKUP_F_IFACE;
892
893         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
894 }
895
896 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
897                                              struct flowi6 *fl6, int flags)
898 {
899         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags, false);
900 }
901
902 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
903                                     struct flowi6 *fl6)
904 {
905         int flags = 0;
906
907         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
908                 flags |= RT6_LOOKUP_F_IFACE;
909
910         if (!ipv6_addr_any(&fl6->saddr))
911                 flags |= RT6_LOOKUP_F_HAS_SADDR;
912         else if (sk)
913                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
914
915         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
916 }
917
918 EXPORT_SYMBOL(ip6_route_output);
919
920 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
921 {
922         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
923         struct dst_entry *new = NULL;
924
925         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
926         if (rt) {
927                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
928
929                 new = &rt->dst;
930
931                 new->__use = 1;
932                 new->input = dst_discard;
933                 new->output = dst_discard;
934
935                 if (dst_metrics_read_only(&ort->dst))
936                         new->_metrics = ort->dst._metrics;
937                 else
938                         dst_copy_metrics(new, &ort->dst);
939                 rt->rt6i_idev = ort->rt6i_idev;
940                 if (rt->rt6i_idev)
941                         in6_dev_hold(rt->rt6i_idev);
942                 rt->rt6i_expires = 0;
943
944                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
945                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
946                 rt->rt6i_metric = 0;
947
948                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
949 #ifdef CONFIG_IPV6_SUBTREES
950                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
951 #endif
952
953                 dst_free(new);
954         }
955
956         dst_release(dst_orig);
957         return new ? new : ERR_PTR(-ENOMEM);
958 }
959
960 /*
961  *      Destination cache support functions
962  */
963
964 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
965 {
966         struct rt6_info *rt;
967
968         rt = (struct rt6_info *) dst;
969
970         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
971                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
972                         if (!rt->rt6i_peer)
973                                 rt6_bind_peer(rt, 0);
974                         rt->rt6i_peer_genid = rt6_peer_genid();
975                 }
976                 return dst;
977         }
978         return NULL;
979 }
980
981 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
982 {
983         struct rt6_info *rt = (struct rt6_info *) dst;
984
985         if (rt) {
986                 if (rt->rt6i_flags & RTF_CACHE) {
987                         if (rt6_check_expired(rt)) {
988                                 ip6_del_rt(rt);
989                                 dst = NULL;
990                         }
991                 } else {
992                         dst_release(dst);
993                         dst = NULL;
994                 }
995         }
996         return dst;
997 }
998
999 static void ip6_link_failure(struct sk_buff *skb)
1000 {
1001         struct rt6_info *rt;
1002
1003         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1004
1005         rt = (struct rt6_info *) skb_dst(skb);
1006         if (rt) {
1007                 if (rt->rt6i_flags&RTF_CACHE) {
1008                         dst_set_expires(&rt->dst, 0);
1009                         rt->rt6i_flags |= RTF_EXPIRES;
1010                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1011                         rt->rt6i_node->fn_sernum = -1;
1012         }
1013 }
1014
1015 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1016 {
1017         struct rt6_info *rt6 = (struct rt6_info*)dst;
1018
1019         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1020                 rt6->rt6i_flags |= RTF_MODIFIED;
1021                 if (mtu < IPV6_MIN_MTU)
1022                         mtu = IPV6_MIN_MTU;
1023
1024                 dst_metric_set(dst, RTAX_MTU, mtu);
1025         }
1026 }
1027
1028 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1029 {
1030         struct net_device *dev = dst->dev;
1031         unsigned int mtu = dst_mtu(dst);
1032         struct net *net = dev_net(dev);
1033
1034         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1035
1036         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1037                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1038
1039         /*
1040          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1041          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1042          * IPV6_MAXPLEN is also valid and means: "any MSS,
1043          * rely only on pmtu discovery"
1044          */
1045         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1046                 mtu = IPV6_MAXPLEN;
1047         return mtu;
1048 }
1049
1050 static unsigned int ip6_mtu(const struct dst_entry *dst)
1051 {
1052         struct inet6_dev *idev;
1053         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1054
1055         if (mtu)
1056                 goto out;
1057
1058         mtu = IPV6_MIN_MTU;
1059
1060         rcu_read_lock();
1061         idev = __in6_dev_get(dst->dev);
1062         if (idev)
1063                 mtu = idev->cnf.mtu6;
1064         rcu_read_unlock();
1065
1066 out:
1067         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1068 }
1069
1070 static struct dst_entry *icmp6_dst_gc_list;
1071 static DEFINE_SPINLOCK(icmp6_dst_lock);
1072
1073 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1074                                   struct neighbour *neigh,
1075                                   const struct in6_addr *addr)
1076 {
1077         struct rt6_info *rt;
1078         struct inet6_dev *idev = in6_dev_get(dev);
1079         struct net *net = dev_net(dev);
1080
1081         if (unlikely(idev == NULL))
1082                 return NULL;
1083
1084         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1085         if (unlikely(rt == NULL)) {
1086                 in6_dev_put(idev);
1087                 goto out;
1088         }
1089
1090         if (neigh)
1091                 neigh_hold(neigh);
1092         else {
1093                 neigh = ndisc_get_neigh(dev, addr);
1094                 if (IS_ERR(neigh))
1095                         neigh = NULL;
1096         }
1097
1098         rt->dst.flags |= DST_HOST;
1099         rt->dst.output  = ip6_output;
1100         dst_set_neighbour(&rt->dst, neigh);
1101         atomic_set(&rt->dst.__refcnt, 1);
1102         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1103         rt->rt6i_dst.plen = 128;
1104         rt->rt6i_idev     = idev;
1105         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1106
1107         spin_lock_bh(&icmp6_dst_lock);
1108         rt->dst.next = icmp6_dst_gc_list;
1109         icmp6_dst_gc_list = &rt->dst;
1110         spin_unlock_bh(&icmp6_dst_lock);
1111
1112         fib6_force_start_gc(net);
1113
1114 out:
1115         return &rt->dst;
1116 }
1117
1118 int icmp6_dst_gc(void)
1119 {
1120         struct dst_entry *dst, **pprev;
1121         int more = 0;
1122
1123         spin_lock_bh(&icmp6_dst_lock);
1124         pprev = &icmp6_dst_gc_list;
1125
1126         while ((dst = *pprev) != NULL) {
1127                 if (!atomic_read(&dst->__refcnt)) {
1128                         *pprev = dst->next;
1129                         dst_free(dst);
1130                 } else {
1131                         pprev = &dst->next;
1132                         ++more;
1133                 }
1134         }
1135
1136         spin_unlock_bh(&icmp6_dst_lock);
1137
1138         return more;
1139 }
1140
1141 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1142                             void *arg)
1143 {
1144         struct dst_entry *dst, **pprev;
1145
1146         spin_lock_bh(&icmp6_dst_lock);
1147         pprev = &icmp6_dst_gc_list;
1148         while ((dst = *pprev) != NULL) {
1149                 struct rt6_info *rt = (struct rt6_info *) dst;
1150                 if (func(rt, arg)) {
1151                         *pprev = dst->next;
1152                         dst_free(dst);
1153                 } else {
1154                         pprev = &dst->next;
1155                 }
1156         }
1157         spin_unlock_bh(&icmp6_dst_lock);
1158 }
1159
1160 static int ip6_dst_gc(struct dst_ops *ops)
1161 {
1162         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1163         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1164         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1165         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1166         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1167         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1168         int entries;
1169
1170         entries = dst_entries_get_fast(ops);
1171         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1172             entries <= rt_max_size)
1173                 goto out;
1174
1175         net->ipv6.ip6_rt_gc_expire++;
1176         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, entries > rt_max_size);
1177         entries = dst_entries_get_slow(ops);
1178         if (entries < ops->gc_thresh)
1179                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1180 out:
1181         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1182         return entries > rt_max_size;
1183 }
1184
1185 /* Clean host part of a prefix. Not necessary in radix tree,
1186    but results in cleaner routing tables.
1187
1188    Remove it only when all the things will work!
1189  */
1190
1191 int ip6_dst_hoplimit(struct dst_entry *dst)
1192 {
1193         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1194         if (hoplimit == 0) {
1195                 struct net_device *dev = dst->dev;
1196                 struct inet6_dev *idev;
1197
1198                 rcu_read_lock();
1199                 idev = __in6_dev_get(dev);
1200                 if (idev)
1201                         hoplimit = idev->cnf.hop_limit;
1202                 else
1203                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1204                 rcu_read_unlock();
1205         }
1206         return hoplimit;
1207 }
1208 EXPORT_SYMBOL(ip6_dst_hoplimit);
1209
1210 /*
1211  *
1212  */
1213
1214 int ip6_route_add(struct fib6_config *cfg)
1215 {
1216         int err;
1217         struct net *net = cfg->fc_nlinfo.nl_net;
1218         struct rt6_info *rt = NULL;
1219         struct net_device *dev = NULL;
1220         struct inet6_dev *idev = NULL;
1221         struct fib6_table *table;
1222         int addr_type;
1223
1224         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1225                 return -EINVAL;
1226 #ifndef CONFIG_IPV6_SUBTREES
1227         if (cfg->fc_src_len)
1228                 return -EINVAL;
1229 #endif
1230         if (cfg->fc_ifindex) {
1231                 err = -ENODEV;
1232                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1233                 if (!dev)
1234                         goto out;
1235                 idev = in6_dev_get(dev);
1236                 if (!idev)
1237                         goto out;
1238         }
1239
1240         if (cfg->fc_metric == 0)
1241                 cfg->fc_metric = IP6_RT_PRIO_USER;
1242
1243         table = fib6_new_table(net, cfg->fc_table);
1244         if (table == NULL) {
1245                 err = -ENOBUFS;
1246                 goto out;
1247         }
1248
1249         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1250
1251         if (rt == NULL) {
1252                 err = -ENOMEM;
1253                 goto out;
1254         }
1255
1256         rt->dst.obsolete = -1;
1257         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1258                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1259                                 0;
1260
1261         if (cfg->fc_protocol == RTPROT_UNSPEC)
1262                 cfg->fc_protocol = RTPROT_BOOT;
1263         rt->rt6i_protocol = cfg->fc_protocol;
1264
1265         addr_type = ipv6_addr_type(&cfg->fc_dst);
1266
1267         if (addr_type & IPV6_ADDR_MULTICAST)
1268                 rt->dst.input = ip6_mc_input;
1269         else if (cfg->fc_flags & RTF_LOCAL)
1270                 rt->dst.input = ip6_input;
1271         else
1272                 rt->dst.input = ip6_forward;
1273
1274         rt->dst.output = ip6_output;
1275
1276         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1277         rt->rt6i_dst.plen = cfg->fc_dst_len;
1278         if (rt->rt6i_dst.plen == 128)
1279                rt->dst.flags |= DST_HOST;
1280
1281         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1282                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1283                 if (!metrics) {
1284                         err = -ENOMEM;
1285                         goto out;
1286                 }
1287                 dst_init_metrics(&rt->dst, metrics, 0);
1288         }
1289 #ifdef CONFIG_IPV6_SUBTREES
1290         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1291         rt->rt6i_src.plen = cfg->fc_src_len;
1292 #endif
1293
1294         rt->rt6i_metric = cfg->fc_metric;
1295
1296         /* We cannot add true routes via loopback here,
1297            they would result in kernel looping; promote them to reject routes
1298          */
1299         if ((cfg->fc_flags & RTF_REJECT) ||
1300             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1301                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1302                 /* hold loopback dev/idev if we haven't done so. */
1303                 if (dev != net->loopback_dev) {
1304                         if (dev) {
1305                                 dev_put(dev);
1306                                 in6_dev_put(idev);
1307                         }
1308                         dev = net->loopback_dev;
1309                         dev_hold(dev);
1310                         idev = in6_dev_get(dev);
1311                         if (!idev) {
1312                                 err = -ENODEV;
1313                                 goto out;
1314                         }
1315                 }
1316                 rt->dst.output = ip6_pkt_discard_out;
1317                 rt->dst.input = ip6_pkt_discard;
1318                 rt->dst.error = -ENETUNREACH;
1319                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1320                 goto install_route;
1321         }
1322
1323         if (cfg->fc_flags & RTF_GATEWAY) {
1324                 const struct in6_addr *gw_addr;
1325                 int gwa_type;
1326
1327                 gw_addr = &cfg->fc_gateway;
1328                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1329                 gwa_type = ipv6_addr_type(gw_addr);
1330
1331                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1332                         struct rt6_info *grt;
1333
1334                         /* IPv6 strictly inhibits using not link-local
1335                            addresses as nexthop address.
1336                            Otherwise, router will not able to send redirects.
1337                            It is very good, but in some (rare!) circumstances
1338                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1339                            some exceptions. --ANK
1340                          */
1341                         err = -EINVAL;
1342                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1343                                 goto out;
1344
1345                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1346
1347                         err = -EHOSTUNREACH;
1348                         if (grt == NULL)
1349                                 goto out;
1350                         if (dev) {
1351                                 if (dev != grt->rt6i_dev) {
1352                                         dst_release(&grt->dst);
1353                                         goto out;
1354                                 }
1355                         } else {
1356                                 dev = grt->rt6i_dev;
1357                                 idev = grt->rt6i_idev;
1358                                 dev_hold(dev);
1359                                 in6_dev_hold(grt->rt6i_idev);
1360                         }
1361                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1362                                 err = 0;
1363                         dst_release(&grt->dst);
1364
1365                         if (err)
1366                                 goto out;
1367                 }
1368                 err = -EINVAL;
1369                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1370                         goto out;
1371         }
1372
1373         err = -ENODEV;
1374         if (dev == NULL)
1375                 goto out;
1376
1377         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1378                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1379                         err = -EINVAL;
1380                         goto out;
1381                 }
1382                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1383                 rt->rt6i_prefsrc.plen = 128;
1384         } else
1385                 rt->rt6i_prefsrc.plen = 0;
1386
1387         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1388                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1389                 if (IS_ERR(n)) {
1390                         err = PTR_ERR(n);
1391                         goto out;
1392                 }
1393                 dst_set_neighbour(&rt->dst, n);
1394         }
1395
1396         rt->rt6i_flags = cfg->fc_flags;
1397
1398 install_route:
1399         if (cfg->fc_mx) {
1400                 struct nlattr *nla;
1401                 int remaining;
1402
1403                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1404                         int type = nla_type(nla);
1405
1406                         if (type) {
1407                                 if (type > RTAX_MAX) {
1408                                         err = -EINVAL;
1409                                         goto out;
1410                                 }
1411
1412                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1413                         }
1414                 }
1415         }
1416
1417         rt->dst.dev = dev;
1418         rt->rt6i_idev = idev;
1419         rt->rt6i_table = table;
1420
1421         cfg->fc_nlinfo.nl_net = dev_net(dev);
1422
1423         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1424
1425 out:
1426         if (dev)
1427                 dev_put(dev);
1428         if (idev)
1429                 in6_dev_put(idev);
1430         if (rt)
1431                 dst_free(&rt->dst);
1432         return err;
1433 }
1434
1435 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1436 {
1437         int err;
1438         struct fib6_table *table;
1439         struct net *net = dev_net(rt->rt6i_dev);
1440
1441         if (rt == net->ipv6.ip6_null_entry) {
1442                 err = -ENOENT;
1443                 goto out;
1444         }
1445
1446         table = rt->rt6i_table;
1447         write_lock_bh(&table->tb6_lock);
1448         err = fib6_del(rt, info);
1449         write_unlock_bh(&table->tb6_lock);
1450
1451 out:
1452         dst_release(&rt->dst);
1453         return err;
1454 }
1455
1456 int ip6_del_rt(struct rt6_info *rt)
1457 {
1458         struct nl_info info = {
1459                 .nl_net = dev_net(rt->rt6i_dev),
1460         };
1461         return __ip6_del_rt(rt, &info);
1462 }
1463
1464 static int ip6_route_del(struct fib6_config *cfg)
1465 {
1466         struct fib6_table *table;
1467         struct fib6_node *fn;
1468         struct rt6_info *rt;
1469         int err = -ESRCH;
1470
1471         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1472         if (table == NULL)
1473                 return err;
1474
1475         read_lock_bh(&table->tb6_lock);
1476
1477         fn = fib6_locate(&table->tb6_root,
1478                          &cfg->fc_dst, cfg->fc_dst_len,
1479                          &cfg->fc_src, cfg->fc_src_len);
1480
1481         if (fn) {
1482                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1483                         if (cfg->fc_ifindex &&
1484                             (rt->rt6i_dev == NULL ||
1485                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1486                                 continue;
1487                         if (cfg->fc_flags & RTF_GATEWAY &&
1488                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1489                                 continue;
1490                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1491                                 continue;
1492                         dst_hold(&rt->dst);
1493                         read_unlock_bh(&table->tb6_lock);
1494
1495                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1496                 }
1497         }
1498         read_unlock_bh(&table->tb6_lock);
1499
1500         return err;
1501 }
1502
1503 /*
1504  *      Handle redirects
1505  */
1506 struct ip6rd_flowi {
1507         struct flowi6 fl6;
1508         struct in6_addr gateway;
1509 };
1510
1511 static struct rt6_info *__ip6_route_redirect(struct net *net,
1512                                              struct fib6_table *table,
1513                                              struct flowi6 *fl6,
1514                                              int flags)
1515 {
1516         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1517         struct rt6_info *rt;
1518         struct fib6_node *fn;
1519
1520         /*
1521          * Get the "current" route for this destination and
1522          * check if the redirect has come from approriate router.
1523          *
1524          * RFC 2461 specifies that redirects should only be
1525          * accepted if they come from the nexthop to the target.
1526          * Due to the way the routes are chosen, this notion
1527          * is a bit fuzzy and one might need to check all possible
1528          * routes.
1529          */
1530
1531         read_lock_bh(&table->tb6_lock);
1532         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1533 restart:
1534         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1535                 /*
1536                  * Current route is on-link; redirect is always invalid.
1537                  *
1538                  * Seems, previous statement is not true. It could
1539                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1540                  * But then router serving it might decide, that we should
1541                  * know truth 8)8) --ANK (980726).
1542                  */
1543                 if (rt6_check_expired(rt))
1544                         continue;
1545                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1546                         continue;
1547                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1548                         continue;
1549                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1550                         continue;
1551                 break;
1552         }
1553
1554         if (!rt)
1555                 rt = net->ipv6.ip6_null_entry;
1556         BACKTRACK(net, &fl6->saddr);
1557 out:
1558         dst_hold(&rt->dst);
1559
1560         read_unlock_bh(&table->tb6_lock);
1561
1562         return rt;
1563 };
1564
1565 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1566                                            const struct in6_addr *src,
1567                                            const struct in6_addr *gateway,
1568                                            struct net_device *dev)
1569 {
1570         int flags = RT6_LOOKUP_F_HAS_SADDR;
1571         struct net *net = dev_net(dev);
1572         struct ip6rd_flowi rdfl = {
1573                 .fl6 = {
1574                         .flowi6_oif = dev->ifindex,
1575                         .daddr = *dest,
1576                         .saddr = *src,
1577                 },
1578         };
1579
1580         ipv6_addr_copy(&rdfl.gateway, gateway);
1581
1582         if (rt6_need_strict(dest))
1583                 flags |= RT6_LOOKUP_F_IFACE;
1584
1585         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1586                                                    flags, __ip6_route_redirect);
1587 }
1588
1589 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1590                   const struct in6_addr *saddr,
1591                   struct neighbour *neigh, u8 *lladdr, int on_link)
1592 {
1593         struct rt6_info *rt, *nrt = NULL;
1594         struct netevent_redirect netevent;
1595         struct net *net = dev_net(neigh->dev);
1596
1597         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1598
1599         if (rt == net->ipv6.ip6_null_entry) {
1600                 if (net_ratelimit())
1601                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1602                                "for redirect target\n");
1603                 goto out;
1604         }
1605
1606         /*
1607          *      We have finally decided to accept it.
1608          */
1609
1610         neigh_update(neigh, lladdr, NUD_STALE,
1611                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1612                      NEIGH_UPDATE_F_OVERRIDE|
1613                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1614                                      NEIGH_UPDATE_F_ISROUTER))
1615                      );
1616
1617         /*
1618          * Redirect received -> path was valid.
1619          * Look, redirects are sent only in response to data packets,
1620          * so that this nexthop apparently is reachable. --ANK
1621          */
1622         dst_confirm(&rt->dst);
1623
1624         /* Duplicate redirect: silently ignore. */
1625         if (neigh == dst_get_neighbour_raw(&rt->dst))
1626                 goto out;
1627
1628         nrt = ip6_rt_copy(rt, dest);
1629         if (nrt == NULL)
1630                 goto out;
1631
1632         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1633         if (on_link)
1634                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1635
1636         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1637         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1638
1639         if (ip6_ins_rt(nrt))
1640                 goto out;
1641
1642         netevent.old = &rt->dst;
1643         netevent.new = &nrt->dst;
1644         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1645
1646         if (rt->rt6i_flags&RTF_CACHE) {
1647                 ip6_del_rt(rt);
1648                 return;
1649         }
1650
1651 out:
1652         dst_release(&rt->dst);
1653 }
1654
1655 /*
1656  *      Handle ICMP "packet too big" messages
1657  *      i.e. Path MTU discovery
1658  */
1659
1660 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1661                              struct net *net, u32 pmtu, int ifindex)
1662 {
1663         struct rt6_info *rt, *nrt;
1664         int allfrag = 0;
1665 again:
1666         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1667         if (rt == NULL)
1668                 return;
1669
1670         if (rt6_check_expired(rt)) {
1671                 ip6_del_rt(rt);
1672                 goto again;
1673         }
1674
1675         if (pmtu >= dst_mtu(&rt->dst))
1676                 goto out;
1677
1678         if (pmtu < IPV6_MIN_MTU) {
1679                 /*
1680                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1681                  * MTU (1280) and a fragment header should always be included
1682                  * after a node receiving Too Big message reporting PMTU is
1683                  * less than the IPv6 Minimum Link MTU.
1684                  */
1685                 pmtu = IPV6_MIN_MTU;
1686                 allfrag = 1;
1687         }
1688
1689         /* New mtu received -> path was valid.
1690            They are sent only in response to data packets,
1691            so that this nexthop apparently is reachable. --ANK
1692          */
1693         dst_confirm(&rt->dst);
1694
1695         /* Host route. If it is static, it would be better
1696            not to override it, but add new one, so that
1697            when cache entry will expire old pmtu
1698            would return automatically.
1699          */
1700         if (rt->rt6i_flags & RTF_CACHE) {
1701                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1702                 if (allfrag) {
1703                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1704                         features |= RTAX_FEATURE_ALLFRAG;
1705                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1706                 }
1707                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1708                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1709                 goto out;
1710         }
1711
1712         /* Network route.
1713            Two cases are possible:
1714            1. It is connected route. Action: COW
1715            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1716          */
1717         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1718                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1719         else
1720                 nrt = rt6_alloc_clone(rt, daddr);
1721
1722         if (nrt) {
1723                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1724                 if (allfrag) {
1725                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1726                         features |= RTAX_FEATURE_ALLFRAG;
1727                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1728                 }
1729
1730                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1731                  * happened within 5 mins, the recommended timer is 10 mins.
1732                  * Here this route expiration time is set to ip6_rt_mtu_expires
1733                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1734                  * and detecting PMTU increase will be automatically happened.
1735                  */
1736                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1737                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1738
1739                 ip6_ins_rt(nrt);
1740         }
1741 out:
1742         dst_release(&rt->dst);
1743 }
1744
1745 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1746                         struct net_device *dev, u32 pmtu)
1747 {
1748         struct net *net = dev_net(dev);
1749
1750         /*
1751          * RFC 1981 states that a node "MUST reduce the size of the packets it
1752          * is sending along the path" that caused the Packet Too Big message.
1753          * Since it's not possible in the general case to determine which
1754          * interface was used to send the original packet, we update the MTU
1755          * on the interface that will be used to send future packets. We also
1756          * update the MTU on the interface that received the Packet Too Big in
1757          * case the original packet was forced out that interface with
1758          * SO_BINDTODEVICE or similar. This is the next best thing to the
1759          * correct behaviour, which would be to update the MTU on all
1760          * interfaces.
1761          */
1762         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1763         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1764 }
1765
1766 /*
1767  *      Misc support functions
1768  */
1769
1770 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1771                                     const struct in6_addr *dest)
1772 {
1773         struct net *net = dev_net(ort->rt6i_dev);
1774         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1775                                             ort->dst.dev, 0);
1776
1777         if (rt) {
1778                 rt->dst.input = ort->dst.input;
1779                 rt->dst.output = ort->dst.output;
1780                 rt->dst.flags |= DST_HOST;
1781
1782                 ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1783                 rt->rt6i_dst.plen = 128;
1784                 dst_copy_metrics(&rt->dst, &ort->dst);
1785                 rt->dst.error = ort->dst.error;
1786                 rt->rt6i_idev = ort->rt6i_idev;
1787                 if (rt->rt6i_idev)
1788                         in6_dev_hold(rt->rt6i_idev);
1789                 rt->dst.lastuse = jiffies;
1790                 rt->rt6i_expires = 0;
1791
1792                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1793                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1794                 rt->rt6i_metric = 0;
1795
1796 #ifdef CONFIG_IPV6_SUBTREES
1797                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1798 #endif
1799                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1800                 rt->rt6i_table = ort->rt6i_table;
1801         }
1802         return rt;
1803 }
1804
1805 #ifdef CONFIG_IPV6_ROUTE_INFO
1806 static struct rt6_info *rt6_get_route_info(struct net *net,
1807                                            const struct in6_addr *prefix, int prefixlen,
1808                                            const struct in6_addr *gwaddr, int ifindex)
1809 {
1810         struct fib6_node *fn;
1811         struct rt6_info *rt = NULL;
1812         struct fib6_table *table;
1813
1814         table = fib6_get_table(net, RT6_TABLE_INFO);
1815         if (table == NULL)
1816                 return NULL;
1817
1818         write_lock_bh(&table->tb6_lock);
1819         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1820         if (!fn)
1821                 goto out;
1822
1823         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1824                 if (rt->rt6i_dev->ifindex != ifindex)
1825                         continue;
1826                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1827                         continue;
1828                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1829                         continue;
1830                 dst_hold(&rt->dst);
1831                 break;
1832         }
1833 out:
1834         write_unlock_bh(&table->tb6_lock);
1835         return rt;
1836 }
1837
1838 static struct rt6_info *rt6_add_route_info(struct net *net,
1839                                            const struct in6_addr *prefix, int prefixlen,
1840                                            const struct in6_addr *gwaddr, int ifindex,
1841                                            unsigned pref)
1842 {
1843         struct fib6_config cfg = {
1844                 .fc_table       = RT6_TABLE_INFO,
1845                 .fc_metric      = IP6_RT_PRIO_USER,
1846                 .fc_ifindex     = ifindex,
1847                 .fc_dst_len     = prefixlen,
1848                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1849                                   RTF_UP | RTF_PREF(pref),
1850                 .fc_nlinfo.pid = 0,
1851                 .fc_nlinfo.nlh = NULL,
1852                 .fc_nlinfo.nl_net = net,
1853         };
1854
1855         ipv6_addr_copy(&cfg.fc_dst, prefix);
1856         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1857
1858         /* We should treat it as a default route if prefix length is 0. */
1859         if (!prefixlen)
1860                 cfg.fc_flags |= RTF_DEFAULT;
1861
1862         ip6_route_add(&cfg);
1863
1864         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1865 }
1866 #endif
1867
1868 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1869 {
1870         struct rt6_info *rt;
1871         struct fib6_table *table;
1872
1873         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1874         if (table == NULL)
1875                 return NULL;
1876
1877         write_lock_bh(&table->tb6_lock);
1878         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1879                 if (dev == rt->rt6i_dev &&
1880                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1881                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1882                         break;
1883         }
1884         if (rt)
1885                 dst_hold(&rt->dst);
1886         write_unlock_bh(&table->tb6_lock);
1887         return rt;
1888 }
1889
1890 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1891                                      struct net_device *dev,
1892                                      unsigned int pref)
1893 {
1894         struct fib6_config cfg = {
1895                 .fc_table       = RT6_TABLE_DFLT,
1896                 .fc_metric      = IP6_RT_PRIO_USER,
1897                 .fc_ifindex     = dev->ifindex,
1898                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1899                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1900                 .fc_nlinfo.pid = 0,
1901                 .fc_nlinfo.nlh = NULL,
1902                 .fc_nlinfo.nl_net = dev_net(dev),
1903         };
1904
1905         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1906
1907         ip6_route_add(&cfg);
1908
1909         return rt6_get_dflt_router(gwaddr, dev);
1910 }
1911
1912 void rt6_purge_dflt_routers(struct net *net)
1913 {
1914         struct rt6_info *rt;
1915         struct fib6_table *table;
1916
1917         /* NOTE: Keep consistent with rt6_get_dflt_router */
1918         table = fib6_get_table(net, RT6_TABLE_DFLT);
1919         if (table == NULL)
1920                 return;
1921
1922 restart:
1923         read_lock_bh(&table->tb6_lock);
1924         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1925                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
1926                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
1927                         dst_hold(&rt->dst);
1928                         read_unlock_bh(&table->tb6_lock);
1929                         ip6_del_rt(rt);
1930                         goto restart;
1931                 }
1932         }
1933         read_unlock_bh(&table->tb6_lock);
1934 }
1935
1936 static void rtmsg_to_fib6_config(struct net *net,
1937                                  struct in6_rtmsg *rtmsg,
1938                                  struct fib6_config *cfg)
1939 {
1940         memset(cfg, 0, sizeof(*cfg));
1941
1942         cfg->fc_table = RT6_TABLE_MAIN;
1943         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1944         cfg->fc_metric = rtmsg->rtmsg_metric;
1945         cfg->fc_expires = rtmsg->rtmsg_info;
1946         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1947         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1948         cfg->fc_flags = rtmsg->rtmsg_flags;
1949
1950         cfg->fc_nlinfo.nl_net = net;
1951
1952         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1953         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1954         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1955 }
1956
1957 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1958 {
1959         struct fib6_config cfg;
1960         struct in6_rtmsg rtmsg;
1961         int err;
1962
1963         switch(cmd) {
1964         case SIOCADDRT:         /* Add a route */
1965         case SIOCDELRT:         /* Delete a route */
1966                 if (!capable(CAP_NET_ADMIN))
1967                         return -EPERM;
1968                 err = copy_from_user(&rtmsg, arg,
1969                                      sizeof(struct in6_rtmsg));
1970                 if (err)
1971                         return -EFAULT;
1972
1973                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1974
1975                 rtnl_lock();
1976                 switch (cmd) {
1977                 case SIOCADDRT:
1978                         err = ip6_route_add(&cfg);
1979                         break;
1980                 case SIOCDELRT:
1981                         err = ip6_route_del(&cfg);
1982                         break;
1983                 default:
1984                         err = -EINVAL;
1985                 }
1986                 rtnl_unlock();
1987
1988                 return err;
1989         }
1990
1991         return -EINVAL;
1992 }
1993
1994 /*
1995  *      Drop the packet on the floor
1996  */
1997
1998 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1999 {
2000         int type;
2001         struct dst_entry *dst = skb_dst(skb);
2002         switch (ipstats_mib_noroutes) {
2003         case IPSTATS_MIB_INNOROUTES:
2004                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2005                 if (type == IPV6_ADDR_ANY) {
2006                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2007                                       IPSTATS_MIB_INADDRERRORS);
2008                         break;
2009                 }
2010                 /* FALLTHROUGH */
2011         case IPSTATS_MIB_OUTNOROUTES:
2012                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2013                               ipstats_mib_noroutes);
2014                 break;
2015         }
2016         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2017         kfree_skb(skb);
2018         return 0;
2019 }
2020
2021 static int ip6_pkt_discard(struct sk_buff *skb)
2022 {
2023         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2024 }
2025
2026 static int ip6_pkt_discard_out(struct sk_buff *skb)
2027 {
2028         skb->dev = skb_dst(skb)->dev;
2029         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2030 }
2031
2032 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2033
2034 static int ip6_pkt_prohibit(struct sk_buff *skb)
2035 {
2036         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2037 }
2038
2039 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2040 {
2041         skb->dev = skb_dst(skb)->dev;
2042         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2043 }
2044
2045 #endif
2046
2047 /*
2048  *      Allocate a dst for local (unicast / anycast) address.
2049  */
2050
2051 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2052                                     const struct in6_addr *addr,
2053                                     int anycast)
2054 {
2055         struct net *net = dev_net(idev->dev);
2056         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2057                                             net->loopback_dev, DST_NOCOUNT);
2058         struct neighbour *neigh;
2059
2060         if (rt == NULL)
2061                 return ERR_PTR(-ENOMEM);
2062
2063         in6_dev_hold(idev);
2064
2065         rt->dst.flags |= DST_HOST;
2066         rt->dst.input = ip6_input;
2067         rt->dst.output = ip6_output;
2068         rt->rt6i_idev = idev;
2069         rt->dst.obsolete = -1;
2070
2071         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2072         if (anycast)
2073                 rt->rt6i_flags |= RTF_ANYCAST;
2074         else
2075                 rt->rt6i_flags |= RTF_LOCAL;
2076         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2077         if (IS_ERR(neigh)) {
2078                 dst_free(&rt->dst);
2079
2080                 return ERR_CAST(neigh);
2081         }
2082         dst_set_neighbour(&rt->dst, neigh);
2083
2084         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2085         rt->rt6i_dst.plen = 128;
2086         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2087
2088         atomic_set(&rt->dst.__refcnt, 1);
2089
2090         return rt;
2091 }
2092
2093 int ip6_route_get_saddr(struct net *net,
2094                         struct rt6_info *rt,
2095                         const struct in6_addr *daddr,
2096                         unsigned int prefs,
2097                         struct in6_addr *saddr)
2098 {
2099         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2100         int err = 0;
2101         if (rt->rt6i_prefsrc.plen)
2102                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2103         else
2104                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2105                                          daddr, prefs, saddr);
2106         return err;
2107 }
2108
2109 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
        struct net_device *dev;         /* device to match, NULL matches any */
        struct net *net;                /* namespace whose null entry is skipped */
        struct in6_addr *addr;          /* address being removed */
};
2115
2116 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2117 {
2118         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2119         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2120         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2121
2122         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2123             rt != net->ipv6.ip6_null_entry &&
2124             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2125                 /* remove prefsrc entry */
2126                 rt->rt6i_prefsrc.plen = 0;
2127         }
2128         return 0;
2129 }
2130
2131 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2132 {
2133         struct net *net = dev_net(ifp->idev->dev);
2134         struct arg_dev_net_ip adni = {
2135                 .dev = ifp->idev->dev,
2136                 .net = net,
2137                 .addr = &ifp->addr,
2138         };
2139         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2140 }
2141
/* Walker argument for fib6_ifdown(). */
struct arg_dev_net {
        struct net_device *dev;         /* device going down, NULL matches any */
        struct net *net;                /* namespace whose null entry is kept */
};
2146
2147 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2148 {
2149         const struct arg_dev_net *adn = arg;
2150         const struct net_device *dev = adn->dev;
2151
2152         if ((rt->rt6i_dev == dev || dev == NULL) &&
2153             rt != adn->net->ipv6.ip6_null_entry) {
2154                 RT6_TRACE("deleted by ifdown %p\n", rt);
2155                 return -1;
2156         }
2157         return 0;
2158 }
2159
2160 void rt6_ifdown(struct net *net, struct net_device *dev)
2161 {
2162         struct arg_dev_net adn = {
2163                 .dev = dev,
2164                 .net = net,
2165         };
2166
2167         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2168         icmp6_clean_all(fib6_ifdown, &adn);
2169 }
2170
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned mtu;                   /* new MTU of that device */
};
2176
2177 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2178 {
2179         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2180         struct inet6_dev *idev;
2181
2182         /* In IPv6 pmtu discovery is not optional,
2183            so that RTAX_MTU lock cannot disable it.
2184            We still use this lock to block changes
2185            caused by addrconf/ndisc.
2186         */
2187
2188         idev = __in6_dev_get(arg->dev);
2189         if (idev == NULL)
2190                 return 0;
2191
2192         /* For administrative MTU increase, there is no way to discover
2193            IPv6 PMTU increase, so PMTU increase should be updated here.
2194            Since RFC 1981 doesn't include administrative MTU increase
2195            update PMTU increase is a MUST. (i.e. jumbo frame)
2196          */
2197         /*
2198            If new MTU is less than route PMTU, this new MTU will be the
2199            lowest MTU in the path, update the route PMTU to reflect PMTU
2200            decreases; if new MTU is greater than route PMTU, and the
2201            old MTU is the lowest MTU in the path, update the route PMTU
2202            to reflect the increase. In this case if the other nodes' MTU
2203            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2204            PMTU discouvery.
2205          */
2206         if (rt->rt6i_dev == arg->dev &&
2207             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2208             (dst_mtu(&rt->dst) >= arg->mtu ||
2209              (dst_mtu(&rt->dst) < arg->mtu &&
2210               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2211                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2212         }
2213         return 0;
2214 }
2215
2216 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2217 {
2218         struct rt6_mtu_change_arg arg = {
2219                 .dev = dev,
2220                 .mtu = mtu,
2221         };
2222
2223         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2224 }
2225
2226 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2227         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2228         [RTA_OIF]               = { .type = NLA_U32 },
2229         [RTA_IIF]               = { .type = NLA_U32 },
2230         [RTA_PRIORITY]          = { .type = NLA_U32 },
2231         [RTA_METRICS]           = { .type = NLA_NESTED },
2232 };
2233
2234 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2235                               struct fib6_config *cfg)
2236 {
2237         struct rtmsg *rtm;
2238         struct nlattr *tb[RTA_MAX+1];
2239         int err;
2240
2241         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2242         if (err < 0)
2243                 goto errout;
2244
2245         err = -EINVAL;
2246         rtm = nlmsg_data(nlh);
2247         memset(cfg, 0, sizeof(*cfg));
2248
2249         cfg->fc_table = rtm->rtm_table;
2250         cfg->fc_dst_len = rtm->rtm_dst_len;
2251         cfg->fc_src_len = rtm->rtm_src_len;
2252         cfg->fc_flags = RTF_UP;
2253         cfg->fc_protocol = rtm->rtm_protocol;
2254
2255         if (rtm->rtm_type == RTN_UNREACHABLE)
2256                 cfg->fc_flags |= RTF_REJECT;
2257
2258         if (rtm->rtm_type == RTN_LOCAL)
2259                 cfg->fc_flags |= RTF_LOCAL;
2260
2261         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2262         cfg->fc_nlinfo.nlh = nlh;
2263         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2264
2265         if (tb[RTA_GATEWAY]) {
2266                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2267                 cfg->fc_flags |= RTF_GATEWAY;
2268         }
2269
2270         if (tb[RTA_DST]) {
2271                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2272
2273                 if (nla_len(tb[RTA_DST]) < plen)
2274                         goto errout;
2275
2276                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2277         }
2278
2279         if (tb[RTA_SRC]) {
2280                 int plen = (rtm->rtm_src_len + 7) >> 3;
2281
2282                 if (nla_len(tb[RTA_SRC]) < plen)
2283                         goto errout;
2284
2285                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2286         }
2287
2288         if (tb[RTA_PREFSRC])
2289                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2290
2291         if (tb[RTA_OIF])
2292                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2293
2294         if (tb[RTA_PRIORITY])
2295                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2296
2297         if (tb[RTA_METRICS]) {
2298                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2299                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2300         }
2301
2302         if (tb[RTA_TABLE])
2303                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2304
2305         err = 0;
2306 errout:
2307         return err;
2308 }
2309
2310 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2311 {
2312         struct fib6_config cfg;
2313         int err;
2314
2315         err = rtm_to_fib6_config(skb, nlh, &cfg);
2316         if (err < 0)
2317                 return err;
2318
2319         return ip6_route_del(&cfg);
2320 }
2321
2322 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2323 {
2324         struct fib6_config cfg;
2325         int err;
2326
2327         err = rtm_to_fib6_config(skb, nlh, &cfg);
2328         if (err < 0)
2329                 return err;
2330
2331         return ip6_route_add(&cfg);
2332 }
2333
2334 static inline size_t rt6_nlmsg_size(void)
2335 {
2336         return NLMSG_ALIGN(sizeof(struct rtmsg))
2337                + nla_total_size(16) /* RTA_SRC */
2338                + nla_total_size(16) /* RTA_DST */
2339                + nla_total_size(16) /* RTA_GATEWAY */
2340                + nla_total_size(16) /* RTA_PREFSRC */
2341                + nla_total_size(4) /* RTA_TABLE */
2342                + nla_total_size(4) /* RTA_IIF */
2343                + nla_total_size(4) /* RTA_OIF */
2344                + nla_total_size(4) /* RTA_PRIORITY */
2345                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2346                + nla_total_size(sizeof(struct rta_cacheinfo));
2347 }
2348
2349 static int rt6_fill_node(struct net *net,
2350                          struct sk_buff *skb, struct rt6_info *rt,
2351                          struct in6_addr *dst, struct in6_addr *src,
2352                          int iif, int type, u32 pid, u32 seq,
2353                          int prefix, int nowait, unsigned int flags)
2354 {
2355         struct rtmsg *rtm;
2356         struct nlmsghdr *nlh;
2357         long expires;
2358         u32 table;
2359         struct neighbour *n;
2360
2361         if (prefix) {   /* user wants prefix routes only */
2362                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2363                         /* success since this is not a prefix route */
2364                         return 1;
2365                 }
2366         }
2367
2368         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2369         if (nlh == NULL)
2370                 return -EMSGSIZE;
2371
2372         rtm = nlmsg_data(nlh);
2373         rtm->rtm_family = AF_INET6;
2374         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2375         rtm->rtm_src_len = rt->rt6i_src.plen;
2376         rtm->rtm_tos = 0;
2377         if (rt->rt6i_table)
2378                 table = rt->rt6i_table->tb6_id;
2379         else
2380                 table = RT6_TABLE_UNSPEC;
2381         rtm->rtm_table = table;
2382         NLA_PUT_U32(skb, RTA_TABLE, table);
2383         if (rt->rt6i_flags&RTF_REJECT)
2384                 rtm->rtm_type = RTN_UNREACHABLE;
2385         else if (rt->rt6i_flags&RTF_LOCAL)
2386                 rtm->rtm_type = RTN_LOCAL;
2387         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2388                 rtm->rtm_type = RTN_LOCAL;
2389         else
2390                 rtm->rtm_type = RTN_UNICAST;
2391         rtm->rtm_flags = 0;
2392         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2393         rtm->rtm_protocol = rt->rt6i_protocol;
2394         if (rt->rt6i_flags&RTF_DYNAMIC)
2395                 rtm->rtm_protocol = RTPROT_REDIRECT;
2396         else if (rt->rt6i_flags & RTF_ADDRCONF)
2397                 rtm->rtm_protocol = RTPROT_KERNEL;
2398         else if (rt->rt6i_flags&RTF_DEFAULT)
2399                 rtm->rtm_protocol = RTPROT_RA;
2400
2401         if (rt->rt6i_flags&RTF_CACHE)
2402                 rtm->rtm_flags |= RTM_F_CLONED;
2403
2404         if (dst) {
2405                 NLA_PUT(skb, RTA_DST, 16, dst);
2406                 rtm->rtm_dst_len = 128;
2407         } else if (rtm->rtm_dst_len)
2408                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2409 #ifdef CONFIG_IPV6_SUBTREES
2410         if (src) {
2411                 NLA_PUT(skb, RTA_SRC, 16, src);
2412                 rtm->rtm_src_len = 128;
2413         } else if (rtm->rtm_src_len)
2414                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2415 #endif
2416         if (iif) {
2417 #ifdef CONFIG_IPV6_MROUTE
2418                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2419                         int err = ip6mr_get_route(net, skb, rtm, nowait,
2420                                                   pid);
2421
2422                         if (err <= 0) {
2423                                 if (!nowait) {
2424                                         if (err == 0)
2425                                                 return 0;
2426                                         goto nla_put_failure;
2427                                 } else {
2428                                         if (err == -EMSGSIZE)
2429                                                 goto nla_put_failure;
2430                                 }
2431                         }
2432                 } else
2433 #endif
2434                         NLA_PUT_U32(skb, RTA_IIF, iif);
2435         } else if (dst) {
2436                 struct in6_addr saddr_buf;
2437                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2438                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2439         }
2440
2441         if (rt->rt6i_prefsrc.plen) {
2442                 struct in6_addr saddr_buf;
2443                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2444                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2445         }
2446
2447         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2448                 goto nla_put_failure;
2449
2450         rcu_read_lock();
2451         n = dst_get_neighbour(&rt->dst);
2452         if (n) {
2453                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2454                         rcu_read_unlock();
2455                         goto nla_put_failure;
2456                 }
2457         }
2458         rcu_read_unlock();
2459
2460         if (rt->dst.dev)
2461                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2462
2463         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2464
2465         if (!(rt->rt6i_flags & RTF_EXPIRES))
2466                 expires = 0;
2467         else if (rt->rt6i_expires - jiffies < INT_MAX)
2468                 expires = rt->rt6i_expires - jiffies;
2469         else
2470                 expires = INT_MAX;
2471
2472         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2473                                expires, rt->dst.error) < 0)
2474                 goto nla_put_failure;
2475
2476         return nlmsg_end(skb, nlh);
2477
2478 nla_put_failure:
2479         nlmsg_cancel(skb, nlh);
2480         return -EMSGSIZE;
2481 }
2482
2483 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2484 {
2485         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2486         int prefix;
2487
2488         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2489                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2490                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2491         } else
2492                 prefix = 0;
2493
2494         return rt6_fill_node(arg->net,
2495                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2496                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2497                      prefix, 0, NLM_F_MULTI);
2498 }
2499
2500 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2501 {
2502         struct net *net = sock_net(in_skb->sk);
2503         struct nlattr *tb[RTA_MAX+1];
2504         struct rt6_info *rt;
2505         struct sk_buff *skb;
2506         struct rtmsg *rtm;
2507         struct flowi6 fl6;
2508         int err, iif = 0;
2509
2510         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2511         if (err < 0)
2512                 goto errout;
2513
2514         err = -EINVAL;
2515         memset(&fl6, 0, sizeof(fl6));
2516
2517         if (tb[RTA_SRC]) {
2518                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2519                         goto errout;
2520
2521                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2522         }
2523
2524         if (tb[RTA_DST]) {
2525                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2526                         goto errout;
2527
2528                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2529         }
2530
2531         if (tb[RTA_IIF])
2532                 iif = nla_get_u32(tb[RTA_IIF]);
2533
2534         if (tb[RTA_OIF])
2535                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2536
2537         if (iif) {
2538                 struct net_device *dev;
2539                 dev = __dev_get_by_index(net, iif);
2540                 if (!dev) {
2541                         err = -ENODEV;
2542                         goto errout;
2543                 }
2544         }
2545
2546         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2547         if (skb == NULL) {
2548                 err = -ENOBUFS;
2549                 goto errout;
2550         }
2551
2552         /* Reserve room for dummy headers, this skb can pass
2553            through good chunk of routing engine.
2554          */
2555         skb_reset_mac_header(skb);
2556         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2557
2558         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2559         skb_dst_set(skb, &rt->dst);
2560
2561         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2562                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2563                             nlh->nlmsg_seq, 0, 0, 0);
2564         if (err < 0) {
2565                 kfree_skb(skb);
2566                 goto errout;
2567         }
2568
2569         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2570 errout:
2571         return err;
2572 }
2573
2574 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2575 {
2576         struct sk_buff *skb;
2577         struct net *net = info->nl_net;
2578         u32 seq;
2579         int err;
2580
2581         err = -ENOBUFS;
2582         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2583
2584         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2585         if (skb == NULL)
2586                 goto errout;
2587
2588         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2589                                 event, info->pid, seq, 0, 0, 0);
2590         if (err < 0) {
2591                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2592                 WARN_ON(err == -EMSGSIZE);
2593                 kfree_skb(skb);
2594                 goto errout;
2595         }
2596         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2597                     info->nlh, gfp_any());
2598         return;
2599 errout:
2600         if (err < 0)
2601                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2602 }
2603
2604 static int ip6_route_dev_notify(struct notifier_block *this,
2605                                 unsigned long event, void *data)
2606 {
2607         struct net_device *dev = (struct net_device *)data;
2608         struct net *net = dev_net(dev);
2609
2610         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2611                 net->ipv6.ip6_null_entry->dst.dev = dev;
2612                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2613 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2614                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2615                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2616                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2617                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2618 #endif
2619         }
2620
2621         return NOTIFY_OK;
2622 }
2623
2624 /*
2625  *      /proc
2626  */
2627
2628 #ifdef CONFIG_PROC_FS
2629
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): no user of this structure is visible in this part of the
 * file (the handlers below use seq_file) -- confirm whether it is still
 * needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2638
2639 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2640 {
2641         struct seq_file *m = p_arg;
2642         struct neighbour *n;
2643
2644         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2645
2646 #ifdef CONFIG_IPV6_SUBTREES
2647         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2648 #else
2649         seq_puts(m, "00000000000000000000000000000000 00 ");
2650 #endif
2651         rcu_read_lock();
2652         n = dst_get_neighbour(&rt->dst);
2653         if (n) {
2654                 seq_printf(m, "%pi6", n->primary_key);
2655         } else {
2656                 seq_puts(m, "00000000000000000000000000000000");
2657         }
2658         rcu_read_unlock();
2659         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2660                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2661                    rt->dst.__use, rt->rt6i_flags,
2662                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2663         return 0;
2664 }
2665
2666 static int ipv6_route_show(struct seq_file *m, void *v)
2667 {
2668         struct net *net = (struct net *)m->private;
2669         fib6_clean_all(net, rt6_info_route, 0, m);
2670         return 0;
2671 }
2672
2673 static int ipv6_route_open(struct inode *inode, struct file *file)
2674 {
2675         return single_open_net(inode, file, ipv6_route_show);
2676 }
2677
2678 static const struct file_operations ipv6_route_proc_fops = {
2679         .owner          = THIS_MODULE,
2680         .open           = ipv6_route_open,
2681         .read           = seq_read,
2682         .llseek         = seq_lseek,
2683         .release        = single_release_net,
2684 };
2685
2686 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2687 {
2688         struct net *net = (struct net *)seq->private;
2689         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2690                    net->ipv6.rt6_stats->fib_nodes,
2691                    net->ipv6.rt6_stats->fib_route_nodes,
2692                    net->ipv6.rt6_stats->fib_rt_alloc,
2693                    net->ipv6.rt6_stats->fib_rt_entries,
2694                    net->ipv6.rt6_stats->fib_rt_cache,
2695                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2696                    net->ipv6.rt6_stats->fib_discarded_routes);
2697
2698         return 0;
2699 }
2700
2701 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2702 {
2703         return single_open_net(inode, file, rt6_stats_seq_show);
2704 }
2705
2706 static const struct file_operations rt6_stats_seq_fops = {
2707         .owner   = THIS_MODULE,
2708         .open    = rt6_stats_seq_open,
2709         .read    = seq_read,
2710         .llseek  = seq_lseek,
2711         .release = single_release_net,
2712 };
2713 #endif  /* CONFIG_PROC_FS */
2714
2715 #ifdef CONFIG_SYSCTL
2716
2717 static
2718 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2719                               void __user *buffer, size_t *lenp, loff_t *ppos)
2720 {
2721         struct net *net;
2722         int delay;
2723         if (!write)
2724                 return -EINVAL;
2725
2726         net = (struct net *)ctl->extra1;
2727         delay = net->ipv6.sysctl.flush_delay;
2728         proc_dointvec(ctl, write, buffer, lenp, ppos);
2729         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
2730         return 0;
2731 }
2732
/*
 * Template for the IPv6 route sysctl table.
 *
 * ipv6_route_sysctl_init() duplicates this array and re-points each
 * entry's .data at the per-namespace variable BY ARRAY INDEX, so the order
 * of the entries here must stay in sync with the indices used there.
 */
ctl_table ipv6_route_table_template[] = {
	{
		/* [0] write-only trigger; the handler rejects reads */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		/* [1] */
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* [2] */
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* [3] same variable as gc_min_interval_ms ([9]), different handler */
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		/* [4] */
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		/* [5] */
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		/* [6] */
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* [7] */
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		/* [8] */
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* [9] same variable as gc_min_interval ([3]), different handler */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }	/* empty terminating entry */
};
2806
2807 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2808 {
2809         struct ctl_table *table;
2810
2811         table = kmemdup(ipv6_route_table_template,
2812                         sizeof(ipv6_route_table_template),
2813                         GFP_KERNEL);
2814
2815         if (table) {
2816                 table[0].data = &net->ipv6.sysctl.flush_delay;
2817                 table[0].extra1 = net;
2818                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2819                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2820                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2821                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2822                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2823                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2824                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2825                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2826                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2827         }
2828
2829         return table;
2830 }
2831 #endif
2832
2833 static int __net_init ip6_route_net_init(struct net *net)
2834 {
2835         int ret = -ENOMEM;
2836
2837         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2838                sizeof(net->ipv6.ip6_dst_ops));
2839
2840         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2841                 goto out_ip6_dst_ops;
2842
2843         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2844                                            sizeof(*net->ipv6.ip6_null_entry),
2845                                            GFP_KERNEL);
2846         if (!net->ipv6.ip6_null_entry)
2847                 goto out_ip6_dst_entries;
2848         net->ipv6.ip6_null_entry->dst.path =
2849                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2850         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2851         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2852                          ip6_template_metrics, true);
2853
2854 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2855         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2856                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2857                                                GFP_KERNEL);
2858         if (!net->ipv6.ip6_prohibit_entry)
2859                 goto out_ip6_null_entry;
2860         net->ipv6.ip6_prohibit_entry->dst.path =
2861                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2862         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2863         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2864                          ip6_template_metrics, true);
2865
2866         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2867                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2868                                                GFP_KERNEL);
2869         if (!net->ipv6.ip6_blk_hole_entry)
2870                 goto out_ip6_prohibit_entry;
2871         net->ipv6.ip6_blk_hole_entry->dst.path =
2872                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2873         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2874         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2875                          ip6_template_metrics, true);
2876 #endif
2877
2878         net->ipv6.sysctl.flush_delay = 0;
2879         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2880         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2881         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2882         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2883         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2884         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2885         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2886
2887         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2888
2889         ret = 0;
2890 out:
2891         return ret;
2892
2893 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2894 out_ip6_prohibit_entry:
2895         kfree(net->ipv6.ip6_prohibit_entry);
2896 out_ip6_null_entry:
2897         kfree(net->ipv6.ip6_null_entry);
2898 #endif
2899 out_ip6_dst_entries:
2900         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2901 out_ip6_dst_ops:
2902         goto out;
2903 }
2904
2905 static void __net_exit ip6_route_net_exit(struct net *net)
2906 {
2907         kfree(net->ipv6.ip6_null_entry);
2908 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2909         kfree(net->ipv6.ip6_prohibit_entry);
2910         kfree(net->ipv6.ip6_blk_hole_entry);
2911 #endif
2912         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2913 }
2914
/*
 * Late per-namespace initialisation: with CONFIG_PROC_FS, creates the
 * "ipv6_route" and "rt6_stats" proc entries for @net.  The results of the
 * create calls are not checked; always returns 0.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
	return 0;
}
2923
/*
 * Late per-namespace teardown: with CONFIG_PROC_FS, removes the
 * "ipv6_route" and "rt6_stats" proc entries of @net.
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
#endif
}
2931
2932 static struct pernet_operations ip6_route_net_ops = {
2933         .init = ip6_route_net_init,
2934         .exit = ip6_route_net_exit,
2935 };
2936
2937 static struct pernet_operations ip6_route_net_late_ops = {
2938         .init = ip6_route_net_init_late,
2939         .exit = ip6_route_net_exit_late,
2940 };
2941
2942 static struct notifier_block ip6_route_dev_notifier = {
2943         .notifier_call = ip6_route_dev_notify,
2944         .priority = 0,
2945 };
2946
2947 int __init ip6_route_init(void)
2948 {
2949         int ret;
2950
2951         ret = -ENOMEM;
2952         ip6_dst_ops_template.kmem_cachep =
2953                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2954                                   SLAB_HWCACHE_ALIGN, NULL);
2955         if (!ip6_dst_ops_template.kmem_cachep)
2956                 goto out;
2957
2958         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2959         if (ret)
2960                 goto out_kmem_cache;
2961
2962         ret = register_pernet_subsys(&ip6_route_net_ops);
2963         if (ret)
2964                 goto out_dst_entries;
2965
2966         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2967
2968         /* Registering of the loopback is done before this portion of code,
2969          * the loopback reference in rt6_info will not be taken, do it
2970          * manually for init_net */
2971         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2972         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2973   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2974         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2975         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2976         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2977         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2978   #endif
2979         ret = fib6_init();
2980         if (ret)
2981                 goto out_register_subsys;
2982
2983         ret = xfrm6_init();
2984         if (ret)
2985                 goto out_fib6_init;
2986
2987         ret = fib6_rules_init();
2988         if (ret)
2989                 goto xfrm6_init;
2990
2991         ret = register_pernet_subsys(&ip6_route_net_late_ops);
2992         if (ret)
2993                 goto fib6_rules_init;
2994
2995         ret = -ENOBUFS;
2996         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2997             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2998             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2999                 goto out_register_late_subsys;
3000
3001         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3002         if (ret)
3003                 goto out_register_late_subsys;
3004
3005 out:
3006         return ret;
3007
3008 out_register_late_subsys:
3009         unregister_pernet_subsys(&ip6_route_net_late_ops);
3010 fib6_rules_init:
3011         fib6_rules_cleanup();
3012 xfrm6_init:
3013         xfrm6_fini();
3014 out_fib6_init:
3015         fib6_gc_cleanup();
3016 out_register_subsys:
3017         unregister_pernet_subsys(&ip6_route_net_ops);
3018 out_dst_entries:
3019         dst_entries_destroy(&ip6_dst_blackhole_ops);
3020 out_kmem_cache:
3021         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3022         goto out;
3023 }
3024
/*
 * Tear down the IPv6 routing subsystem.
 *
 * Undoes the registrations and allocations made by ip6_route_init(), in
 * the same order as that function's error unwinding: notifier first, slab
 * cache last.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	/* Destroyed last: the dst ops above allocate from this cache. */
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}