Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wirel...
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  The tracing helpers below only emit
 * output when RT6_DEBUG is 3 or higher; at the default level they
 * expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
/* Tracing disabled: the arguments are discarded by the preprocessor. */
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* RDBG() takes a complete, parenthesised printk() argument list. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
74
75 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            const struct in6_addr *prefix, int prefixlen,
93                                            const struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            const struct in6_addr *prefix, int prefixlen,
97                                            const struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102         struct rt6_info *rt = (struct rt6_info *) dst;
103         struct inet_peer *peer;
104         u32 *p = NULL;
105
106         if (!rt->rt6i_peer)
107                 rt6_bind_peer(rt, 1);
108
109         peer = rt->rt6i_peer;
110         if (peer) {
111                 u32 *old_p = __DST_METRICS_PTR(old);
112                 unsigned long prev, new;
113
114                 p = peer->metrics;
115                 if (inet_metrics_new(peer))
116                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117
118                 new = (unsigned long) p;
119                 prev = cmpxchg(&dst->_metrics, old, new);
120
121                 if (prev != old) {
122                         p = __DST_METRICS_PTR(prev);
123                         if (prev & DST_METRICS_READ_ONLY)
124                                 p = NULL;
125                 }
126         }
127         return p;
128 }
129
130 static struct dst_ops ip6_dst_ops_template = {
131         .family                 =       AF_INET6,
132         .protocol               =       cpu_to_be16(ETH_P_IPV6),
133         .gc                     =       ip6_dst_gc,
134         .gc_thresh              =       1024,
135         .check                  =       ip6_dst_check,
136         .default_advmss         =       ip6_default_advmss,
137         .default_mtu            =       ip6_default_mtu,
138         .cow_metrics            =       ipv6_cow_metrics,
139         .destroy                =       ip6_dst_destroy,
140         .ifdown                 =       ip6_dst_ifdown,
141         .negative_advice        =       ip6_negative_advice,
142         .link_failure           =       ip6_link_failure,
143         .update_pmtu            =       ip6_rt_update_pmtu,
144         .local_out              =       __ip6_local_out,
145 };
146
/*
 * default_mtu callback of ip6_dst_blackhole_ops.
 * Ignores @dst and always reports 0.
 */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
	return 0U;
}
151
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
153 {
154 }
155
156 static struct dst_ops ip6_dst_blackhole_ops = {
157         .family                 =       AF_INET6,
158         .protocol               =       cpu_to_be16(ETH_P_IPV6),
159         .destroy                =       ip6_dst_destroy,
160         .check                  =       ip6_dst_check,
161         .default_mtu            =       ip6_blackhole_default_mtu,
162         .default_advmss         =       ip6_default_advmss,
163         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
164 };
165
166 static const u32 ip6_template_metrics[RTAX_MAX] = {
167         [RTAX_HOPLIMIT - 1] = 255,
168 };
169
170 static struct rt6_info ip6_null_entry_template = {
171         .dst = {
172                 .__refcnt       = ATOMIC_INIT(1),
173                 .__use          = 1,
174                 .obsolete       = -1,
175                 .error          = -ENETUNREACH,
176                 .input          = ip6_pkt_discard,
177                 .output         = ip6_pkt_discard_out,
178         },
179         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
180         .rt6i_protocol  = RTPROT_KERNEL,
181         .rt6i_metric    = ~(u32) 0,
182         .rt6i_ref       = ATOMIC_INIT(1),
183 };
184
185 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
186
187 static int ip6_pkt_prohibit(struct sk_buff *skb);
188 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
189
190 static struct rt6_info ip6_prohibit_entry_template = {
191         .dst = {
192                 .__refcnt       = ATOMIC_INIT(1),
193                 .__use          = 1,
194                 .obsolete       = -1,
195                 .error          = -EACCES,
196                 .input          = ip6_pkt_prohibit,
197                 .output         = ip6_pkt_prohibit_out,
198         },
199         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
200         .rt6i_protocol  = RTPROT_KERNEL,
201         .rt6i_metric    = ~(u32) 0,
202         .rt6i_ref       = ATOMIC_INIT(1),
203 };
204
205 static struct rt6_info ip6_blk_hole_entry_template = {
206         .dst = {
207                 .__refcnt       = ATOMIC_INIT(1),
208                 .__use          = 1,
209                 .obsolete       = -1,
210                 .error          = -EINVAL,
211                 .input          = dst_discard,
212                 .output         = dst_discard,
213         },
214         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
215         .rt6i_protocol  = RTPROT_KERNEL,
216         .rt6i_metric    = ~(u32) 0,
217         .rt6i_ref       = ATOMIC_INIT(1),
218 };
219
220 #endif
221
/*
 * Allocate a dst entry from @ops via dst_alloc() and hand it back typed
 * as an rt6_info.  Whatever dst_alloc() returns is passed through
 * unchanged, so callers must check the result.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *)dst_alloc(ops, 0);
	return rt;
}
227
228 static void ip6_dst_destroy(struct dst_entry *dst)
229 {
230         struct rt6_info *rt = (struct rt6_info *)dst;
231         struct inet6_dev *idev = rt->rt6i_idev;
232         struct inet_peer *peer = rt->rt6i_peer;
233
234         if (idev != NULL) {
235                 rt->rt6i_idev = NULL;
236                 in6_dev_put(idev);
237         }
238         if (peer) {
239                 rt->rt6i_peer = NULL;
240                 inet_putpeer(peer);
241         }
242 }
243
244 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
245
246 static u32 rt6_peer_genid(void)
247 {
248         return atomic_read(&__rt6_peer_genid);
249 }
250
251 void rt6_bind_peer(struct rt6_info *rt, int create)
252 {
253         struct inet_peer *peer;
254
255         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
256         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
257                 inet_putpeer(peer);
258         else
259                 rt->rt6i_peer_genid = rt6_peer_genid();
260 }
261
262 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
263                            int how)
264 {
265         struct rt6_info *rt = (struct rt6_info *)dst;
266         struct inet6_dev *idev = rt->rt6i_idev;
267         struct net_device *loopback_dev =
268                 dev_net(dev)->loopback_dev;
269
270         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
271                 struct inet6_dev *loopback_idev =
272                         in6_dev_get(loopback_dev);
273                 if (loopback_idev != NULL) {
274                         rt->rt6i_idev = loopback_idev;
275                         in6_dev_put(idev);
276                 }
277         }
278 }
279
280 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
281 {
282         return (rt->rt6i_flags & RTF_EXPIRES) &&
283                 time_after(jiffies, rt->rt6i_expires);
284 }
285
286 static inline int rt6_need_strict(const struct in6_addr *daddr)
287 {
288         return ipv6_addr_type(daddr) &
289                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
290 }
291
292 /*
293  *      Route lookup. Any table->tb6_lock is implied.
294  */
295
296 static inline struct rt6_info *rt6_device_match(struct net *net,
297                                                     struct rt6_info *rt,
298                                                     const struct in6_addr *saddr,
299                                                     int oif,
300                                                     int flags)
301 {
302         struct rt6_info *local = NULL;
303         struct rt6_info *sprt;
304
305         if (!oif && ipv6_addr_any(saddr))
306                 goto out;
307
308         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
309                 struct net_device *dev = sprt->rt6i_dev;
310
311                 if (oif) {
312                         if (dev->ifindex == oif)
313                                 return sprt;
314                         if (dev->flags & IFF_LOOPBACK) {
315                                 if (sprt->rt6i_idev == NULL ||
316                                     sprt->rt6i_idev->dev->ifindex != oif) {
317                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
318                                                 continue;
319                                         if (local && (!oif ||
320                                                       local->rt6i_idev->dev->ifindex == oif))
321                                                 continue;
322                                 }
323                                 local = sprt;
324                         }
325                 } else {
326                         if (ipv6_chk_addr(net, saddr, dev,
327                                           flags & RT6_LOOKUP_F_IFACE))
328                                 return sprt;
329                 }
330         }
331
332         if (oif) {
333                 if (local)
334                         return local;
335
336                 if (flags & RT6_LOOKUP_F_IFACE)
337                         return net->ipv6.ip6_null_entry;
338         }
339 out:
340         return rt;
341 }
342
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * rt6_probe - send a neighbour solicitation towards a route's next hop.
 * @rt: route to probe; may be NULL
 *
 * Does nothing when there is no next-hop neighbour or when its state is
 * already NUD_VALID.  Otherwise, under the neighbour lock, the state is
 * re-checked and the probe is rate limited: it is only sent when more
 * than the interface's rtr_probe_interval has passed since
 * neigh->updated, which is then reset to the current jiffies.
 *
 * (Original note: Router Reachability Probe MUST be rate-limited to no
 * more than one per minute.)
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr solicited_mc;
	struct in6_addr *target;
	int due;

	if (!rt)
		return;

	neigh = rt->rt6i_nexthop;
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies,
			 neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* Solicit the neighbour on its solicited-node multicast address. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &solicited_mc);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &solicited_mc, NULL);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
377
378 /*
379  * Default Router Selection (RFC 2461 6.3.6)
380  */
381 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
382 {
383         struct net_device *dev = rt->rt6i_dev;
384         if (!oif || dev->ifindex == oif)
385                 return 2;
386         if ((dev->flags & IFF_LOOPBACK) &&
387             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
388                 return 1;
389         return 0;
390 }
391
392 static inline int rt6_check_neigh(struct rt6_info *rt)
393 {
394         struct neighbour *neigh = rt->rt6i_nexthop;
395         int m;
396         if (rt->rt6i_flags & RTF_NONEXTHOP ||
397             !(rt->rt6i_flags & RTF_GATEWAY))
398                 m = 1;
399         else if (neigh) {
400                 read_lock_bh(&neigh->lock);
401                 if (neigh->nud_state & NUD_VALID)
402                         m = 2;
403 #ifdef CONFIG_IPV6_ROUTER_PREF
404                 else if (neigh->nud_state & NUD_FAILED)
405                         m = 0;
406 #endif
407                 else
408                         m = 1;
409                 read_unlock_bh(&neigh->lock);
410         } else
411                 m = 0;
412         return m;
413 }
414
415 static int rt6_score_route(struct rt6_info *rt, int oif,
416                            int strict)
417 {
418         int m, n;
419
420         m = rt6_check_dev(rt, oif);
421         if (!m && (strict & RT6_LOOKUP_F_IFACE))
422                 return -1;
423 #ifdef CONFIG_IPV6_ROUTER_PREF
424         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
425 #endif
426         n = rt6_check_neigh(rt);
427         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
428                 return -1;
429         return m;
430 }
431
432 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
433                                    int *mpri, struct rt6_info *match)
434 {
435         int m;
436
437         if (rt6_check_expired(rt))
438                 goto out;
439
440         m = rt6_score_route(rt, oif, strict);
441         if (m < 0)
442                 goto out;
443
444         if (m > *mpri) {
445                 if (strict & RT6_LOOKUP_F_REACHABLE)
446                         rt6_probe(match);
447                 *mpri = m;
448                 match = rt;
449         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
450                 rt6_probe(rt);
451         }
452
453 out:
454         return match;
455 }
456
457 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
458                                      struct rt6_info *rr_head,
459                                      u32 metric, int oif, int strict)
460 {
461         struct rt6_info *rt, *match;
462         int mpri = -1;
463
464         match = NULL;
465         for (rt = rr_head; rt && rt->rt6i_metric == metric;
466              rt = rt->dst.rt6_next)
467                 match = find_match(rt, oif, strict, &mpri, match);
468         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
469              rt = rt->dst.rt6_next)
470                 match = find_match(rt, oif, strict, &mpri, match);
471
472         return match;
473 }
474
475 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
476 {
477         struct rt6_info *match, *rt0;
478         struct net *net;
479
480         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
481                   __func__, fn->leaf, oif);
482
483         rt0 = fn->rr_ptr;
484         if (!rt0)
485                 fn->rr_ptr = rt0 = fn->leaf;
486
487         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
488
489         if (!match &&
490             (strict & RT6_LOOKUP_F_REACHABLE)) {
491                 struct rt6_info *next = rt0->dst.rt6_next;
492
493                 /* no entries matched; do round-robin */
494                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
495                         next = fn->leaf;
496
497                 if (next != rt0)
498                         fn->rr_ptr = next;
499         }
500
501         RT6_TRACE("%s() => %p\n",
502                   __func__, match);
503
504         net = dev_net(rt0->rt6i_dev);
505         return match ? match : net->ipv6.ip6_null_entry;
506 }
507
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - process a route information option.
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address to use for the route
 *
 * Validates the option, then adds, refreshes or (for a zero lifetime)
 * deletes the matching route-info route and updates its expiry.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr masked_prefix;
	struct in6_addr *prefix;
	unsigned long lifetime;
	unsigned int min_length;
	unsigned int pref;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length: a prefix longer than
	 * 64 bits needs length >= 2, any non-empty prefix needs
	 * length >= 1, and length may never exceed 3.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;

	if (rinfo->prefix_len > 64)
		min_length = 2;
	else if (rinfo->prefix_len > 0)
		min_length = 1;
	else
		min_length = 0;

	if (rinfo->length < min_length)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* A full 128-bit prefix is present; use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* Shorter option: build the prefix in a local buffer. */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	if (rt) {
		if (lifetime) {
			/* Existing route: refresh its preference bits. */
			rt->rt6i_flags = RTF_ROUTEINFO |
					 (rt->rt6i_flags & ~RTF_PREF_MASK) |
					 RTF_PREF(pref);
		} else {
			/* Zero lifetime withdraws the route. */
			ip6_del_rt(rt);
			rt = NULL;
		}
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}
	dst_release(&rt->dst);

	return 0;
}
#endif
581
/*
 * BACKTRACK - climb the fib6 tree after a lookup ended at the null entry.
 * @__net: namespace whose ip6_null_entry marks a failed selection
 * @saddr: source address used when descending into a source subtree
 *
 * This macro is not self-contained.  It reads and updates the caller's
 * local variables `rt` and `fn`, and it jumps to the caller's labels:
 *   - `out`     when the top-level root (RTN_TL_ROOT) is reached;
 *   - `restart` when a node carrying route information (RTN_RTINFO)
 *               is found, so that selection can be retried there.
 * If `rt` is not the null entry, the macro does nothing.
 *
 * Both parameters are parenthesised in the expansion so that any
 * expression can safely be passed as an argument.
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
599
600 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
601                                              struct fib6_table *table,
602                                              struct flowi6 *fl6, int flags)
603 {
604         struct fib6_node *fn;
605         struct rt6_info *rt;
606
607         read_lock_bh(&table->tb6_lock);
608         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
609 restart:
610         rt = fn->leaf;
611         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
612         BACKTRACK(net, &fl6->saddr);
613 out:
614         dst_use(&rt->dst, jiffies);
615         read_unlock_bh(&table->tb6_lock);
616         return rt;
617
618 }
619
620 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
621                             const struct in6_addr *saddr, int oif, int strict)
622 {
623         struct flowi6 fl6 = {
624                 .flowi6_oif = oif,
625                 .daddr = *daddr,
626         };
627         struct dst_entry *dst;
628         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
629
630         if (saddr) {
631                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
632                 flags |= RT6_LOOKUP_F_HAS_SADDR;
633         }
634
635         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
636         if (dst->error == 0)
637                 return (struct rt6_info *) dst;
638
639         dst_release(dst);
640
641         return NULL;
642 }
643
644 EXPORT_SYMBOL(rt6_lookup);
645
646 /* ip6_ins_rt is called with FREE table->tb6_lock.
647    It takes new route entry, the addition fails by any reason the
648    route is freed. In any case, if caller does not hold it, it may
649    be destroyed.
650  */
651
652 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
653 {
654         int err;
655         struct fib6_table *table;
656
657         table = rt->rt6i_table;
658         write_lock_bh(&table->tb6_lock);
659         err = fib6_add(&table->tb6_root, rt, info);
660         write_unlock_bh(&table->tb6_lock);
661
662         return err;
663 }
664
665 int ip6_ins_rt(struct rt6_info *rt)
666 {
667         struct nl_info info = {
668                 .nl_net = dev_net(rt->rt6i_dev),
669         };
670         return __ip6_ins_rt(rt, &info);
671 }
672
673 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
674                                       const struct in6_addr *saddr)
675 {
676         struct rt6_info *rt;
677
678         /*
679          *      Clone the route.
680          */
681
682         rt = ip6_rt_copy(ort);
683
684         if (rt) {
685                 struct neighbour *neigh;
686                 int attempts = !in_softirq();
687
688                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
689                         if (rt->rt6i_dst.plen != 128 &&
690                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
691                                 rt->rt6i_flags |= RTF_ANYCAST;
692                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
693                 }
694
695                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
696                 rt->rt6i_dst.plen = 128;
697                 rt->rt6i_flags |= RTF_CACHE;
698                 rt->dst.flags |= DST_HOST;
699
700 #ifdef CONFIG_IPV6_SUBTREES
701                 if (rt->rt6i_src.plen && saddr) {
702                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
703                         rt->rt6i_src.plen = 128;
704                 }
705 #endif
706
707         retry:
708                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
709                 if (IS_ERR(neigh)) {
710                         struct net *net = dev_net(rt->rt6i_dev);
711                         int saved_rt_min_interval =
712                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
713                         int saved_rt_elasticity =
714                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
715
716                         if (attempts-- > 0) {
717                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
718                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
719
720                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
721
722                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
723                                         saved_rt_elasticity;
724                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
725                                         saved_rt_min_interval;
726                                 goto retry;
727                         }
728
729                         if (net_ratelimit())
730                                 printk(KERN_WARNING
731                                        "ipv6: Neighbour table overflow.\n");
732                         dst_free(&rt->dst);
733                         return NULL;
734                 }
735                 rt->rt6i_nexthop = neigh;
736
737         }
738
739         return rt;
740 }
741
742 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
743 {
744         struct rt6_info *rt = ip6_rt_copy(ort);
745         if (rt) {
746                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
747                 rt->rt6i_dst.plen = 128;
748                 rt->rt6i_flags |= RTF_CACHE;
749                 rt->dst.flags |= DST_HOST;
750                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
751         }
752         return rt;
753 }
754
755 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
756                                       struct flowi6 *fl6, int flags)
757 {
758         struct fib6_node *fn;
759         struct rt6_info *rt, *nrt;
760         int strict = 0;
761         int attempts = 3;
762         int err;
763         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
764
765         strict |= flags & RT6_LOOKUP_F_IFACE;
766
767 relookup:
768         read_lock_bh(&table->tb6_lock);
769
770 restart_2:
771         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
772
773 restart:
774         rt = rt6_select(fn, oif, strict | reachable);
775
776         BACKTRACK(net, &fl6->saddr);
777         if (rt == net->ipv6.ip6_null_entry ||
778             rt->rt6i_flags & RTF_CACHE)
779                 goto out;
780
781         dst_hold(&rt->dst);
782         read_unlock_bh(&table->tb6_lock);
783
784         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
785                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
786         else if (!(rt->dst.flags & DST_HOST))
787                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
788         else
789                 goto out2;
790
791         dst_release(&rt->dst);
792         rt = nrt ? : net->ipv6.ip6_null_entry;
793
794         dst_hold(&rt->dst);
795         if (nrt) {
796                 err = ip6_ins_rt(nrt);
797                 if (!err)
798                         goto out2;
799         }
800
801         if (--attempts <= 0)
802                 goto out2;
803
804         /*
805          * Race condition! In the gap, when table->tb6_lock was
806          * released someone could insert this route.  Relookup.
807          */
808         dst_release(&rt->dst);
809         goto relookup;
810
811 out:
812         if (reachable) {
813                 reachable = 0;
814                 goto restart_2;
815         }
816         dst_hold(&rt->dst);
817         read_unlock_bh(&table->tb6_lock);
818 out2:
819         rt->dst.lastuse = jiffies;
820         rt->dst.__use++;
821
822         return rt;
823 }
824
825 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
826                                             struct flowi6 *fl6, int flags)
827 {
828         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
829 }
830
831 void ip6_route_input(struct sk_buff *skb)
832 {
833         const struct ipv6hdr *iph = ipv6_hdr(skb);
834         struct net *net = dev_net(skb->dev);
835         int flags = RT6_LOOKUP_F_HAS_SADDR;
836         struct flowi6 fl6 = {
837                 .flowi6_iif = skb->dev->ifindex,
838                 .daddr = iph->daddr,
839                 .saddr = iph->saddr,
840                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
841                 .flowi6_mark = skb->mark,
842                 .flowi6_proto = iph->nexthdr,
843         };
844
845         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
846                 flags |= RT6_LOOKUP_F_IFACE;
847
848         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
849 }
850
851 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
852                                              struct flowi6 *fl6, int flags)
853 {
854         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
855 }
856
857 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
858                                     struct flowi6 *fl6)
859 {
860         int flags = 0;
861
862         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
863                 flags |= RT6_LOOKUP_F_IFACE;
864
865         if (!ipv6_addr_any(&fl6->saddr))
866                 flags |= RT6_LOOKUP_F_HAS_SADDR;
867         else if (sk)
868                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
869
870         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
871 }
872
873 EXPORT_SYMBOL(ip6_route_output);
874
875 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
876 {
877         struct rt6_info *rt = dst_alloc(&ip6_dst_blackhole_ops, 1);
878         struct rt6_info *ort = (struct rt6_info *) dst_orig;
879         struct dst_entry *new = NULL;
880
881         if (rt) {
882                 new = &rt->dst;
883
884                 new->__use = 1;
885                 new->input = dst_discard;
886                 new->output = dst_discard;
887
888                 dst_copy_metrics(new, &ort->dst);
889                 new->dev = ort->dst.dev;
890                 if (new->dev)
891                         dev_hold(new->dev);
892                 rt->rt6i_idev = ort->rt6i_idev;
893                 if (rt->rt6i_idev)
894                         in6_dev_hold(rt->rt6i_idev);
895                 rt->rt6i_expires = 0;
896
897                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
898                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
899                 rt->rt6i_metric = 0;
900
901                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
902 #ifdef CONFIG_IPV6_SUBTREES
903                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
904 #endif
905
906                 dst_free(new);
907         }
908
909         dst_release(dst_orig);
910         return new ? new : ERR_PTR(-ENOMEM);
911 }
912
913 /*
914  *      Destination cache support functions
915  */
916
917 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
918 {
919         struct rt6_info *rt;
920
921         rt = (struct rt6_info *) dst;
922
923         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
924                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
925                         if (!rt->rt6i_peer)
926                                 rt6_bind_peer(rt, 0);
927                         rt->rt6i_peer_genid = rt6_peer_genid();
928                 }
929                 return dst;
930         }
931         return NULL;
932 }
933
934 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
935 {
936         struct rt6_info *rt = (struct rt6_info *) dst;
937
938         if (rt) {
939                 if (rt->rt6i_flags & RTF_CACHE) {
940                         if (rt6_check_expired(rt)) {
941                                 ip6_del_rt(rt);
942                                 dst = NULL;
943                         }
944                 } else {
945                         dst_release(dst);
946                         dst = NULL;
947                 }
948         }
949         return dst;
950 }
951
952 static void ip6_link_failure(struct sk_buff *skb)
953 {
954         struct rt6_info *rt;
955
956         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
957
958         rt = (struct rt6_info *) skb_dst(skb);
959         if (rt) {
960                 if (rt->rt6i_flags&RTF_CACHE) {
961                         dst_set_expires(&rt->dst, 0);
962                         rt->rt6i_flags |= RTF_EXPIRES;
963                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
964                         rt->rt6i_node->fn_sernum = -1;
965         }
966 }
967
968 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
969 {
970         struct rt6_info *rt6 = (struct rt6_info*)dst;
971
972         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
973                 rt6->rt6i_flags |= RTF_MODIFIED;
974                 if (mtu < IPV6_MIN_MTU) {
975                         u32 features = dst_metric(dst, RTAX_FEATURES);
976                         mtu = IPV6_MIN_MTU;
977                         features |= RTAX_FEATURE_ALLFRAG;
978                         dst_metric_set(dst, RTAX_FEATURES, features);
979                 }
980                 dst_metric_set(dst, RTAX_MTU, mtu);
981         }
982 }
983
984 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
985 {
986         struct net_device *dev = dst->dev;
987         unsigned int mtu = dst_mtu(dst);
988         struct net *net = dev_net(dev);
989
990         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
991
992         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
993                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
994
995         /*
996          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
997          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
998          * IPV6_MAXPLEN is also valid and means: "any MSS,
999          * rely only on pmtu discovery"
1000          */
1001         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1002                 mtu = IPV6_MAXPLEN;
1003         return mtu;
1004 }
1005
1006 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1007 {
1008         unsigned int mtu = IPV6_MIN_MTU;
1009         struct inet6_dev *idev;
1010
1011         rcu_read_lock();
1012         idev = __in6_dev_get(dst->dev);
1013         if (idev)
1014                 mtu = idev->cnf.mtu6;
1015         rcu_read_unlock();
1016
1017         return mtu;
1018 }
1019
/* Singly linked list (via dst.next) of the dst entries created by
 * icmp6_dst_alloc(); walked by icmp6_dst_gc() and icmp6_clean_all().
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Protects icmp6_dst_gc_list; always taken with spin_lock_bh(). */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1022
1023 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1024                                   struct neighbour *neigh,
1025                                   const struct in6_addr *addr)
1026 {
1027         struct rt6_info *rt;
1028         struct inet6_dev *idev = in6_dev_get(dev);
1029         struct net *net = dev_net(dev);
1030
1031         if (unlikely(idev == NULL))
1032                 return NULL;
1033
1034         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1035         if (unlikely(rt == NULL)) {
1036                 in6_dev_put(idev);
1037                 goto out;
1038         }
1039
1040         dev_hold(dev);
1041         if (neigh)
1042                 neigh_hold(neigh);
1043         else {
1044                 neigh = ndisc_get_neigh(dev, addr);
1045                 if (IS_ERR(neigh))
1046                         neigh = NULL;
1047         }
1048
1049         rt->rt6i_dev      = dev;
1050         rt->rt6i_idev     = idev;
1051         rt->rt6i_nexthop  = neigh;
1052         atomic_set(&rt->dst.__refcnt, 1);
1053         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1054         rt->dst.output  = ip6_output;
1055
1056 #if 0   /* there's no chance to use these for ndisc */
1057         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1058                                 ? DST_HOST
1059                                 : 0;
1060         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1061         rt->rt6i_dst.plen = 128;
1062 #endif
1063
1064         spin_lock_bh(&icmp6_dst_lock);
1065         rt->dst.next = icmp6_dst_gc_list;
1066         icmp6_dst_gc_list = &rt->dst;
1067         spin_unlock_bh(&icmp6_dst_lock);
1068
1069         fib6_force_start_gc(net);
1070
1071 out:
1072         return &rt->dst;
1073 }
1074
1075 int icmp6_dst_gc(void)
1076 {
1077         struct dst_entry *dst, **pprev;
1078         int more = 0;
1079
1080         spin_lock_bh(&icmp6_dst_lock);
1081         pprev = &icmp6_dst_gc_list;
1082
1083         while ((dst = *pprev) != NULL) {
1084                 if (!atomic_read(&dst->__refcnt)) {
1085                         *pprev = dst->next;
1086                         dst_free(dst);
1087                 } else {
1088                         pprev = &dst->next;
1089                         ++more;
1090                 }
1091         }
1092
1093         spin_unlock_bh(&icmp6_dst_lock);
1094
1095         return more;
1096 }
1097
1098 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1099                             void *arg)
1100 {
1101         struct dst_entry *dst, **pprev;
1102
1103         spin_lock_bh(&icmp6_dst_lock);
1104         pprev = &icmp6_dst_gc_list;
1105         while ((dst = *pprev) != NULL) {
1106                 struct rt6_info *rt = (struct rt6_info *) dst;
1107                 if (func(rt, arg)) {
1108                         *pprev = dst->next;
1109                         dst_free(dst);
1110                 } else {
1111                         pprev = &dst->next;
1112                 }
1113         }
1114         spin_unlock_bh(&icmp6_dst_lock);
1115 }
1116
1117 static int ip6_dst_gc(struct dst_ops *ops)
1118 {
1119         unsigned long now = jiffies;
1120         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1121         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1122         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1123         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1124         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1125         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1126         int entries;
1127
1128         entries = dst_entries_get_fast(ops);
1129         if (time_after(rt_last_gc + rt_min_interval, now) &&
1130             entries <= rt_max_size)
1131                 goto out;
1132
1133         net->ipv6.ip6_rt_gc_expire++;
1134         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1135         net->ipv6.ip6_rt_last_gc = now;
1136         entries = dst_entries_get_slow(ops);
1137         if (entries < ops->gc_thresh)
1138                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1139 out:
1140         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1141         return entries > rt_max_size;
1142 }
1143
1144 /* Clean host part of a prefix. Not necessary in radix tree,
1145    but results in cleaner routing tables.
1146
1147    Remove it only when all the things will work!
1148  */
1149
1150 int ip6_dst_hoplimit(struct dst_entry *dst)
1151 {
1152         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1153         if (hoplimit == 0) {
1154                 struct net_device *dev = dst->dev;
1155                 struct inet6_dev *idev;
1156
1157                 rcu_read_lock();
1158                 idev = __in6_dev_get(dev);
1159                 if (idev)
1160                         hoplimit = idev->cnf.hop_limit;
1161                 else
1162                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1163                 rcu_read_unlock();
1164         }
1165         return hoplimit;
1166 }
1167 EXPORT_SYMBOL(ip6_dst_hoplimit);
1168
1169 /*
1170  *
1171  */
1172
/*
 *      Build a route from @cfg and insert it with __ip6_ins_rt().
 *
 *      Side effects on @cfg: a zero fc_metric becomes IP6_RT_PRIO_USER, an
 *      RTPROT_UNSPEC fc_protocol becomes RTPROT_BOOT, and fc_nlinfo.nl_net
 *      is overwritten with the namespace of the chosen device.
 *
 *      Returns the result of __ip6_ins_rt() once the route is fully set up.
 *      On earlier failure returns a negative errno (-EINVAL, -ENODEV,
 *      -ENOBUFS, -ENOMEM, -EHOSTUNREACH or the neighbour lookup error)
 *      after dropping the dev/idev references taken here and freeing the
 *      partially built route.
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        /* Source-specific routes need subtree support. */
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        /* An explicit interface must exist and have an inet6_dev;
         * both references are held from here on.
         */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        table = fib6_new_table(net, cfg->fc_table);
        if (table == NULL) {
                err = -ENOBUFS;
                goto out;
        }

        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);

        if (rt == NULL) {
                err = -ENOMEM;
                goto out;
        }

        rt->dst.obsolete = -1;
        /* fc_expires is given in clock_t units, relative to now. */
        rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
                                jiffies + clock_t_to_jiffies(cfg->fc_expires) :
                                0;

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Pick the input handler from the destination type / flags. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
               rt->dst.flags = DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
                                              && !(cfg->fc_flags&RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        /* Swap any user-supplied device for loopback. */
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                rt->dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type&IPV6_ADDR_UNICAST))
                                goto out;

                        /* Look up the route to the gateway itself. */
                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        err = -EHOSTUNREACH;
                        if (grt == NULL)
                                goto out;
                        if (dev) {
                                /* Gateway must be on the requested device. */
                                if (dev != grt->rt6i_dev) {
                                        dst_release(&grt->dst);
                                        goto out;
                                }
                        } else {
                                /* No device given: inherit the gateway
                                 * route's device and take our own refs.
                                 */
                                dev = grt->rt6i_dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        /* Accept only if the gateway route is not itself
                         * a gateway route; otherwise err stays
                         * -EHOSTUNREACH.
                         */
                        if (!(grt->rt6i_flags&RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (dev == NULL || (dev->flags&IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (dev == NULL)
                goto out;

        /* A preferred source address must pass ipv6_chk_addr() for dev. */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        err = -EINVAL;
                        goto out;
                }
                ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
                if (IS_ERR(rt->rt6i_nexthop)) {
                        err = PTR_ERR(rt->rt6i_nexthop);
                        /* Do not leave an ERR_PTR in the route being freed. */
                        rt->rt6i_nexthop = NULL;
                        goto out;
                }
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* Copy user-supplied metrics; attribute type 0 is skipped and
         * anything above RTAX_MAX is rejected.
         */
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
                        }
                }
        }

        /* The dev/idev references taken above are handed to the route. */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        /* The route never took over dev/idev on this path, so drop them
         * here in addition to freeing the route.
         */
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
        return err;
}
1385
1386 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1387 {
1388         int err;
1389         struct fib6_table *table;
1390         struct net *net = dev_net(rt->rt6i_dev);
1391
1392         if (rt == net->ipv6.ip6_null_entry)
1393                 return -ENOENT;
1394
1395         table = rt->rt6i_table;
1396         write_lock_bh(&table->tb6_lock);
1397
1398         err = fib6_del(rt, info);
1399         dst_release(&rt->dst);
1400
1401         write_unlock_bh(&table->tb6_lock);
1402
1403         return err;
1404 }
1405
1406 int ip6_del_rt(struct rt6_info *rt)
1407 {
1408         struct nl_info info = {
1409                 .nl_net = dev_net(rt->rt6i_dev),
1410         };
1411         return __ip6_del_rt(rt, &info);
1412 }
1413
1414 static int ip6_route_del(struct fib6_config *cfg)
1415 {
1416         struct fib6_table *table;
1417         struct fib6_node *fn;
1418         struct rt6_info *rt;
1419         int err = -ESRCH;
1420
1421         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1422         if (table == NULL)
1423                 return err;
1424
1425         read_lock_bh(&table->tb6_lock);
1426
1427         fn = fib6_locate(&table->tb6_root,
1428                          &cfg->fc_dst, cfg->fc_dst_len,
1429                          &cfg->fc_src, cfg->fc_src_len);
1430
1431         if (fn) {
1432                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1433                         if (cfg->fc_ifindex &&
1434                             (rt->rt6i_dev == NULL ||
1435                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1436                                 continue;
1437                         if (cfg->fc_flags & RTF_GATEWAY &&
1438                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1439                                 continue;
1440                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1441                                 continue;
1442                         dst_hold(&rt->dst);
1443                         read_unlock_bh(&table->tb6_lock);
1444
1445                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1446                 }
1447         }
1448         read_unlock_bh(&table->tb6_lock);
1449
1450         return err;
1451 }
1452
1453 /*
1454  *      Handle redirects
1455  */
/* Flow key extended with the redirecting gateway.  fl6 must stay the first
 * member: __ip6_route_redirect() casts its struct flowi6 pointer back to
 * this type.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;
};
1460
/*
 *      Per-table lookup callback used for redirect validation.  @fl6 is
 *      really the fl6 member of a struct ip6rd_flowi carrying the gateway
 *      the redirect came from.
 *
 *      Returns, with a reference held, a non-expired gateway route on the
 *      flow's outgoing interface whose gateway equals that address, or the
 *      namespace's null entry when none is found.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             int flags)
{
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
        struct rt6_info *rt;
        struct fib6_node *fn;

        /*
         * Get the "current" route for this destination and
         * check if the redirect has come from appropriate router.
         *
         * RFC 2461 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
                /*
                 * Current route is on-link; redirect is always invalid.
                 *
                 * Seems, previous statement is not true. It could
                 * be node, which looks for us as on-link (f.e. proxy ndisc)
                 * But then router serving it might decide, that we should
                 * know truth 8)8) --ANK (980726).
                 */
                if (rt6_check_expired(rt))
                        continue;
                if (!(rt->rt6i_flags & RTF_GATEWAY))
                        continue;
                if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
                        continue;
                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
                        continue;
                break;
        }

        if (!rt)
                rt = net->ipv6.ip6_null_entry;
        /* NOTE(review): BACKTRACK is a macro defined outside this chunk;
         * presumably it is what uses the otherwise unreferenced restart:
         * and out: labels -- confirm against its definition.
         */
        BACKTRACK(net, &fl6->saddr);
out:
        dst_hold(&rt->dst);

        read_unlock_bh(&table->tb6_lock);

        return rt;
};
1514
1515 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1516                                            const struct in6_addr *src,
1517                                            const struct in6_addr *gateway,
1518                                            struct net_device *dev)
1519 {
1520         int flags = RT6_LOOKUP_F_HAS_SADDR;
1521         struct net *net = dev_net(dev);
1522         struct ip6rd_flowi rdfl = {
1523                 .fl6 = {
1524                         .flowi6_oif = dev->ifindex,
1525                         .daddr = *dest,
1526                         .saddr = *src,
1527                 },
1528         };
1529
1530         ipv6_addr_copy(&rdfl.gateway, gateway);
1531
1532         if (rt6_need_strict(dest))
1533                 flags |= RT6_LOOKUP_F_IFACE;
1534
1535         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1536                                                    flags, __ip6_route_redirect);
1537 }
1538
1539 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1540                   const struct in6_addr *saddr,
1541                   struct neighbour *neigh, u8 *lladdr, int on_link)
1542 {
1543         struct rt6_info *rt, *nrt = NULL;
1544         struct netevent_redirect netevent;
1545         struct net *net = dev_net(neigh->dev);
1546
1547         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1548
1549         if (rt == net->ipv6.ip6_null_entry) {
1550                 if (net_ratelimit())
1551                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1552                                "for redirect target\n");
1553                 goto out;
1554         }
1555
1556         /*
1557          *      We have finally decided to accept it.
1558          */
1559
1560         neigh_update(neigh, lladdr, NUD_STALE,
1561                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1562                      NEIGH_UPDATE_F_OVERRIDE|
1563                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1564                                      NEIGH_UPDATE_F_ISROUTER))
1565                      );
1566
1567         /*
1568          * Redirect received -> path was valid.
1569          * Look, redirects are sent only in response to data packets,
1570          * so that this nexthop apparently is reachable. --ANK
1571          */
1572         dst_confirm(&rt->dst);
1573
1574         /* Duplicate redirect: silently ignore. */
1575         if (neigh == rt->dst.neighbour)
1576                 goto out;
1577
1578         nrt = ip6_rt_copy(rt);
1579         if (nrt == NULL)
1580                 goto out;
1581
1582         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1583         if (on_link)
1584                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1585
1586         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1587         nrt->rt6i_dst.plen = 128;
1588         nrt->dst.flags |= DST_HOST;
1589
1590         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1591         nrt->rt6i_nexthop = neigh_clone(neigh);
1592
1593         if (ip6_ins_rt(nrt))
1594                 goto out;
1595
1596         netevent.old = &rt->dst;
1597         netevent.new = &nrt->dst;
1598         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1599
1600         if (rt->rt6i_flags&RTF_CACHE) {
1601                 ip6_del_rt(rt);
1602                 return;
1603         }
1604
1605 out:
1606         dst_release(&rt->dst);
1607 }
1608
1609 /*
1610  *      Handle ICMP "packet too big" messages
1611  *      i.e. Path MTU discovery
1612  */
1613
1614 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1615                              struct net *net, u32 pmtu, int ifindex)
1616 {
1617         struct rt6_info *rt, *nrt;
1618         int allfrag = 0;
1619 again:
1620         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1621         if (rt == NULL)
1622                 return;
1623
1624         if (rt6_check_expired(rt)) {
1625                 ip6_del_rt(rt);
1626                 goto again;
1627         }
1628
1629         if (pmtu >= dst_mtu(&rt->dst))
1630                 goto out;
1631
1632         if (pmtu < IPV6_MIN_MTU) {
1633                 /*
1634                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1635                  * MTU (1280) and a fragment header should always be included
1636                  * after a node receiving Too Big message reporting PMTU is
1637                  * less than the IPv6 Minimum Link MTU.
1638                  */
1639                 pmtu = IPV6_MIN_MTU;
1640                 allfrag = 1;
1641         }
1642
1643         /* New mtu received -> path was valid.
1644            They are sent only in response to data packets,
1645            so that this nexthop apparently is reachable. --ANK
1646          */
1647         dst_confirm(&rt->dst);
1648
1649         /* Host route. If it is static, it would be better
1650            not to override it, but add new one, so that
1651            when cache entry will expire old pmtu
1652            would return automatically.
1653          */
1654         if (rt->rt6i_flags & RTF_CACHE) {
1655                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1656                 if (allfrag) {
1657                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1658                         features |= RTAX_FEATURE_ALLFRAG;
1659                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1660                 }
1661                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1662                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1663                 goto out;
1664         }
1665
1666         /* Network route.
1667            Two cases are possible:
1668            1. It is connected route. Action: COW
1669            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1670          */
1671         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1672                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1673         else
1674                 nrt = rt6_alloc_clone(rt, daddr);
1675
1676         if (nrt) {
1677                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1678                 if (allfrag) {
1679                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1680                         features |= RTAX_FEATURE_ALLFRAG;
1681                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1682                 }
1683
1684                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1685                  * happened within 5 mins, the recommended timer is 10 mins.
1686                  * Here this route expiration time is set to ip6_rt_mtu_expires
1687                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1688                  * and detecting PMTU increase will be automatically happened.
1689                  */
1690                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1691                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1692
1693                 ip6_ins_rt(nrt);
1694         }
1695 out:
1696         dst_release(&rt->dst);
1697 }
1698
/*
 *      Handle a Packet Too Big report of @pmtu for the @daddr/@saddr path,
 *      received on @dev.  Runs rt6_do_pmtu_disc() twice: once with no
 *      interface constraint (ifindex 0) and once bound to @dev.
 */
void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
                        struct net_device *dev, u32 pmtu)
{
        struct net *net = dev_net(dev);

        /*
         * RFC 1981 states that a node "MUST reduce the size of the packets it
         * is sending along the path" that caused the Packet Too Big message.
         * Since it's not possible in the general case to determine which
         * interface was used to send the original packet, we update the MTU
         * on the interface that will be used to send future packets. We also
         * update the MTU on the interface that received the Packet Too Big in
         * case the original packet was forced out that interface with
         * SO_BINDTODEVICE or similar. This is the next best thing to the
         * correct behaviour, which would be to update the MTU on all
         * interfaces.
         */
        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
}
1719
1720 /*
1721  *      Misc support functions
1722  */
1723
1724 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1725 {
1726         struct net *net = dev_net(ort->rt6i_dev);
1727         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1728
1729         if (rt) {
1730                 rt->dst.input = ort->dst.input;
1731                 rt->dst.output = ort->dst.output;
1732
1733                 dst_copy_metrics(&rt->dst, &ort->dst);
1734                 rt->dst.error = ort->dst.error;
1735                 rt->dst.dev = ort->dst.dev;
1736                 if (rt->dst.dev)
1737                         dev_hold(rt->dst.dev);
1738                 rt->rt6i_idev = ort->rt6i_idev;
1739                 if (rt->rt6i_idev)
1740                         in6_dev_hold(rt->rt6i_idev);
1741                 rt->dst.lastuse = jiffies;
1742                 rt->rt6i_expires = 0;
1743
1744                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1745                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1746                 rt->rt6i_metric = 0;
1747
1748                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1749 #ifdef CONFIG_IPV6_SUBTREES
1750                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1751 #endif
1752                 rt->rt6i_table = ort->rt6i_table;
1753         }
1754         return rt;
1755 }
1756
1757 #ifdef CONFIG_IPV6_ROUTE_INFO
1758 static struct rt6_info *rt6_get_route_info(struct net *net,
1759                                            const struct in6_addr *prefix, int prefixlen,
1760                                            const struct in6_addr *gwaddr, int ifindex)
1761 {
1762         struct fib6_node *fn;
1763         struct rt6_info *rt = NULL;
1764         struct fib6_table *table;
1765
1766         table = fib6_get_table(net, RT6_TABLE_INFO);
1767         if (table == NULL)
1768                 return NULL;
1769
1770         write_lock_bh(&table->tb6_lock);
1771         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1772         if (!fn)
1773                 goto out;
1774
1775         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1776                 if (rt->rt6i_dev->ifindex != ifindex)
1777                         continue;
1778                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1779                         continue;
1780                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1781                         continue;
1782                 dst_hold(&rt->dst);
1783                 break;
1784         }
1785 out:
1786         write_unlock_bh(&table->tb6_lock);
1787         return rt;
1788 }
1789
1790 static struct rt6_info *rt6_add_route_info(struct net *net,
1791                                            const struct in6_addr *prefix, int prefixlen,
1792                                            const struct in6_addr *gwaddr, int ifindex,
1793                                            unsigned pref)
1794 {
1795         struct fib6_config cfg = {
1796                 .fc_table       = RT6_TABLE_INFO,
1797                 .fc_metric      = IP6_RT_PRIO_USER,
1798                 .fc_ifindex     = ifindex,
1799                 .fc_dst_len     = prefixlen,
1800                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1801                                   RTF_UP | RTF_PREF(pref),
1802                 .fc_nlinfo.pid = 0,
1803                 .fc_nlinfo.nlh = NULL,
1804                 .fc_nlinfo.nl_net = net,
1805         };
1806
1807         ipv6_addr_copy(&cfg.fc_dst, prefix);
1808         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1809
1810         /* We should treat it as a default route if prefix length is 0. */
1811         if (!prefixlen)
1812                 cfg.fc_flags |= RTF_DEFAULT;
1813
1814         ip6_route_add(&cfg);
1815
1816         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1817 }
1818 #endif
1819
1820 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1821 {
1822         struct rt6_info *rt;
1823         struct fib6_table *table;
1824
1825         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1826         if (table == NULL)
1827                 return NULL;
1828
1829         write_lock_bh(&table->tb6_lock);
1830         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1831                 if (dev == rt->rt6i_dev &&
1832                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1833                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1834                         break;
1835         }
1836         if (rt)
1837                 dst_hold(&rt->dst);
1838         write_unlock_bh(&table->tb6_lock);
1839         return rt;
1840 }
1841
1842 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1843                                      struct net_device *dev,
1844                                      unsigned int pref)
1845 {
1846         struct fib6_config cfg = {
1847                 .fc_table       = RT6_TABLE_DFLT,
1848                 .fc_metric      = IP6_RT_PRIO_USER,
1849                 .fc_ifindex     = dev->ifindex,
1850                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1851                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1852                 .fc_nlinfo.pid = 0,
1853                 .fc_nlinfo.nlh = NULL,
1854                 .fc_nlinfo.nl_net = dev_net(dev),
1855         };
1856
1857         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1858
1859         ip6_route_add(&cfg);
1860
1861         return rt6_get_dflt_router(gwaddr, dev);
1862 }
1863
1864 void rt6_purge_dflt_routers(struct net *net)
1865 {
1866         struct rt6_info *rt;
1867         struct fib6_table *table;
1868
1869         /* NOTE: Keep consistent with rt6_get_dflt_router */
1870         table = fib6_get_table(net, RT6_TABLE_DFLT);
1871         if (table == NULL)
1872                 return;
1873
1874 restart:
1875         read_lock_bh(&table->tb6_lock);
1876         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1877                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1878                         dst_hold(&rt->dst);
1879                         read_unlock_bh(&table->tb6_lock);
1880                         ip6_del_rt(rt);
1881                         goto restart;
1882                 }
1883         }
1884         read_unlock_bh(&table->tb6_lock);
1885 }
1886
1887 static void rtmsg_to_fib6_config(struct net *net,
1888                                  struct in6_rtmsg *rtmsg,
1889                                  struct fib6_config *cfg)
1890 {
1891         memset(cfg, 0, sizeof(*cfg));
1892
1893         cfg->fc_table = RT6_TABLE_MAIN;
1894         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1895         cfg->fc_metric = rtmsg->rtmsg_metric;
1896         cfg->fc_expires = rtmsg->rtmsg_info;
1897         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1898         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1899         cfg->fc_flags = rtmsg->rtmsg_flags;
1900
1901         cfg->fc_nlinfo.nl_net = net;
1902
1903         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1904         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1905         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1906 }
1907
1908 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1909 {
1910         struct fib6_config cfg;
1911         struct in6_rtmsg rtmsg;
1912         int err;
1913
1914         switch(cmd) {
1915         case SIOCADDRT:         /* Add a route */
1916         case SIOCDELRT:         /* Delete a route */
1917                 if (!capable(CAP_NET_ADMIN))
1918                         return -EPERM;
1919                 err = copy_from_user(&rtmsg, arg,
1920                                      sizeof(struct in6_rtmsg));
1921                 if (err)
1922                         return -EFAULT;
1923
1924                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1925
1926                 rtnl_lock();
1927                 switch (cmd) {
1928                 case SIOCADDRT:
1929                         err = ip6_route_add(&cfg);
1930                         break;
1931                 case SIOCDELRT:
1932                         err = ip6_route_del(&cfg);
1933                         break;
1934                 default:
1935                         err = -EINVAL;
1936                 }
1937                 rtnl_unlock();
1938
1939                 return err;
1940         }
1941
1942         return -EINVAL;
1943 }
1944
1945 /*
1946  *      Drop the packet on the floor
1947  */
1948
1949 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1950 {
1951         int type;
1952         struct dst_entry *dst = skb_dst(skb);
1953         switch (ipstats_mib_noroutes) {
1954         case IPSTATS_MIB_INNOROUTES:
1955                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1956                 if (type == IPV6_ADDR_ANY) {
1957                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1958                                       IPSTATS_MIB_INADDRERRORS);
1959                         break;
1960                 }
1961                 /* FALLTHROUGH */
1962         case IPSTATS_MIB_OUTNOROUTES:
1963                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1964                               ipstats_mib_noroutes);
1965                 break;
1966         }
1967         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1968         kfree_skb(skb);
1969         return 0;
1970 }
1971
1972 static int ip6_pkt_discard(struct sk_buff *skb)
1973 {
1974         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1975 }
1976
1977 static int ip6_pkt_discard_out(struct sk_buff *skb)
1978 {
1979         skb->dev = skb_dst(skb)->dev;
1980         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1981 }
1982
1983 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1984
1985 static int ip6_pkt_prohibit(struct sk_buff *skb)
1986 {
1987         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1988 }
1989
1990 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1991 {
1992         skb->dev = skb_dst(skb)->dev;
1993         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1994 }
1995
1996 #endif
1997
1998 /*
1999  *      Allocate a dst for local (unicast / anycast) address.
2000  */
2001
2002 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2003                                     const struct in6_addr *addr,
2004                                     int anycast)
2005 {
2006         struct net *net = dev_net(idev->dev);
2007         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
2008         struct neighbour *neigh;
2009
2010         if (rt == NULL) {
2011                 if (net_ratelimit())
2012                         pr_warning("IPv6:  Maximum number of routes reached,"
2013                                    " consider increasing route/max_size.\n");
2014                 return ERR_PTR(-ENOMEM);
2015         }
2016
2017         dev_hold(net->loopback_dev);
2018         in6_dev_hold(idev);
2019
2020         rt->dst.flags = DST_HOST;
2021         rt->dst.input = ip6_input;
2022         rt->dst.output = ip6_output;
2023         rt->rt6i_dev = net->loopback_dev;
2024         rt->rt6i_idev = idev;
2025         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
2026         rt->dst.obsolete = -1;
2027
2028         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2029         if (anycast)
2030                 rt->rt6i_flags |= RTF_ANYCAST;
2031         else
2032                 rt->rt6i_flags |= RTF_LOCAL;
2033         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2034         if (IS_ERR(neigh)) {
2035                 dst_free(&rt->dst);
2036
2037                 return ERR_CAST(neigh);
2038         }
2039         rt->rt6i_nexthop = neigh;
2040
2041         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2042         rt->rt6i_dst.plen = 128;
2043         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2044
2045         atomic_set(&rt->dst.__refcnt, 1);
2046
2047         return rt;
2048 }
2049
2050 int ip6_route_get_saddr(struct net *net,
2051                         struct rt6_info *rt,
2052                         const struct in6_addr *daddr,
2053                         unsigned int prefs,
2054                         struct in6_addr *saddr)
2055 {
2056         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2057         int err = 0;
2058         if (rt->rt6i_prefsrc.plen)
2059                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2060         else
2061                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2062                                          daddr, prefs, saddr);
2063         return err;
2064 }
2065
2066 /* remove deleted ip from prefsrc entries */
/*
 * Argument bundle that rt6_remove_prefsrc() hands to fib6_remove_prefsrc()
 * through fib6_clean_all().
 */
2067 struct arg_dev_net_ip {
2068         struct net_device *dev;        /* only routes on this device match; NULL matches any */
2069         struct net *net;               /* namespace whose ip6_null_entry is skipped */
2070         struct in6_addr *addr;         /* address compared against each route's prefsrc */
2071 };
2072
2073 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2074 {
2075         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2076         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2077         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2078
2079         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2080             rt != net->ipv6.ip6_null_entry &&
2081             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2082                 /* remove prefsrc entry */
2083                 rt->rt6i_prefsrc.plen = 0;
2084         }
2085         return 0;
2086 }
2087
2088 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2089 {
2090         struct net *net = dev_net(ifp->idev->dev);
2091         struct arg_dev_net_ip adni = {
2092                 .dev = ifp->idev->dev,
2093                 .net = net,
2094                 .addr = &ifp->addr,
2095         };
2096         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2097 }
2098
/* Argument bundle that rt6_ifdown() passes to the fib6_ifdown() callback. */
2099 struct arg_dev_net {
2100         struct net_device *dev;        /* device whose routes are selected; NULL selects all */
2101         struct net *net;               /* namespace whose ip6_null_entry is never selected */
2102 };
2103
2104 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2105 {
2106         const struct arg_dev_net *adn = arg;
2107         const struct net_device *dev = adn->dev;
2108
2109         if ((rt->rt6i_dev == dev || dev == NULL) &&
2110             rt != adn->net->ipv6.ip6_null_entry) {
2111                 RT6_TRACE("deleted by ifdown %p\n", rt);
2112                 return -1;
2113         }
2114         return 0;
2115 }
2116
2117 void rt6_ifdown(struct net *net, struct net_device *dev)
2118 {
2119         struct arg_dev_net adn = {
2120                 .dev = dev,
2121                 .net = net,
2122         };
2123
2124         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2125         icmp6_clean_all(fib6_ifdown, &adn);
2126 }
2127
/* Argument bundle that rt6_mtu_change() passes to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU */
};
2133
2134 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2135 {
2136         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2137         struct inet6_dev *idev;
2138
2139         /* In IPv6 pmtu discovery is not optional,
2140            so that RTAX_MTU lock cannot disable it.
2141            We still use this lock to block changes
2142            caused by addrconf/ndisc.
2143         */
2144
2145         idev = __in6_dev_get(arg->dev);
2146         if (idev == NULL)
2147                 return 0;
2148
2149         /* For administrative MTU increase, there is no way to discover
2150            IPv6 PMTU increase, so PMTU increase should be updated here.
2151            Since RFC 1981 doesn't include administrative MTU increase
2152            update PMTU increase is a MUST. (i.e. jumbo frame)
2153          */
2154         /*
2155            If new MTU is less than route PMTU, this new MTU will be the
2156            lowest MTU in the path, update the route PMTU to reflect PMTU
2157            decreases; if new MTU is greater than route PMTU, and the
2158            old MTU is the lowest MTU in the path, update the route PMTU
2159            to reflect the increase. In this case if the other nodes' MTU
2160            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2161            PMTU discouvery.
2162          */
2163         if (rt->rt6i_dev == arg->dev &&
2164             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2165             (dst_mtu(&rt->dst) >= arg->mtu ||
2166              (dst_mtu(&rt->dst) < arg->mtu &&
2167               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2168                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2169         }
2170         return 0;
2171 }
2172
2173 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2174 {
2175         struct rt6_mtu_change_arg arg = {
2176                 .dev = dev,
2177                 .mtu = mtu,
2178         };
2179
2180         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2181 }
2182
/*
 * Attribute policy passed to nlmsg_parse() by rtm_to_fib6_config() and
 * inet6_rtm_getroute().  Attributes without an entry here (e.g. RTA_DST,
 * RTA_SRC, RTA_PREFSRC, RTA_TABLE) have no policy constraint; the callers
 * check or bound their lengths themselves where they need to.
 */
2183 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2184         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2185         [RTA_OIF]               = { .type = NLA_U32 },
2186         [RTA_IIF]               = { .type = NLA_U32 },
2187         [RTA_PRIORITY]          = { .type = NLA_U32 },
2188         [RTA_METRICS]           = { .type = NLA_NESTED },
2189 };
2190
2191 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2192                               struct fib6_config *cfg)
2193 {
2194         struct rtmsg *rtm;
2195         struct nlattr *tb[RTA_MAX+1];
2196         int err;
2197
2198         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2199         if (err < 0)
2200                 goto errout;
2201
2202         err = -EINVAL;
2203         rtm = nlmsg_data(nlh);
2204         memset(cfg, 0, sizeof(*cfg));
2205
2206         cfg->fc_table = rtm->rtm_table;
2207         cfg->fc_dst_len = rtm->rtm_dst_len;
2208         cfg->fc_src_len = rtm->rtm_src_len;
2209         cfg->fc_flags = RTF_UP;
2210         cfg->fc_protocol = rtm->rtm_protocol;
2211
2212         if (rtm->rtm_type == RTN_UNREACHABLE)
2213                 cfg->fc_flags |= RTF_REJECT;
2214
2215         if (rtm->rtm_type == RTN_LOCAL)
2216                 cfg->fc_flags |= RTF_LOCAL;
2217
2218         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2219         cfg->fc_nlinfo.nlh = nlh;
2220         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2221
2222         if (tb[RTA_GATEWAY]) {
2223                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2224                 cfg->fc_flags |= RTF_GATEWAY;
2225         }
2226
2227         if (tb[RTA_DST]) {
2228                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2229
2230                 if (nla_len(tb[RTA_DST]) < plen)
2231                         goto errout;
2232
2233                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2234         }
2235
2236         if (tb[RTA_SRC]) {
2237                 int plen = (rtm->rtm_src_len + 7) >> 3;
2238
2239                 if (nla_len(tb[RTA_SRC]) < plen)
2240                         goto errout;
2241
2242                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2243         }
2244
2245         if (tb[RTA_PREFSRC])
2246                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2247
2248         if (tb[RTA_OIF])
2249                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2250
2251         if (tb[RTA_PRIORITY])
2252                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2253
2254         if (tb[RTA_METRICS]) {
2255                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2256                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2257         }
2258
2259         if (tb[RTA_TABLE])
2260                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2261
2262         err = 0;
2263 errout:
2264         return err;
2265 }
2266
2267 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2268 {
2269         struct fib6_config cfg;
2270         int err;
2271
2272         err = rtm_to_fib6_config(skb, nlh, &cfg);
2273         if (err < 0)
2274                 return err;
2275
2276         return ip6_route_del(&cfg);
2277 }
2278
2279 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2280 {
2281         struct fib6_config cfg;
2282         int err;
2283
2284         err = rtm_to_fib6_config(skb, nlh, &cfg);
2285         if (err < 0)
2286                 return err;
2287
2288         return ip6_route_add(&cfg);
2289 }
2290
2291 static inline size_t rt6_nlmsg_size(void)
2292 {
2293         return NLMSG_ALIGN(sizeof(struct rtmsg))
2294                + nla_total_size(16) /* RTA_SRC */
2295                + nla_total_size(16) /* RTA_DST */
2296                + nla_total_size(16) /* RTA_GATEWAY */
2297                + nla_total_size(16) /* RTA_PREFSRC */
2298                + nla_total_size(4) /* RTA_TABLE */
2299                + nla_total_size(4) /* RTA_IIF */
2300                + nla_total_size(4) /* RTA_OIF */
2301                + nla_total_size(4) /* RTA_PRIORITY */
2302                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2303                + nla_total_size(sizeof(struct rta_cacheinfo));
2304 }
2305
2306 static int rt6_fill_node(struct net *net,
2307                          struct sk_buff *skb, struct rt6_info *rt,
2308                          struct in6_addr *dst, struct in6_addr *src,
2309                          int iif, int type, u32 pid, u32 seq,
2310                          int prefix, int nowait, unsigned int flags)
2311 {
2312         struct rtmsg *rtm;
2313         struct nlmsghdr *nlh;
2314         long expires;
2315         u32 table;
2316
2317         if (prefix) {   /* user wants prefix routes only */
2318                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2319                         /* success since this is not a prefix route */
2320                         return 1;
2321                 }
2322         }
2323
2324         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2325         if (nlh == NULL)
2326                 return -EMSGSIZE;
2327
2328         rtm = nlmsg_data(nlh);
2329         rtm->rtm_family = AF_INET6;
2330         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2331         rtm->rtm_src_len = rt->rt6i_src.plen;
2332         rtm->rtm_tos = 0;
2333         if (rt->rt6i_table)
2334                 table = rt->rt6i_table->tb6_id;
2335         else
2336                 table = RT6_TABLE_UNSPEC;
2337         rtm->rtm_table = table;
2338         NLA_PUT_U32(skb, RTA_TABLE, table);
2339         if (rt->rt6i_flags&RTF_REJECT)
2340                 rtm->rtm_type = RTN_UNREACHABLE;
2341         else if (rt->rt6i_flags&RTF_LOCAL)
2342                 rtm->rtm_type = RTN_LOCAL;
2343         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2344                 rtm->rtm_type = RTN_LOCAL;
2345         else
2346                 rtm->rtm_type = RTN_UNICAST;
2347         rtm->rtm_flags = 0;
2348         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2349         rtm->rtm_protocol = rt->rt6i_protocol;
2350         if (rt->rt6i_flags&RTF_DYNAMIC)
2351                 rtm->rtm_protocol = RTPROT_REDIRECT;
2352         else if (rt->rt6i_flags & RTF_ADDRCONF)
2353                 rtm->rtm_protocol = RTPROT_KERNEL;
2354         else if (rt->rt6i_flags&RTF_DEFAULT)
2355                 rtm->rtm_protocol = RTPROT_RA;
2356
2357         if (rt->rt6i_flags&RTF_CACHE)
2358                 rtm->rtm_flags |= RTM_F_CLONED;
2359
2360         if (dst) {
2361                 NLA_PUT(skb, RTA_DST, 16, dst);
2362                 rtm->rtm_dst_len = 128;
2363         } else if (rtm->rtm_dst_len)
2364                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2365 #ifdef CONFIG_IPV6_SUBTREES
2366         if (src) {
2367                 NLA_PUT(skb, RTA_SRC, 16, src);
2368                 rtm->rtm_src_len = 128;
2369         } else if (rtm->rtm_src_len)
2370                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2371 #endif
2372         if (iif) {
2373 #ifdef CONFIG_IPV6_MROUTE
2374                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2375                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2376                         if (err <= 0) {
2377                                 if (!nowait) {
2378                                         if (err == 0)
2379                                                 return 0;
2380                                         goto nla_put_failure;
2381                                 } else {
2382                                         if (err == -EMSGSIZE)
2383                                                 goto nla_put_failure;
2384                                 }
2385                         }
2386                 } else
2387 #endif
2388                         NLA_PUT_U32(skb, RTA_IIF, iif);
2389         } else if (dst) {
2390                 struct in6_addr saddr_buf;
2391                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2392                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2393         }
2394
2395         if (rt->rt6i_prefsrc.plen) {
2396                 struct in6_addr saddr_buf;
2397                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2398                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2399         }
2400
2401         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2402                 goto nla_put_failure;
2403
2404         if (rt->dst.neighbour)
2405                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2406
2407         if (rt->dst.dev)
2408                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2409
2410         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2411
2412         if (!(rt->rt6i_flags & RTF_EXPIRES))
2413                 expires = 0;
2414         else if (rt->rt6i_expires - jiffies < INT_MAX)
2415                 expires = rt->rt6i_expires - jiffies;
2416         else
2417                 expires = INT_MAX;
2418
2419         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2420                                expires, rt->dst.error) < 0)
2421                 goto nla_put_failure;
2422
2423         return nlmsg_end(skb, nlh);
2424
2425 nla_put_failure:
2426         nlmsg_cancel(skb, nlh);
2427         return -EMSGSIZE;
2428 }
2429
2430 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2431 {
2432         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2433         int prefix;
2434
2435         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2436                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2437                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2438         } else
2439                 prefix = 0;
2440
2441         return rt6_fill_node(arg->net,
2442                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2443                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2444                      prefix, 0, NLM_F_MULTI);
2445 }
2446
2447 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2448 {
2449         struct net *net = sock_net(in_skb->sk);
2450         struct nlattr *tb[RTA_MAX+1];
2451         struct rt6_info *rt;
2452         struct sk_buff *skb;
2453         struct rtmsg *rtm;
2454         struct flowi6 fl6;
2455         int err, iif = 0;
2456
2457         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2458         if (err < 0)
2459                 goto errout;
2460
2461         err = -EINVAL;
2462         memset(&fl6, 0, sizeof(fl6));
2463
2464         if (tb[RTA_SRC]) {
2465                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2466                         goto errout;
2467
2468                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2469         }
2470
2471         if (tb[RTA_DST]) {
2472                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2473                         goto errout;
2474
2475                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2476         }
2477
2478         if (tb[RTA_IIF])
2479                 iif = nla_get_u32(tb[RTA_IIF]);
2480
2481         if (tb[RTA_OIF])
2482                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2483
2484         if (iif) {
2485                 struct net_device *dev;
2486                 dev = __dev_get_by_index(net, iif);
2487                 if (!dev) {
2488                         err = -ENODEV;
2489                         goto errout;
2490                 }
2491         }
2492
2493         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2494         if (skb == NULL) {
2495                 err = -ENOBUFS;
2496                 goto errout;
2497         }
2498
2499         /* Reserve room for dummy headers, this skb can pass
2500            through good chunk of routing engine.
2501          */
2502         skb_reset_mac_header(skb);
2503         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2504
2505         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2506         skb_dst_set(skb, &rt->dst);
2507
2508         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2509                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2510                             nlh->nlmsg_seq, 0, 0, 0);
2511         if (err < 0) {
2512                 kfree_skb(skb);
2513                 goto errout;
2514         }
2515
2516         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2517 errout:
2518         return err;
2519 }
2520
2521 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2522 {
2523         struct sk_buff *skb;
2524         struct net *net = info->nl_net;
2525         u32 seq;
2526         int err;
2527
2528         err = -ENOBUFS;
2529         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2530
2531         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2532         if (skb == NULL)
2533                 goto errout;
2534
2535         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2536                                 event, info->pid, seq, 0, 0, 0);
2537         if (err < 0) {
2538                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2539                 WARN_ON(err == -EMSGSIZE);
2540                 kfree_skb(skb);
2541                 goto errout;
2542         }
2543         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2544                     info->nlh, gfp_any());
2545         return;
2546 errout:
2547         if (err < 0)
2548                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2549 }
2550
2551 static int ip6_route_dev_notify(struct notifier_block *this,
2552                                 unsigned long event, void *data)
2553 {
2554         struct net_device *dev = (struct net_device *)data;
2555         struct net *net = dev_net(dev);
2556
2557         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2558                 net->ipv6.ip6_null_entry->dst.dev = dev;
2559                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2560 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2561                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2562                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2563                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2564                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2565 #endif
2566         }
2567
2568         return NOTIFY_OK;
2569 }
2570
2571 /*
2572  *      /proc
2573  */
2574
2575 #ifdef CONFIG_PROC_FS
2576
/*
 * Bookkeeping for a buffer-based /proc read.
 * NOTE(review): nothing in the visible part of this file uses this
 * structure (the route listing below goes through seq_file); confirm
 * whether it is still needed.
 */
struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2585
2586 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2587 {
2588         struct seq_file *m = p_arg;
2589
2590         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2591
2592 #ifdef CONFIG_IPV6_SUBTREES
2593         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2594 #else
2595         seq_puts(m, "00000000000000000000000000000000 00 ");
2596 #endif
2597
2598         if (rt->rt6i_nexthop) {
2599                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2600         } else {
2601                 seq_puts(m, "00000000000000000000000000000000");
2602         }
2603         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2604                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2605                    rt->dst.__use, rt->rt6i_flags,
2606                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2607         return 0;
2608 }
2609
2610 static int ipv6_route_show(struct seq_file *m, void *v)
2611 {
2612         struct net *net = (struct net *)m->private;
2613         fib6_clean_all(net, rt6_info_route, 0, m);
2614         return 0;
2615 }
2616
2617 static int ipv6_route_open(struct inode *inode, struct file *file)
2618 {
2619         return single_open_net(inode, file, ipv6_route_show);
2620 }
2621
2622 static const struct file_operations ipv6_route_proc_fops = {
2623         .owner          = THIS_MODULE,
2624         .open           = ipv6_route_open,
2625         .read           = seq_read,
2626         .llseek         = seq_lseek,
2627         .release        = single_release_net,
2628 };
2629
2630 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2631 {
2632         struct net *net = (struct net *)seq->private;
2633         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2634                    net->ipv6.rt6_stats->fib_nodes,
2635                    net->ipv6.rt6_stats->fib_route_nodes,
2636                    net->ipv6.rt6_stats->fib_rt_alloc,
2637                    net->ipv6.rt6_stats->fib_rt_entries,
2638                    net->ipv6.rt6_stats->fib_rt_cache,
2639                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2640                    net->ipv6.rt6_stats->fib_discarded_routes);
2641
2642         return 0;
2643 }
2644
2645 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2646 {
2647         return single_open_net(inode, file, rt6_stats_seq_show);
2648 }
2649
2650 static const struct file_operations rt6_stats_seq_fops = {
2651         .owner   = THIS_MODULE,
2652         .open    = rt6_stats_seq_open,
2653         .read    = seq_read,
2654         .llseek  = seq_lseek,
2655         .release = single_release_net,
2656 };
2657 #endif  /* CONFIG_PROC_FS */
2658
2659 #ifdef CONFIG_SYSCTL
2660
2661 static
2662 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2663                               void __user *buffer, size_t *lenp, loff_t *ppos)
2664 {
2665         struct net *net;
2666         int delay;
2667         if (!write)
2668                 return -EINVAL;
2669
2670         net = (struct net *)ctl->extra1;
2671         delay = net->ipv6.sysctl.flush_delay;
2672         proc_dointvec(ctl, write, buffer, lenp, ppos);
2673         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2674         return 0;
2675 }
2676
2677 ctl_table ipv6_route_table_template[] = {
2678         {
2679                 .procname       =       "flush",
2680                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2681                 .maxlen         =       sizeof(int),
2682                 .mode           =       0200,
2683                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2684         },
2685         {
2686                 .procname       =       "gc_thresh",
2687                 .data           =       &ip6_dst_ops_template.gc_thresh,
2688                 .maxlen         =       sizeof(int),
2689                 .mode           =       0644,
2690                 .proc_handler   =       proc_dointvec,
2691         },
2692         {
2693                 .procname       =       "max_size",
2694                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2695                 .maxlen         =       sizeof(int),
2696                 .mode           =       0644,
2697                 .proc_handler   =       proc_dointvec,
2698         },
2699         {
2700                 .procname       =       "gc_min_interval",
2701                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2702                 .maxlen         =       sizeof(int),
2703                 .mode           =       0644,
2704                 .proc_handler   =       proc_dointvec_jiffies,
2705         },
2706         {
2707                 .procname       =       "gc_timeout",
2708                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2709                 .maxlen         =       sizeof(int),
2710                 .mode           =       0644,
2711                 .proc_handler   =       proc_dointvec_jiffies,
2712         },
2713         {
2714                 .procname       =       "gc_interval",
2715                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2716                 .maxlen         =       sizeof(int),
2717                 .mode           =       0644,
2718                 .proc_handler   =       proc_dointvec_jiffies,
2719         },
2720         {
2721                 .procname       =       "gc_elasticity",
2722                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2723                 .maxlen         =       sizeof(int),
2724                 .mode           =       0644,
2725                 .proc_handler   =       proc_dointvec,
2726         },
2727         {
2728                 .procname       =       "mtu_expires",
2729                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2730                 .maxlen         =       sizeof(int),
2731                 .mode           =       0644,
2732                 .proc_handler   =       proc_dointvec_jiffies,
2733         },
2734         {
2735                 .procname       =       "min_adv_mss",
2736                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2737                 .maxlen         =       sizeof(int),
2738                 .mode           =       0644,
2739                 .proc_handler   =       proc_dointvec,
2740         },
2741         {
2742                 .procname       =       "gc_min_interval_ms",
2743                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2744                 .maxlen         =       sizeof(int),
2745                 .mode           =       0644,
2746                 .proc_handler   =       proc_dointvec_ms_jiffies,
2747         },
2748         { }
2749 };
2750
2751 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2752 {
2753         struct ctl_table *table;
2754
2755         table = kmemdup(ipv6_route_table_template,
2756                         sizeof(ipv6_route_table_template),
2757                         GFP_KERNEL);
2758
2759         if (table) {
2760                 table[0].data = &net->ipv6.sysctl.flush_delay;
2761                 table[0].extra1 = net;
2762                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2763                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2764                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2765                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2766                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2767                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2768                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2769                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2770                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2771         }
2772
2773         return table;
2774 }
2775 #endif
2776
2777 static int __net_init ip6_route_net_init(struct net *net)
2778 {
2779         int ret = -ENOMEM;
2780
2781         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2782                sizeof(net->ipv6.ip6_dst_ops));
2783
2784         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2785                 goto out_ip6_dst_ops;
2786
2787         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2788                                            sizeof(*net->ipv6.ip6_null_entry),
2789                                            GFP_KERNEL);
2790         if (!net->ipv6.ip6_null_entry)
2791                 goto out_ip6_dst_entries;
2792         net->ipv6.ip6_null_entry->dst.path =
2793                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2794         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2795         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2796                          ip6_template_metrics, true);
2797
2798 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2799         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2800                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2801                                                GFP_KERNEL);
2802         if (!net->ipv6.ip6_prohibit_entry)
2803                 goto out_ip6_null_entry;
2804         net->ipv6.ip6_prohibit_entry->dst.path =
2805                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2806         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2807         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2808                          ip6_template_metrics, true);
2809
2810         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2811                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2812                                                GFP_KERNEL);
2813         if (!net->ipv6.ip6_blk_hole_entry)
2814                 goto out_ip6_prohibit_entry;
2815         net->ipv6.ip6_blk_hole_entry->dst.path =
2816                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2817         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2818         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2819                          ip6_template_metrics, true);
2820 #endif
2821
2822         net->ipv6.sysctl.flush_delay = 0;
2823         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2824         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2825         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2826         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2827         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2828         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2829         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2830
2831 #ifdef CONFIG_PROC_FS
2832         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2833         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2834 #endif
2835         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2836
2837         ret = 0;
2838 out:
2839         return ret;
2840
2841 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2842 out_ip6_prohibit_entry:
2843         kfree(net->ipv6.ip6_prohibit_entry);
2844 out_ip6_null_entry:
2845         kfree(net->ipv6.ip6_null_entry);
2846 #endif
2847 out_ip6_dst_entries:
2848         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2849 out_ip6_dst_ops:
2850         goto out;
2851 }
2852
2853 static void __net_exit ip6_route_net_exit(struct net *net)
2854 {
2855 #ifdef CONFIG_PROC_FS
2856         proc_net_remove(net, "ipv6_route");
2857         proc_net_remove(net, "rt6_stats");
2858 #endif
2859         kfree(net->ipv6.ip6_null_entry);
2860 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2861         kfree(net->ipv6.ip6_prohibit_entry);
2862         kfree(net->ipv6.ip6_blk_hole_entry);
2863 #endif
2864         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2865 }
2866
2867 static struct pernet_operations ip6_route_net_ops = {
2868         .init = ip6_route_net_init,
2869         .exit = ip6_route_net_exit,
2870 };
2871
2872 static struct notifier_block ip6_route_dev_notifier = {
2873         .notifier_call = ip6_route_dev_notify,
2874         .priority = 0,
2875 };
2876
2877 int __init ip6_route_init(void)
2878 {
2879         int ret;
2880
2881         ret = -ENOMEM;
2882         ip6_dst_ops_template.kmem_cachep =
2883                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2884                                   SLAB_HWCACHE_ALIGN, NULL);
2885         if (!ip6_dst_ops_template.kmem_cachep)
2886                 goto out;
2887
2888         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2889         if (ret)
2890                 goto out_kmem_cache;
2891
2892         ret = register_pernet_subsys(&ip6_route_net_ops);
2893         if (ret)
2894                 goto out_dst_entries;
2895
2896         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2897
2898         /* Registering of the loopback is done before this portion of code,
2899          * the loopback reference in rt6_info will not be taken, do it
2900          * manually for init_net */
2901         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2902         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2903   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2904         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2905         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2906         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2907         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2908   #endif
2909         ret = fib6_init();
2910         if (ret)
2911                 goto out_register_subsys;
2912
2913         ret = xfrm6_init();
2914         if (ret)
2915                 goto out_fib6_init;
2916
2917         ret = fib6_rules_init();
2918         if (ret)
2919                 goto xfrm6_init;
2920
2921         ret = -ENOBUFS;
2922         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2923             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2924             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2925                 goto fib6_rules_init;
2926
2927         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2928         if (ret)
2929                 goto fib6_rules_init;
2930
2931 out:
2932         return ret;
2933
2934 fib6_rules_init:
2935         fib6_rules_cleanup();
2936 xfrm6_init:
2937         xfrm6_fini();
2938 out_fib6_init:
2939         fib6_gc_cleanup();
2940 out_register_subsys:
2941         unregister_pernet_subsys(&ip6_route_net_ops);
2942 out_dst_entries:
2943         dst_entries_destroy(&ip6_dst_blackhole_ops);
2944 out_kmem_cache:
2945         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2946         goto out;
2947 }
2948
2949 void ip6_route_cleanup(void)
2950 {
2951         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2952         fib6_rules_cleanup();
2953         xfrm6_fini();
2954         fib6_gc_cleanup();
2955         unregister_pernet_subsys(&ip6_route_net_ops);
2956         dst_entries_destroy(&ip6_dst_blackhole_ops);
2957         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2958 }