Merge branch 'stable/swiotlb-0.9' of git://git.kernel.org/pub/scm/linux/kernel/git...
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  RDBG() and RT6_TRACE() expand to printk()
 * calls only when RT6_DEBUG is 3 or higher; at the default level of 2 both
 * expand to nothing / an empty statement.
 */
/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
/* RDBG takes its whole argument list in one extra pair of parentheses. */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

/*
 * Compile-time switch read by ip6_pol_route(): when non-zero, routes that do
 * not go through rt6_alloc_cow() are cloned with rt6_alloc_clone(); when zero
 * they are returned without cloning.
 */
#define CLONE_OFFLINK_ROUTE 0
76
/*
 * Forward declarations for file-local helpers that are referenced (by the
 * dst_ops tables and the route templates below) before they are defined.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Route Information option helpers, used by rt6_route_rcv(). */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex);
#endif
99
/*
 * dst_ops callback table for IPv6 routes.  Each generic dst hook is wired to
 * the ip6_* handler declared above; local output goes to __ip6_local_out.
 * NOTE(review): the "_template" name and the use of net->ipv6.ip6_dst_ops
 * elsewhere in this file suggest this is copied per network namespace --
 * confirm in the init code, which is not visible here.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .local_out              =       __ip6_local_out,
        .entries                =       ATOMIC_INIT(0),
};
114
/*
 * update_pmtu hook for ip6_dst_blackhole_ops: deliberately does nothing, so
 * a PMTU update on a blackhole dst leaves its metrics untouched.
 */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
118
/*
 * dst_ops used for the entries allocated by ip6_dst_blackhole().  It shares
 * the destroy/check handlers with normal IPv6 routes, replaces update_pmtu
 * with a no-op, and leaves the remaining hooks unset.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .entries                =       ATOMIC_INIT(0),
};
127
/*
 * Initial contents of the "null" route: a reject route with error
 * -ENETUNREACH whose input/output handlers are the ip6_pkt_discard pair,
 * hop limit metric 255 and the largest possible route metric.
 * NOTE(review): presumably the source for net->ipv6.ip6_null_entry, which the
 * lookup code in this file returns when nothing matches -- confirm in the
 * per-namespace init code.
 */
static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -ENETUNREACH,
                .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
143
144 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
145
/*
 * Input/output handlers installed in ip6_prohibit_entry_template below;
 * declared here because their definitions come later in the file.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
148
/*
 * Initial contents of the "prohibit" route (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): same layout as the null entry template, but
 * with error -EACCES and the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EACCES,
                .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
164
/*
 * Initial contents of the "blackhole" route (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): error -EINVAL, and both input and output go
 * straight to dst_discard.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EINVAL,
                .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
                .input          = dst_discard,
                .output         = dst_discard,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
180
181 #endif
182
/*
 * Allocate a dst entry from @ops and return it typed as an rt6_info.
 * The result of dst_alloc() is passed through unchanged (including NULL).
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
        struct rt6_info *rt;

        rt = (struct rt6_info *)dst_alloc(ops);
        return rt;
}
188
189 static void ip6_dst_destroy(struct dst_entry *dst)
190 {
191         struct rt6_info *rt = (struct rt6_info *)dst;
192         struct inet6_dev *idev = rt->rt6i_idev;
193
194         if (idev != NULL) {
195                 rt->rt6i_idev = NULL;
196                 in6_dev_put(idev);
197         }
198 }
199
200 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
201                            int how)
202 {
203         struct rt6_info *rt = (struct rt6_info *)dst;
204         struct inet6_dev *idev = rt->rt6i_idev;
205         struct net_device *loopback_dev =
206                 dev_net(dev)->loopback_dev;
207
208         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
209                 struct inet6_dev *loopback_idev =
210                         in6_dev_get(loopback_dev);
211                 if (loopback_idev != NULL) {
212                         rt->rt6i_idev = loopback_idev;
213                         in6_dev_put(idev);
214                 }
215         }
216 }
217
218 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
219 {
220         return (rt->rt6i_flags & RTF_EXPIRES &&
221                 time_after(jiffies, rt->rt6i_expires));
222 }
223
224 static inline int rt6_need_strict(struct in6_addr *daddr)
225 {
226         return (ipv6_addr_type(daddr) &
227                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
228 }
229
230 /*
231  *      Route lookup. Any table->tb6_lock is implied.
232  */
233
234 static inline struct rt6_info *rt6_device_match(struct net *net,
235                                                     struct rt6_info *rt,
236                                                     struct in6_addr *saddr,
237                                                     int oif,
238                                                     int flags)
239 {
240         struct rt6_info *local = NULL;
241         struct rt6_info *sprt;
242
243         if (!oif && ipv6_addr_any(saddr))
244                 goto out;
245
246         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
247                 struct net_device *dev = sprt->rt6i_dev;
248
249                 if (oif) {
250                         if (dev->ifindex == oif)
251                                 return sprt;
252                         if (dev->flags & IFF_LOOPBACK) {
253                                 if (sprt->rt6i_idev == NULL ||
254                                     sprt->rt6i_idev->dev->ifindex != oif) {
255                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
256                                                 continue;
257                                         if (local && (!oif ||
258                                                       local->rt6i_idev->dev->ifindex == oif))
259                                                 continue;
260                                 }
261                                 local = sprt;
262                         }
263                 } else {
264                         if (ipv6_chk_addr(net, saddr, dev,
265                                           flags & RT6_LOOKUP_F_IFACE))
266                                 return sprt;
267                 }
268         }
269
270         if (oif) {
271                 if (local)
272                         return local;
273
274                 if (flags & RT6_LOOKUP_F_IFACE)
275                         return net->ipv6.ip6_null_entry;
276         }
277 out:
278         return rt;
279 }
280
281 #ifdef CONFIG_IPV6_ROUTER_PREF
282 static void rt6_probe(struct rt6_info *rt)
283 {
284         struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
285         /*
286          * Okay, this does not seem to be appropriate
287          * for now, however, we need to check if it
288          * is really so; aka Router Reachability Probing.
289          *
290          * Router Reachability Probe MUST be rate-limited
291          * to no more than one per minute.
292          */
293         if (!neigh || (neigh->nud_state & NUD_VALID))
294                 return;
295         read_lock_bh(&neigh->lock);
296         if (!(neigh->nud_state & NUD_VALID) &&
297             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
298                 struct in6_addr mcaddr;
299                 struct in6_addr *target;
300
301                 neigh->updated = jiffies;
302                 read_unlock_bh(&neigh->lock);
303
304                 target = (struct in6_addr *)&neigh->primary_key;
305                 addrconf_addr_solict_mult(target, &mcaddr);
306                 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
307         } else
308                 read_unlock_bh(&neigh->lock);
309 }
310 #else
/*
 * Stub used when CONFIG_IPV6_ROUTER_PREF is not set: router reachability
 * probing is compiled out and calls become no-ops.
 */
static inline void rt6_probe(struct rt6_info *rt)
{
}
314 #endif
315
316 /*
317  * Default Router Selection (RFC 2461 6.3.6)
318  */
319 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
320 {
321         struct net_device *dev = rt->rt6i_dev;
322         if (!oif || dev->ifindex == oif)
323                 return 2;
324         if ((dev->flags & IFF_LOOPBACK) &&
325             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
326                 return 1;
327         return 0;
328 }
329
330 static inline int rt6_check_neigh(struct rt6_info *rt)
331 {
332         struct neighbour *neigh = rt->rt6i_nexthop;
333         int m;
334         if (rt->rt6i_flags & RTF_NONEXTHOP ||
335             !(rt->rt6i_flags & RTF_GATEWAY))
336                 m = 1;
337         else if (neigh) {
338                 read_lock_bh(&neigh->lock);
339                 if (neigh->nud_state & NUD_VALID)
340                         m = 2;
341 #ifdef CONFIG_IPV6_ROUTER_PREF
342                 else if (neigh->nud_state & NUD_FAILED)
343                         m = 0;
344 #endif
345                 else
346                         m = 1;
347                 read_unlock_bh(&neigh->lock);
348         } else
349                 m = 0;
350         return m;
351 }
352
353 static int rt6_score_route(struct rt6_info *rt, int oif,
354                            int strict)
355 {
356         int m, n;
357
358         m = rt6_check_dev(rt, oif);
359         if (!m && (strict & RT6_LOOKUP_F_IFACE))
360                 return -1;
361 #ifdef CONFIG_IPV6_ROUTER_PREF
362         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
363 #endif
364         n = rt6_check_neigh(rt);
365         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
366                 return -1;
367         return m;
368 }
369
370 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
371                                    int *mpri, struct rt6_info *match)
372 {
373         int m;
374
375         if (rt6_check_expired(rt))
376                 goto out;
377
378         m = rt6_score_route(rt, oif, strict);
379         if (m < 0)
380                 goto out;
381
382         if (m > *mpri) {
383                 if (strict & RT6_LOOKUP_F_REACHABLE)
384                         rt6_probe(match);
385                 *mpri = m;
386                 match = rt;
387         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
388                 rt6_probe(rt);
389         }
390
391 out:
392         return match;
393 }
394
395 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
396                                      struct rt6_info *rr_head,
397                                      u32 metric, int oif, int strict)
398 {
399         struct rt6_info *rt, *match;
400         int mpri = -1;
401
402         match = NULL;
403         for (rt = rr_head; rt && rt->rt6i_metric == metric;
404              rt = rt->dst.rt6_next)
405                 match = find_match(rt, oif, strict, &mpri, match);
406         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
407              rt = rt->dst.rt6_next)
408                 match = find_match(rt, oif, strict, &mpri, match);
409
410         return match;
411 }
412
413 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
414 {
415         struct rt6_info *match, *rt0;
416         struct net *net;
417
418         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
419                   __func__, fn->leaf, oif);
420
421         rt0 = fn->rr_ptr;
422         if (!rt0)
423                 fn->rr_ptr = rt0 = fn->leaf;
424
425         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
426
427         if (!match &&
428             (strict & RT6_LOOKUP_F_REACHABLE)) {
429                 struct rt6_info *next = rt0->dst.rt6_next;
430
431                 /* no entries matched; do round-robin */
432                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
433                         next = fn->leaf;
434
435                 if (next != rt0)
436                         fn->rr_ptr = next;
437         }
438
439         RT6_TRACE("%s() => %p\n",
440                   __func__, match);
441
442         net = dev_net(rt0->rt6i_dev);
443         return (match ? match : net->ipv6.ip6_null_entry);
444 }
445
446 #ifdef CONFIG_IPV6_ROUTE_INFO
447 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
448                   struct in6_addr *gwaddr)
449 {
450         struct net *net = dev_net(dev);
451         struct route_info *rinfo = (struct route_info *) opt;
452         struct in6_addr prefix_buf, *prefix;
453         unsigned int pref;
454         unsigned long lifetime;
455         struct rt6_info *rt;
456
457         if (len < sizeof(struct route_info)) {
458                 return -EINVAL;
459         }
460
461         /* Sanity check for prefix_len and length */
462         if (rinfo->length > 3) {
463                 return -EINVAL;
464         } else if (rinfo->prefix_len > 128) {
465                 return -EINVAL;
466         } else if (rinfo->prefix_len > 64) {
467                 if (rinfo->length < 2) {
468                         return -EINVAL;
469                 }
470         } else if (rinfo->prefix_len > 0) {
471                 if (rinfo->length < 1) {
472                         return -EINVAL;
473                 }
474         }
475
476         pref = rinfo->route_pref;
477         if (pref == ICMPV6_ROUTER_PREF_INVALID)
478                 return -EINVAL;
479
480         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
481
482         if (rinfo->length == 3)
483                 prefix = (struct in6_addr *)rinfo->prefix;
484         else {
485                 /* this function is safe */
486                 ipv6_addr_prefix(&prefix_buf,
487                                  (struct in6_addr *)rinfo->prefix,
488                                  rinfo->prefix_len);
489                 prefix = &prefix_buf;
490         }
491
492         rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
493                                 dev->ifindex);
494
495         if (rt && !lifetime) {
496                 ip6_del_rt(rt);
497                 rt = NULL;
498         }
499
500         if (!rt && lifetime)
501                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
502                                         pref);
503         else if (rt)
504                 rt->rt6i_flags = RTF_ROUTEINFO |
505                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
506
507         if (rt) {
508                 if (!addrconf_finite_timeout(lifetime)) {
509                         rt->rt6i_flags &= ~RTF_EXPIRES;
510                 } else {
511                         rt->rt6i_expires = jiffies + HZ * lifetime;
512                         rt->rt6i_flags |= RTF_EXPIRES;
513                 }
514                 dst_release(&rt->dst);
515         }
516         return 0;
517 }
518 #endif
519
/*
 * BACKTRACK(__net, saddr) - climb the fib tree after a failed selection.
 *
 * Must be expanded inside a function that has local variables `rt` and `fn`
 * and labels `out` and `restart`.  It acts only when `rt` is the null entry
 * of @__net:
 *   - if `fn` is a top-level root, jump to `out`;
 *   - otherwise step to the parent; if the parent has a subtree other than
 *     `fn`, look @saddr up in that subtree and continue from the result;
 *   - as soon as the new `fn` carries route info (RTN_RTINFO), jump to
 *     `restart` so the caller retries the selection on that node.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
537
/*
 * Simple per-table route lookup for flow @fl, passed to fib6_rule_lookup()
 * by rt6_lookup().
 *
 * Finds the fib node for the flow's destination/source, picks the entry that
 * matches the flow's source address / output interface, and backtracks
 * towards the root while only the null entry is found.  The whole walk runs
 * under the table read lock.
 *
 * Returns the chosen route (possibly the null entry) after dst_use() has
 * been called on it.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi *fl, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
restart:
        /* BACKTRACK() jumps back here with fn moved to another node. */
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
        BACKTRACK(net, &fl->fl6_src);
out:
        /* Reached by falling through, or from BACKTRACK() at the root. */
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);
        return rt;

}
557
558 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
559                             const struct in6_addr *saddr, int oif, int strict)
560 {
561         struct flowi fl = {
562                 .oif = oif,
563                 .nl_u = {
564                         .ip6_u = {
565                                 .daddr = *daddr,
566                         },
567                 },
568         };
569         struct dst_entry *dst;
570         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
571
572         if (saddr) {
573                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
574                 flags |= RT6_LOOKUP_F_HAS_SADDR;
575         }
576
577         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
578         if (dst->error == 0)
579                 return (struct rt6_info *) dst;
580
581         dst_release(dst);
582
583         return NULL;
584 }
585
586 EXPORT_SYMBOL(rt6_lookup);
587
588 /* ip6_ins_rt is called with FREE table->tb6_lock.
589    It takes new route entry, the addition fails by any reason the
590    route is freed. In any case, if caller does not hold it, it may
591    be destroyed.
592  */
593
594 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
595 {
596         int err;
597         struct fib6_table *table;
598
599         table = rt->rt6i_table;
600         write_lock_bh(&table->tb6_lock);
601         err = fib6_add(&table->tb6_root, rt, info);
602         write_unlock_bh(&table->tb6_lock);
603
604         return err;
605 }
606
607 int ip6_ins_rt(struct rt6_info *rt)
608 {
609         struct nl_info info = {
610                 .nl_net = dev_net(rt->rt6i_dev),
611         };
612         return __ip6_ins_rt(rt, &info);
613 }
614
615 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
616                                       struct in6_addr *saddr)
617 {
618         struct rt6_info *rt;
619
620         /*
621          *      Clone the route.
622          */
623
624         rt = ip6_rt_copy(ort);
625
626         if (rt) {
627                 struct neighbour *neigh;
628                 int attempts = !in_softirq();
629
630                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
631                         if (rt->rt6i_dst.plen != 128 &&
632                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
633                                 rt->rt6i_flags |= RTF_ANYCAST;
634                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
635                 }
636
637                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
638                 rt->rt6i_dst.plen = 128;
639                 rt->rt6i_flags |= RTF_CACHE;
640                 rt->dst.flags |= DST_HOST;
641
642 #ifdef CONFIG_IPV6_SUBTREES
643                 if (rt->rt6i_src.plen && saddr) {
644                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
645                         rt->rt6i_src.plen = 128;
646                 }
647 #endif
648
649         retry:
650                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
651                 if (IS_ERR(neigh)) {
652                         struct net *net = dev_net(rt->rt6i_dev);
653                         int saved_rt_min_interval =
654                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
655                         int saved_rt_elasticity =
656                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
657
658                         if (attempts-- > 0) {
659                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
660                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
661
662                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
663
664                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
665                                         saved_rt_elasticity;
666                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
667                                         saved_rt_min_interval;
668                                 goto retry;
669                         }
670
671                         if (net_ratelimit())
672                                 printk(KERN_WARNING
673                                        "ipv6: Neighbour table overflow.\n");
674                         dst_free(&rt->dst);
675                         return NULL;
676                 }
677                 rt->rt6i_nexthop = neigh;
678
679         }
680
681         return rt;
682 }
683
684 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
685 {
686         struct rt6_info *rt = ip6_rt_copy(ort);
687         if (rt) {
688                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
689                 rt->rt6i_dst.plen = 128;
690                 rt->rt6i_flags |= RTF_CACHE;
691                 rt->dst.flags |= DST_HOST;
692                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
693         }
694         return rt;
695 }
696
/*
 * Core per-table route resolution, shared by the input and output paths.
 *
 * @oif:   interface index used for selection (the callers pass fl->iif for
 *         input and fl->oif for output)
 * @fl:    flow being routed
 * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is consumed here
 *
 * Selects a route with rt6_select(), backtracking towards the root on
 * failure.  When forwarding is disabled the first pass additionally demands
 * RT6_LOOKUP_F_REACHABLE; if that pass ends at `out`, the lookup is repeated
 * once without it.  A selected route that has no nexthop (and is not
 * RTF_NONEXTHOP) is cloned with rt6_alloc_cow() and the clone is inserted
 * into the table; if insertion fails the lookup is retried, up to three
 * attempts in total.
 *
 * Returns a route with a reference held (possibly the null entry); its
 * lastuse/__use statistics are updated.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi *fl, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);

restart:
        /* BACKTRACK() re-enters here with fn moved, or jumps to out. */
        rt = rt6_select(fn, oif, strict | reachable);

        BACKTRACK(net, &fl->fl6_src);
        /* Null entry and cached clones are handed out without cloning. */
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* Pin rt, then drop the table lock before cloning. */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
        else {
#if CLONE_OFFLINK_ROUTE
                nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
#else
                /* Return rt itself; the reference taken above is kept. */
                goto out2;
#endif
        }

        /* Switch from the original to the clone (or the null entry). */
        dst_release(&rt->dst);
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /* Still under the table lock here. */
        if (reachable) {
                /* Retry once without the reachability requirement. */
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
769
770 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
771                                             struct flowi *fl, int flags)
772 {
773         return ip6_pol_route(net, table, fl->iif, fl, flags);
774 }
775
776 void ip6_route_input(struct sk_buff *skb)
777 {
778         struct ipv6hdr *iph = ipv6_hdr(skb);
779         struct net *net = dev_net(skb->dev);
780         int flags = RT6_LOOKUP_F_HAS_SADDR;
781         struct flowi fl = {
782                 .iif = skb->dev->ifindex,
783                 .nl_u = {
784                         .ip6_u = {
785                                 .daddr = iph->daddr,
786                                 .saddr = iph->saddr,
787                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
788                         },
789                 },
790                 .mark = skb->mark,
791                 .proto = iph->nexthdr,
792         };
793
794         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
795                 flags |= RT6_LOOKUP_F_IFACE;
796
797         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
798 }
799
800 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
801                                              struct flowi *fl, int flags)
802 {
803         return ip6_pol_route(net, table, fl->oif, fl, flags);
804 }
805
806 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
807                                     struct flowi *fl)
808 {
809         int flags = 0;
810
811         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
812                 flags |= RT6_LOOKUP_F_IFACE;
813
814         if (!ipv6_addr_any(&fl->fl6_src))
815                 flags |= RT6_LOOKUP_F_HAS_SADDR;
816         else if (sk)
817                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
818
819         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
820 }
821
822 EXPORT_SYMBOL(ip6_route_output);
823
824 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
825 {
826         struct rt6_info *ort = (struct rt6_info *) *dstp;
827         struct rt6_info *rt = (struct rt6_info *)
828                 dst_alloc(&ip6_dst_blackhole_ops);
829         struct dst_entry *new = NULL;
830
831         if (rt) {
832                 new = &rt->dst;
833
834                 atomic_set(&new->__refcnt, 1);
835                 new->__use = 1;
836                 new->input = dst_discard;
837                 new->output = dst_discard;
838
839                 memcpy(new->metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
840                 new->dev = ort->dst.dev;
841                 if (new->dev)
842                         dev_hold(new->dev);
843                 rt->rt6i_idev = ort->rt6i_idev;
844                 if (rt->rt6i_idev)
845                         in6_dev_hold(rt->rt6i_idev);
846                 rt->rt6i_expires = 0;
847
848                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
849                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
850                 rt->rt6i_metric = 0;
851
852                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
853 #ifdef CONFIG_IPV6_SUBTREES
854                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
855 #endif
856
857                 dst_free(new);
858         }
859
860         dst_release(*dstp);
861         *dstp = new;
862         return (new ? 0 : -ENOMEM);
863 }
864 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
865
866 /*
867  *      Destination cache support functions
868  */
869
870 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
871 {
872         struct rt6_info *rt;
873
874         rt = (struct rt6_info *) dst;
875
876         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
877                 return dst;
878
879         return NULL;
880 }
881
882 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
883 {
884         struct rt6_info *rt = (struct rt6_info *) dst;
885
886         if (rt) {
887                 if (rt->rt6i_flags & RTF_CACHE) {
888                         if (rt6_check_expired(rt)) {
889                                 ip6_del_rt(rt);
890                                 dst = NULL;
891                         }
892                 } else {
893                         dst_release(dst);
894                         dst = NULL;
895                 }
896         }
897         return dst;
898 }
899
900 static void ip6_link_failure(struct sk_buff *skb)
901 {
902         struct rt6_info *rt;
903
904         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
905
906         rt = (struct rt6_info *) skb_dst(skb);
907         if (rt) {
908                 if (rt->rt6i_flags&RTF_CACHE) {
909                         dst_set_expires(&rt->dst, 0);
910                         rt->rt6i_flags |= RTF_EXPIRES;
911                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
912                         rt->rt6i_node->fn_sernum = -1;
913         }
914 }
915
916 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
917 {
918         struct rt6_info *rt6 = (struct rt6_info*)dst;
919
920         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
921                 rt6->rt6i_flags |= RTF_MODIFIED;
922                 if (mtu < IPV6_MIN_MTU) {
923                         mtu = IPV6_MIN_MTU;
924                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
925                 }
926                 dst->metrics[RTAX_MTU-1] = mtu;
927                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
928         }
929 }
930
931 static int ipv6_get_mtu(struct net_device *dev);
932
933 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
934 {
935         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
936
937         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
938                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
939
940         /*
941          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
942          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
943          * IPV6_MAXPLEN is also valid and means: "any MSS,
944          * rely only on pmtu discovery"
945          */
946         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
947                 mtu = IPV6_MAXPLEN;
948         return mtu;
949 }
950
951 static struct dst_entry *icmp6_dst_gc_list;
952 static DEFINE_SPINLOCK(icmp6_dst_lock);
953
954 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
955                                   struct neighbour *neigh,
956                                   const struct in6_addr *addr)
957 {
958         struct rt6_info *rt;
959         struct inet6_dev *idev = in6_dev_get(dev);
960         struct net *net = dev_net(dev);
961
962         if (unlikely(idev == NULL))
963                 return NULL;
964
965         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
966         if (unlikely(rt == NULL)) {
967                 in6_dev_put(idev);
968                 goto out;
969         }
970
971         dev_hold(dev);
972         if (neigh)
973                 neigh_hold(neigh);
974         else {
975                 neigh = ndisc_get_neigh(dev, addr);
976                 if (IS_ERR(neigh))
977                         neigh = NULL;
978         }
979
980         rt->rt6i_dev      = dev;
981         rt->rt6i_idev     = idev;
982         rt->rt6i_nexthop  = neigh;
983         atomic_set(&rt->dst.__refcnt, 1);
984         rt->dst.metrics[RTAX_HOPLIMIT-1] = 255;
985         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
986         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
987         rt->dst.output  = ip6_output;
988
989 #if 0   /* there's no chance to use these for ndisc */
990         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
991                                 ? DST_HOST
992                                 : 0;
993         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
994         rt->rt6i_dst.plen = 128;
995 #endif
996
997         spin_lock_bh(&icmp6_dst_lock);
998         rt->dst.next = icmp6_dst_gc_list;
999         icmp6_dst_gc_list = &rt->dst;
1000         spin_unlock_bh(&icmp6_dst_lock);
1001
1002         fib6_force_start_gc(net);
1003
1004 out:
1005         return &rt->dst;
1006 }
1007
1008 int icmp6_dst_gc(void)
1009 {
1010         struct dst_entry *dst, *next, **pprev;
1011         int more = 0;
1012
1013         next = NULL;
1014
1015         spin_lock_bh(&icmp6_dst_lock);
1016         pprev = &icmp6_dst_gc_list;
1017
1018         while ((dst = *pprev) != NULL) {
1019                 if (!atomic_read(&dst->__refcnt)) {
1020                         *pprev = dst->next;
1021                         dst_free(dst);
1022                 } else {
1023                         pprev = &dst->next;
1024                         ++more;
1025                 }
1026         }
1027
1028         spin_unlock_bh(&icmp6_dst_lock);
1029
1030         return more;
1031 }
1032
1033 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1034                             void *arg)
1035 {
1036         struct dst_entry *dst, **pprev;
1037
1038         spin_lock_bh(&icmp6_dst_lock);
1039         pprev = &icmp6_dst_gc_list;
1040         while ((dst = *pprev) != NULL) {
1041                 struct rt6_info *rt = (struct rt6_info *) dst;
1042                 if (func(rt, arg)) {
1043                         *pprev = dst->next;
1044                         dst_free(dst);
1045                 } else {
1046                         pprev = &dst->next;
1047                 }
1048         }
1049         spin_unlock_bh(&icmp6_dst_lock);
1050 }
1051
1052 static int ip6_dst_gc(struct dst_ops *ops)
1053 {
1054         unsigned long now = jiffies;
1055         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1056         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1057         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1058         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1059         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1060         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1061
1062         if (time_after(rt_last_gc + rt_min_interval, now) &&
1063             atomic_read(&ops->entries) <= rt_max_size)
1064                 goto out;
1065
1066         net->ipv6.ip6_rt_gc_expire++;
1067         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1068         net->ipv6.ip6_rt_last_gc = now;
1069         if (atomic_read(&ops->entries) < ops->gc_thresh)
1070                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1071 out:
1072         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1073         return (atomic_read(&ops->entries) > rt_max_size);
1074 }
1075
1076 /* Clean host part of a prefix. Not necessary in radix tree,
1077    but results in cleaner routing tables.
1078
1079    Remove it only when all the things will work!
1080  */
1081
1082 static int ipv6_get_mtu(struct net_device *dev)
1083 {
1084         int mtu = IPV6_MIN_MTU;
1085         struct inet6_dev *idev;
1086
1087         rcu_read_lock();
1088         idev = __in6_dev_get(dev);
1089         if (idev)
1090                 mtu = idev->cnf.mtu6;
1091         rcu_read_unlock();
1092         return mtu;
1093 }
1094
1095 int ip6_dst_hoplimit(struct dst_entry *dst)
1096 {
1097         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1098         if (hoplimit < 0) {
1099                 struct net_device *dev = dst->dev;
1100                 struct inet6_dev *idev;
1101
1102                 rcu_read_lock();
1103                 idev = __in6_dev_get(dev);
1104                 if (idev)
1105                         hoplimit = idev->cnf.hop_limit;
1106                 else
1107                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1108                 rcu_read_unlock();
1109         }
1110         return hoplimit;
1111 }
1112
1113 /*
1114  *
1115  */
1116
1117 int ip6_route_add(struct fib6_config *cfg)
1118 {
1119         int err;
1120         struct net *net = cfg->fc_nlinfo.nl_net;
1121         struct rt6_info *rt = NULL;
1122         struct net_device *dev = NULL;
1123         struct inet6_dev *idev = NULL;
1124         struct fib6_table *table;
1125         int addr_type;
1126
1127         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1128                 return -EINVAL;
1129 #ifndef CONFIG_IPV6_SUBTREES
1130         if (cfg->fc_src_len)
1131                 return -EINVAL;
1132 #endif
1133         if (cfg->fc_ifindex) {
1134                 err = -ENODEV;
1135                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1136                 if (!dev)
1137                         goto out;
1138                 idev = in6_dev_get(dev);
1139                 if (!idev)
1140                         goto out;
1141         }
1142
1143         if (cfg->fc_metric == 0)
1144                 cfg->fc_metric = IP6_RT_PRIO_USER;
1145
1146         table = fib6_new_table(net, cfg->fc_table);
1147         if (table == NULL) {
1148                 err = -ENOBUFS;
1149                 goto out;
1150         }
1151
1152         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1153
1154         if (rt == NULL) {
1155                 err = -ENOMEM;
1156                 goto out;
1157         }
1158
1159         rt->dst.obsolete = -1;
1160         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1161                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1162                                 0;
1163
1164         if (cfg->fc_protocol == RTPROT_UNSPEC)
1165                 cfg->fc_protocol = RTPROT_BOOT;
1166         rt->rt6i_protocol = cfg->fc_protocol;
1167
1168         addr_type = ipv6_addr_type(&cfg->fc_dst);
1169
1170         if (addr_type & IPV6_ADDR_MULTICAST)
1171                 rt->dst.input = ip6_mc_input;
1172         else
1173                 rt->dst.input = ip6_forward;
1174
1175         rt->dst.output = ip6_output;
1176
1177         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1178         rt->rt6i_dst.plen = cfg->fc_dst_len;
1179         if (rt->rt6i_dst.plen == 128)
1180                rt->dst.flags = DST_HOST;
1181
1182 #ifdef CONFIG_IPV6_SUBTREES
1183         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1184         rt->rt6i_src.plen = cfg->fc_src_len;
1185 #endif
1186
1187         rt->rt6i_metric = cfg->fc_metric;
1188
1189         /* We cannot add true routes via loopback here,
1190            they would result in kernel looping; promote them to reject routes
1191          */
1192         if ((cfg->fc_flags & RTF_REJECT) ||
1193             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1194                 /* hold loopback dev/idev if we haven't done so. */
1195                 if (dev != net->loopback_dev) {
1196                         if (dev) {
1197                                 dev_put(dev);
1198                                 in6_dev_put(idev);
1199                         }
1200                         dev = net->loopback_dev;
1201                         dev_hold(dev);
1202                         idev = in6_dev_get(dev);
1203                         if (!idev) {
1204                                 err = -ENODEV;
1205                                 goto out;
1206                         }
1207                 }
1208                 rt->dst.output = ip6_pkt_discard_out;
1209                 rt->dst.input = ip6_pkt_discard;
1210                 rt->dst.error = -ENETUNREACH;
1211                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1212                 goto install_route;
1213         }
1214
1215         if (cfg->fc_flags & RTF_GATEWAY) {
1216                 struct in6_addr *gw_addr;
1217                 int gwa_type;
1218
1219                 gw_addr = &cfg->fc_gateway;
1220                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1221                 gwa_type = ipv6_addr_type(gw_addr);
1222
1223                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1224                         struct rt6_info *grt;
1225
1226                         /* IPv6 strictly inhibits using not link-local
1227                            addresses as nexthop address.
1228                            Otherwise, router will not able to send redirects.
1229                            It is very good, but in some (rare!) circumstances
1230                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1231                            some exceptions. --ANK
1232                          */
1233                         err = -EINVAL;
1234                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1235                                 goto out;
1236
1237                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1238
1239                         err = -EHOSTUNREACH;
1240                         if (grt == NULL)
1241                                 goto out;
1242                         if (dev) {
1243                                 if (dev != grt->rt6i_dev) {
1244                                         dst_release(&grt->dst);
1245                                         goto out;
1246                                 }
1247                         } else {
1248                                 dev = grt->rt6i_dev;
1249                                 idev = grt->rt6i_idev;
1250                                 dev_hold(dev);
1251                                 in6_dev_hold(grt->rt6i_idev);
1252                         }
1253                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1254                                 err = 0;
1255                         dst_release(&grt->dst);
1256
1257                         if (err)
1258                                 goto out;
1259                 }
1260                 err = -EINVAL;
1261                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1262                         goto out;
1263         }
1264
1265         err = -ENODEV;
1266         if (dev == NULL)
1267                 goto out;
1268
1269         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1270                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1271                 if (IS_ERR(rt->rt6i_nexthop)) {
1272                         err = PTR_ERR(rt->rt6i_nexthop);
1273                         rt->rt6i_nexthop = NULL;
1274                         goto out;
1275                 }
1276         }
1277
1278         rt->rt6i_flags = cfg->fc_flags;
1279
1280 install_route:
1281         if (cfg->fc_mx) {
1282                 struct nlattr *nla;
1283                 int remaining;
1284
1285                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1286                         int type = nla_type(nla);
1287
1288                         if (type) {
1289                                 if (type > RTAX_MAX) {
1290                                         err = -EINVAL;
1291                                         goto out;
1292                                 }
1293
1294                                 rt->dst.metrics[type - 1] = nla_get_u32(nla);
1295                         }
1296                 }
1297         }
1298
1299         if (dst_metric(&rt->dst, RTAX_HOPLIMIT) == 0)
1300                 rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1301         if (!dst_mtu(&rt->dst))
1302                 rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1303         if (!dst_metric(&rt->dst, RTAX_ADVMSS))
1304                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1305         rt->dst.dev = dev;
1306         rt->rt6i_idev = idev;
1307         rt->rt6i_table = table;
1308
1309         cfg->fc_nlinfo.nl_net = dev_net(dev);
1310
1311         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1312
1313 out:
1314         if (dev)
1315                 dev_put(dev);
1316         if (idev)
1317                 in6_dev_put(idev);
1318         if (rt)
1319                 dst_free(&rt->dst);
1320         return err;
1321 }
1322
1323 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1324 {
1325         int err;
1326         struct fib6_table *table;
1327         struct net *net = dev_net(rt->rt6i_dev);
1328
1329         if (rt == net->ipv6.ip6_null_entry)
1330                 return -ENOENT;
1331
1332         table = rt->rt6i_table;
1333         write_lock_bh(&table->tb6_lock);
1334
1335         err = fib6_del(rt, info);
1336         dst_release(&rt->dst);
1337
1338         write_unlock_bh(&table->tb6_lock);
1339
1340         return err;
1341 }
1342
1343 int ip6_del_rt(struct rt6_info *rt)
1344 {
1345         struct nl_info info = {
1346                 .nl_net = dev_net(rt->rt6i_dev),
1347         };
1348         return __ip6_del_rt(rt, &info);
1349 }
1350
1351 static int ip6_route_del(struct fib6_config *cfg)
1352 {
1353         struct fib6_table *table;
1354         struct fib6_node *fn;
1355         struct rt6_info *rt;
1356         int err = -ESRCH;
1357
1358         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1359         if (table == NULL)
1360                 return err;
1361
1362         read_lock_bh(&table->tb6_lock);
1363
1364         fn = fib6_locate(&table->tb6_root,
1365                          &cfg->fc_dst, cfg->fc_dst_len,
1366                          &cfg->fc_src, cfg->fc_src_len);
1367
1368         if (fn) {
1369                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1370                         if (cfg->fc_ifindex &&
1371                             (rt->rt6i_dev == NULL ||
1372                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1373                                 continue;
1374                         if (cfg->fc_flags & RTF_GATEWAY &&
1375                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1376                                 continue;
1377                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1378                                 continue;
1379                         dst_hold(&rt->dst);
1380                         read_unlock_bh(&table->tb6_lock);
1381
1382                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1383                 }
1384         }
1385         read_unlock_bh(&table->tb6_lock);
1386
1387         return err;
1388 }
1389
1390 /*
1391  *      Handle redirects
1392  */
1393 struct ip6rd_flowi {
1394         struct flowi fl;
1395         struct in6_addr gateway;
1396 };
1397
1398 static struct rt6_info *__ip6_route_redirect(struct net *net,
1399                                              struct fib6_table *table,
1400                                              struct flowi *fl,
1401                                              int flags)
1402 {
1403         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1404         struct rt6_info *rt;
1405         struct fib6_node *fn;
1406
1407         /*
1408          * Get the "current" route for this destination and
1409          * check if the redirect has come from approriate router.
1410          *
1411          * RFC 2461 specifies that redirects should only be
1412          * accepted if they come from the nexthop to the target.
1413          * Due to the way the routes are chosen, this notion
1414          * is a bit fuzzy and one might need to check all possible
1415          * routes.
1416          */
1417
1418         read_lock_bh(&table->tb6_lock);
1419         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1420 restart:
1421         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1422                 /*
1423                  * Current route is on-link; redirect is always invalid.
1424                  *
1425                  * Seems, previous statement is not true. It could
1426                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1427                  * But then router serving it might decide, that we should
1428                  * know truth 8)8) --ANK (980726).
1429                  */
1430                 if (rt6_check_expired(rt))
1431                         continue;
1432                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1433                         continue;
1434                 if (fl->oif != rt->rt6i_dev->ifindex)
1435                         continue;
1436                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1437                         continue;
1438                 break;
1439         }
1440
1441         if (!rt)
1442                 rt = net->ipv6.ip6_null_entry;
1443         BACKTRACK(net, &fl->fl6_src);
1444 out:
1445         dst_hold(&rt->dst);
1446
1447         read_unlock_bh(&table->tb6_lock);
1448
1449         return rt;
1450 };
1451
1452 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1453                                            struct in6_addr *src,
1454                                            struct in6_addr *gateway,
1455                                            struct net_device *dev)
1456 {
1457         int flags = RT6_LOOKUP_F_HAS_SADDR;
1458         struct net *net = dev_net(dev);
1459         struct ip6rd_flowi rdfl = {
1460                 .fl = {
1461                         .oif = dev->ifindex,
1462                         .nl_u = {
1463                                 .ip6_u = {
1464                                         .daddr = *dest,
1465                                         .saddr = *src,
1466                                 },
1467                         },
1468                 },
1469         };
1470
1471         ipv6_addr_copy(&rdfl.gateway, gateway);
1472
1473         if (rt6_need_strict(dest))
1474                 flags |= RT6_LOOKUP_F_IFACE;
1475
1476         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1477                                                    flags, __ip6_route_redirect);
1478 }
1479
1480 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1481                   struct in6_addr *saddr,
1482                   struct neighbour *neigh, u8 *lladdr, int on_link)
1483 {
1484         struct rt6_info *rt, *nrt = NULL;
1485         struct netevent_redirect netevent;
1486         struct net *net = dev_net(neigh->dev);
1487
1488         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1489
1490         if (rt == net->ipv6.ip6_null_entry) {
1491                 if (net_ratelimit())
1492                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1493                                "for redirect target\n");
1494                 goto out;
1495         }
1496
1497         /*
1498          *      We have finally decided to accept it.
1499          */
1500
1501         neigh_update(neigh, lladdr, NUD_STALE,
1502                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1503                      NEIGH_UPDATE_F_OVERRIDE|
1504                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1505                                      NEIGH_UPDATE_F_ISROUTER))
1506                      );
1507
1508         /*
1509          * Redirect received -> path was valid.
1510          * Look, redirects are sent only in response to data packets,
1511          * so that this nexthop apparently is reachable. --ANK
1512          */
1513         dst_confirm(&rt->dst);
1514
1515         /* Duplicate redirect: silently ignore. */
1516         if (neigh == rt->dst.neighbour)
1517                 goto out;
1518
1519         nrt = ip6_rt_copy(rt);
1520         if (nrt == NULL)
1521                 goto out;
1522
1523         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1524         if (on_link)
1525                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1526
1527         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1528         nrt->rt6i_dst.plen = 128;
1529         nrt->dst.flags |= DST_HOST;
1530
1531         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1532         nrt->rt6i_nexthop = neigh_clone(neigh);
1533         /* Reset pmtu, it may be better */
1534         nrt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1535         nrt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1536                                                         dst_mtu(&nrt->dst));
1537
1538         if (ip6_ins_rt(nrt))
1539                 goto out;
1540
1541         netevent.old = &rt->dst;
1542         netevent.new = &nrt->dst;
1543         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1544
1545         if (rt->rt6i_flags&RTF_CACHE) {
1546                 ip6_del_rt(rt);
1547                 return;
1548         }
1549
1550 out:
1551         dst_release(&rt->dst);
1552 }
1553
1554 /*
1555  *      Handle ICMP "packet too big" messages
1556  *      i.e. Path MTU discovery
1557  */
1558
1559 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1560                              struct net *net, u32 pmtu, int ifindex)
1561 {
1562         struct rt6_info *rt, *nrt;
1563         int allfrag = 0;
1564
1565         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1566         if (rt == NULL)
1567                 return;
1568
1569         if (pmtu >= dst_mtu(&rt->dst))
1570                 goto out;
1571
1572         if (pmtu < IPV6_MIN_MTU) {
1573                 /*
1574                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1575                  * MTU (1280) and a fragment header should always be included
1576                  * after a node receiving Too Big message reporting PMTU is
1577                  * less than the IPv6 Minimum Link MTU.
1578                  */
1579                 pmtu = IPV6_MIN_MTU;
1580                 allfrag = 1;
1581         }
1582
1583         /* New mtu received -> path was valid.
1584            They are sent only in response to data packets,
1585            so that this nexthop apparently is reachable. --ANK
1586          */
1587         dst_confirm(&rt->dst);
1588
1589         /* Host route. If it is static, it would be better
1590            not to override it, but add new one, so that
1591            when cache entry will expire old pmtu
1592            would return automatically.
1593          */
1594         if (rt->rt6i_flags & RTF_CACHE) {
1595                 rt->dst.metrics[RTAX_MTU-1] = pmtu;
1596                 if (allfrag)
1597                         rt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1598                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1599                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1600                 goto out;
1601         }
1602
1603         /* Network route.
1604            Two cases are possible:
1605            1. It is connected route. Action: COW
1606            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1607          */
1608         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1609                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1610         else
1611                 nrt = rt6_alloc_clone(rt, daddr);
1612
1613         if (nrt) {
1614                 nrt->dst.metrics[RTAX_MTU-1] = pmtu;
1615                 if (allfrag)
1616                         nrt->dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1617
1618                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1619                  * happened within 5 mins, the recommended timer is 10 mins.
1620                  * Here this route expiration time is set to ip6_rt_mtu_expires
1621                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1622                  * and detecting PMTU increase will be automatically happened.
1623                  */
1624                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1625                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1626
1627                 ip6_ins_rt(nrt);
1628         }
1629 out:
1630         dst_release(&rt->dst);
1631 }
1632
1633 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1634                         struct net_device *dev, u32 pmtu)
1635 {
1636         struct net *net = dev_net(dev);
1637
1638         /*
1639          * RFC 1981 states that a node "MUST reduce the size of the packets it
1640          * is sending along the path" that caused the Packet Too Big message.
1641          * Since it's not possible in the general case to determine which
1642          * interface was used to send the original packet, we update the MTU
1643          * on the interface that will be used to send future packets. We also
1644          * update the MTU on the interface that received the Packet Too Big in
1645          * case the original packet was forced out that interface with
1646          * SO_BINDTODEVICE or similar. This is the next best thing to the
1647          * correct behaviour, which would be to update the MTU on all
1648          * interfaces.
1649          */
1650         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1651         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1652 }
1653
1654 /*
1655  *      Misc support functions
1656  */
1657
1658 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1659 {
1660         struct net *net = dev_net(ort->rt6i_dev);
1661         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1662
1663         if (rt) {
1664                 rt->dst.input = ort->dst.input;
1665                 rt->dst.output = ort->dst.output;
1666
1667                 memcpy(rt->dst.metrics, ort->dst.metrics, RTAX_MAX*sizeof(u32));
1668                 rt->dst.error = ort->dst.error;
1669                 rt->dst.dev = ort->dst.dev;
1670                 if (rt->dst.dev)
1671                         dev_hold(rt->dst.dev);
1672                 rt->rt6i_idev = ort->rt6i_idev;
1673                 if (rt->rt6i_idev)
1674                         in6_dev_hold(rt->rt6i_idev);
1675                 rt->dst.lastuse = jiffies;
1676                 rt->rt6i_expires = 0;
1677
1678                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1679                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1680                 rt->rt6i_metric = 0;
1681
1682                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1683 #ifdef CONFIG_IPV6_SUBTREES
1684                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1685 #endif
1686                 rt->rt6i_table = ort->rt6i_table;
1687         }
1688         return rt;
1689 }
1690
1691 #ifdef CONFIG_IPV6_ROUTE_INFO
1692 static struct rt6_info *rt6_get_route_info(struct net *net,
1693                                            struct in6_addr *prefix, int prefixlen,
1694                                            struct in6_addr *gwaddr, int ifindex)
1695 {
1696         struct fib6_node *fn;
1697         struct rt6_info *rt = NULL;
1698         struct fib6_table *table;
1699
1700         table = fib6_get_table(net, RT6_TABLE_INFO);
1701         if (table == NULL)
1702                 return NULL;
1703
1704         write_lock_bh(&table->tb6_lock);
1705         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1706         if (!fn)
1707                 goto out;
1708
1709         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1710                 if (rt->rt6i_dev->ifindex != ifindex)
1711                         continue;
1712                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1713                         continue;
1714                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1715                         continue;
1716                 dst_hold(&rt->dst);
1717                 break;
1718         }
1719 out:
1720         write_unlock_bh(&table->tb6_lock);
1721         return rt;
1722 }
1723
1724 static struct rt6_info *rt6_add_route_info(struct net *net,
1725                                            struct in6_addr *prefix, int prefixlen,
1726                                            struct in6_addr *gwaddr, int ifindex,
1727                                            unsigned pref)
1728 {
1729         struct fib6_config cfg = {
1730                 .fc_table       = RT6_TABLE_INFO,
1731                 .fc_metric      = IP6_RT_PRIO_USER,
1732                 .fc_ifindex     = ifindex,
1733                 .fc_dst_len     = prefixlen,
1734                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1735                                   RTF_UP | RTF_PREF(pref),
1736                 .fc_nlinfo.pid = 0,
1737                 .fc_nlinfo.nlh = NULL,
1738                 .fc_nlinfo.nl_net = net,
1739         };
1740
1741         ipv6_addr_copy(&cfg.fc_dst, prefix);
1742         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1743
1744         /* We should treat it as a default route if prefix length is 0. */
1745         if (!prefixlen)
1746                 cfg.fc_flags |= RTF_DEFAULT;
1747
1748         ip6_route_add(&cfg);
1749
1750         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1751 }
1752 #endif
1753
1754 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1755 {
1756         struct rt6_info *rt;
1757         struct fib6_table *table;
1758
1759         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1760         if (table == NULL)
1761                 return NULL;
1762
1763         write_lock_bh(&table->tb6_lock);
1764         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1765                 if (dev == rt->rt6i_dev &&
1766                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1767                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1768                         break;
1769         }
1770         if (rt)
1771                 dst_hold(&rt->dst);
1772         write_unlock_bh(&table->tb6_lock);
1773         return rt;
1774 }
1775
1776 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1777                                      struct net_device *dev,
1778                                      unsigned int pref)
1779 {
1780         struct fib6_config cfg = {
1781                 .fc_table       = RT6_TABLE_DFLT,
1782                 .fc_metric      = IP6_RT_PRIO_USER,
1783                 .fc_ifindex     = dev->ifindex,
1784                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1785                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1786                 .fc_nlinfo.pid = 0,
1787                 .fc_nlinfo.nlh = NULL,
1788                 .fc_nlinfo.nl_net = dev_net(dev),
1789         };
1790
1791         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1792
1793         ip6_route_add(&cfg);
1794
1795         return rt6_get_dflt_router(gwaddr, dev);
1796 }
1797
1798 void rt6_purge_dflt_routers(struct net *net)
1799 {
1800         struct rt6_info *rt;
1801         struct fib6_table *table;
1802
1803         /* NOTE: Keep consistent with rt6_get_dflt_router */
1804         table = fib6_get_table(net, RT6_TABLE_DFLT);
1805         if (table == NULL)
1806                 return;
1807
1808 restart:
1809         read_lock_bh(&table->tb6_lock);
1810         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1811                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1812                         dst_hold(&rt->dst);
1813                         read_unlock_bh(&table->tb6_lock);
1814                         ip6_del_rt(rt);
1815                         goto restart;
1816                 }
1817         }
1818         read_unlock_bh(&table->tb6_lock);
1819 }
1820
1821 static void rtmsg_to_fib6_config(struct net *net,
1822                                  struct in6_rtmsg *rtmsg,
1823                                  struct fib6_config *cfg)
1824 {
1825         memset(cfg, 0, sizeof(*cfg));
1826
1827         cfg->fc_table = RT6_TABLE_MAIN;
1828         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1829         cfg->fc_metric = rtmsg->rtmsg_metric;
1830         cfg->fc_expires = rtmsg->rtmsg_info;
1831         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1832         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1833         cfg->fc_flags = rtmsg->rtmsg_flags;
1834
1835         cfg->fc_nlinfo.nl_net = net;
1836
1837         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1838         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1839         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1840 }
1841
1842 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1843 {
1844         struct fib6_config cfg;
1845         struct in6_rtmsg rtmsg;
1846         int err;
1847
1848         switch(cmd) {
1849         case SIOCADDRT:         /* Add a route */
1850         case SIOCDELRT:         /* Delete a route */
1851                 if (!capable(CAP_NET_ADMIN))
1852                         return -EPERM;
1853                 err = copy_from_user(&rtmsg, arg,
1854                                      sizeof(struct in6_rtmsg));
1855                 if (err)
1856                         return -EFAULT;
1857
1858                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1859
1860                 rtnl_lock();
1861                 switch (cmd) {
1862                 case SIOCADDRT:
1863                         err = ip6_route_add(&cfg);
1864                         break;
1865                 case SIOCDELRT:
1866                         err = ip6_route_del(&cfg);
1867                         break;
1868                 default:
1869                         err = -EINVAL;
1870                 }
1871                 rtnl_unlock();
1872
1873                 return err;
1874         }
1875
1876         return -EINVAL;
1877 }
1878
1879 /*
1880  *      Drop the packet on the floor
1881  */
1882
1883 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1884 {
1885         int type;
1886         struct dst_entry *dst = skb_dst(skb);
1887         switch (ipstats_mib_noroutes) {
1888         case IPSTATS_MIB_INNOROUTES:
1889                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1890                 if (type == IPV6_ADDR_ANY) {
1891                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1892                                       IPSTATS_MIB_INADDRERRORS);
1893                         break;
1894                 }
1895                 /* FALLTHROUGH */
1896         case IPSTATS_MIB_OUTNOROUTES:
1897                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1898                               ipstats_mib_noroutes);
1899                 break;
1900         }
1901         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1902         kfree_skb(skb);
1903         return 0;
1904 }
1905
1906 static int ip6_pkt_discard(struct sk_buff *skb)
1907 {
1908         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1909 }
1910
1911 static int ip6_pkt_discard_out(struct sk_buff *skb)
1912 {
1913         skb->dev = skb_dst(skb)->dev;
1914         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1915 }
1916
1917 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1918
1919 static int ip6_pkt_prohibit(struct sk_buff *skb)
1920 {
1921         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1922 }
1923
1924 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1925 {
1926         skb->dev = skb_dst(skb)->dev;
1927         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1928 }
1929
1930 #endif
1931
1932 /*
1933  *      Allocate a dst for local (unicast / anycast) address.
1934  */
1935
1936 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1937                                     const struct in6_addr *addr,
1938                                     int anycast)
1939 {
1940         struct net *net = dev_net(idev->dev);
1941         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1942         struct neighbour *neigh;
1943
1944         if (rt == NULL)
1945                 return ERR_PTR(-ENOMEM);
1946
1947         dev_hold(net->loopback_dev);
1948         in6_dev_hold(idev);
1949
1950         rt->dst.flags = DST_HOST;
1951         rt->dst.input = ip6_input;
1952         rt->dst.output = ip6_output;
1953         rt->rt6i_dev = net->loopback_dev;
1954         rt->rt6i_idev = idev;
1955         rt->dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1956         rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->dst));
1957         rt->dst.metrics[RTAX_HOPLIMIT-1] = -1;
1958         rt->dst.obsolete = -1;
1959
1960         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1961         if (anycast)
1962                 rt->rt6i_flags |= RTF_ANYCAST;
1963         else
1964                 rt->rt6i_flags |= RTF_LOCAL;
1965         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1966         if (IS_ERR(neigh)) {
1967                 dst_free(&rt->dst);
1968
1969                 /* We are casting this because that is the return
1970                  * value type.  But an errno encoded pointer is the
1971                  * same regardless of the underlying pointer type,
1972                  * and that's what we are returning.  So this is OK.
1973                  */
1974                 return (struct rt6_info *) neigh;
1975         }
1976         rt->rt6i_nexthop = neigh;
1977
1978         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1979         rt->rt6i_dst.plen = 128;
1980         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1981
1982         atomic_set(&rt->dst.__refcnt, 1);
1983
1984         return rt;
1985 }
1986
1987 struct arg_dev_net {
1988         struct net_device *dev;
1989         struct net *net;
1990 };
1991
1992 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1993 {
1994         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1995         struct net *net = ((struct arg_dev_net *)arg)->net;
1996
1997         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1998             rt != net->ipv6.ip6_null_entry) {
1999                 RT6_TRACE("deleted by ifdown %p\n", rt);
2000                 return -1;
2001         }
2002         return 0;
2003 }
2004
2005 void rt6_ifdown(struct net *net, struct net_device *dev)
2006 {
2007         struct arg_dev_net adn = {
2008                 .dev = dev,
2009                 .net = net,
2010         };
2011
2012         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2013         icmp6_clean_all(fib6_ifdown, &adn);
2014 }
2015
2016 struct rt6_mtu_change_arg
2017 {
2018         struct net_device *dev;
2019         unsigned mtu;
2020 };
2021
2022 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2023 {
2024         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2025         struct inet6_dev *idev;
2026         struct net *net = dev_net(arg->dev);
2027
2028         /* In IPv6 pmtu discovery is not optional,
2029            so that RTAX_MTU lock cannot disable it.
2030            We still use this lock to block changes
2031            caused by addrconf/ndisc.
2032         */
2033
2034         idev = __in6_dev_get(arg->dev);
2035         if (idev == NULL)
2036                 return 0;
2037
2038         /* For administrative MTU increase, there is no way to discover
2039            IPv6 PMTU increase, so PMTU increase should be updated here.
2040            Since RFC 1981 doesn't include administrative MTU increase
2041            update PMTU increase is a MUST. (i.e. jumbo frame)
2042          */
2043         /*
2044            If new MTU is less than route PMTU, this new MTU will be the
2045            lowest MTU in the path, update the route PMTU to reflect PMTU
2046            decreases; if new MTU is greater than route PMTU, and the
2047            old MTU is the lowest MTU in the path, update the route PMTU
2048            to reflect the increase. In this case if the other nodes' MTU
2049            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2050            PMTU discouvery.
2051          */
2052         if (rt->rt6i_dev == arg->dev &&
2053             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2054             (dst_mtu(&rt->dst) >= arg->mtu ||
2055              (dst_mtu(&rt->dst) < arg->mtu &&
2056               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2057                 rt->dst.metrics[RTAX_MTU-1] = arg->mtu;
2058                 rt->dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
2059         }
2060         return 0;
2061 }
2062
2063 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2064 {
2065         struct rt6_mtu_change_arg arg = {
2066                 .dev = dev,
2067                 .mtu = mtu,
2068         };
2069
2070         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2071 }
2072
2073 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2074         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2075         [RTA_OIF]               = { .type = NLA_U32 },
2076         [RTA_IIF]               = { .type = NLA_U32 },
2077         [RTA_PRIORITY]          = { .type = NLA_U32 },
2078         [RTA_METRICS]           = { .type = NLA_NESTED },
2079 };
2080
2081 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2082                               struct fib6_config *cfg)
2083 {
2084         struct rtmsg *rtm;
2085         struct nlattr *tb[RTA_MAX+1];
2086         int err;
2087
2088         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2089         if (err < 0)
2090                 goto errout;
2091
2092         err = -EINVAL;
2093         rtm = nlmsg_data(nlh);
2094         memset(cfg, 0, sizeof(*cfg));
2095
2096         cfg->fc_table = rtm->rtm_table;
2097         cfg->fc_dst_len = rtm->rtm_dst_len;
2098         cfg->fc_src_len = rtm->rtm_src_len;
2099         cfg->fc_flags = RTF_UP;
2100         cfg->fc_protocol = rtm->rtm_protocol;
2101
2102         if (rtm->rtm_type == RTN_UNREACHABLE)
2103                 cfg->fc_flags |= RTF_REJECT;
2104
2105         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2106         cfg->fc_nlinfo.nlh = nlh;
2107         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2108
2109         if (tb[RTA_GATEWAY]) {
2110                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2111                 cfg->fc_flags |= RTF_GATEWAY;
2112         }
2113
2114         if (tb[RTA_DST]) {
2115                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2116
2117                 if (nla_len(tb[RTA_DST]) < plen)
2118                         goto errout;
2119
2120                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2121         }
2122
2123         if (tb[RTA_SRC]) {
2124                 int plen = (rtm->rtm_src_len + 7) >> 3;
2125
2126                 if (nla_len(tb[RTA_SRC]) < plen)
2127                         goto errout;
2128
2129                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2130         }
2131
2132         if (tb[RTA_OIF])
2133                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2134
2135         if (tb[RTA_PRIORITY])
2136                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2137
2138         if (tb[RTA_METRICS]) {
2139                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2140                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2141         }
2142
2143         if (tb[RTA_TABLE])
2144                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2145
2146         err = 0;
2147 errout:
2148         return err;
2149 }
2150
2151 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2152 {
2153         struct fib6_config cfg;
2154         int err;
2155
2156         err = rtm_to_fib6_config(skb, nlh, &cfg);
2157         if (err < 0)
2158                 return err;
2159
2160         return ip6_route_del(&cfg);
2161 }
2162
2163 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2164 {
2165         struct fib6_config cfg;
2166         int err;
2167
2168         err = rtm_to_fib6_config(skb, nlh, &cfg);
2169         if (err < 0)
2170                 return err;
2171
2172         return ip6_route_add(&cfg);
2173 }
2174
2175 static inline size_t rt6_nlmsg_size(void)
2176 {
2177         return NLMSG_ALIGN(sizeof(struct rtmsg))
2178                + nla_total_size(16) /* RTA_SRC */
2179                + nla_total_size(16) /* RTA_DST */
2180                + nla_total_size(16) /* RTA_GATEWAY */
2181                + nla_total_size(16) /* RTA_PREFSRC */
2182                + nla_total_size(4) /* RTA_TABLE */
2183                + nla_total_size(4) /* RTA_IIF */
2184                + nla_total_size(4) /* RTA_OIF */
2185                + nla_total_size(4) /* RTA_PRIORITY */
2186                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2187                + nla_total_size(sizeof(struct rta_cacheinfo));
2188 }
2189
2190 static int rt6_fill_node(struct net *net,
2191                          struct sk_buff *skb, struct rt6_info *rt,
2192                          struct in6_addr *dst, struct in6_addr *src,
2193                          int iif, int type, u32 pid, u32 seq,
2194                          int prefix, int nowait, unsigned int flags)
2195 {
2196         struct rtmsg *rtm;
2197         struct nlmsghdr *nlh;
2198         long expires;
2199         u32 table;
2200
2201         if (prefix) {   /* user wants prefix routes only */
2202                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2203                         /* success since this is not a prefix route */
2204                         return 1;
2205                 }
2206         }
2207
2208         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2209         if (nlh == NULL)
2210                 return -EMSGSIZE;
2211
2212         rtm = nlmsg_data(nlh);
2213         rtm->rtm_family = AF_INET6;
2214         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2215         rtm->rtm_src_len = rt->rt6i_src.plen;
2216         rtm->rtm_tos = 0;
2217         if (rt->rt6i_table)
2218                 table = rt->rt6i_table->tb6_id;
2219         else
2220                 table = RT6_TABLE_UNSPEC;
2221         rtm->rtm_table = table;
2222         NLA_PUT_U32(skb, RTA_TABLE, table);
2223         if (rt->rt6i_flags&RTF_REJECT)
2224                 rtm->rtm_type = RTN_UNREACHABLE;
2225         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2226                 rtm->rtm_type = RTN_LOCAL;
2227         else
2228                 rtm->rtm_type = RTN_UNICAST;
2229         rtm->rtm_flags = 0;
2230         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2231         rtm->rtm_protocol = rt->rt6i_protocol;
2232         if (rt->rt6i_flags&RTF_DYNAMIC)
2233                 rtm->rtm_protocol = RTPROT_REDIRECT;
2234         else if (rt->rt6i_flags & RTF_ADDRCONF)
2235                 rtm->rtm_protocol = RTPROT_KERNEL;
2236         else if (rt->rt6i_flags&RTF_DEFAULT)
2237                 rtm->rtm_protocol = RTPROT_RA;
2238
2239         if (rt->rt6i_flags&RTF_CACHE)
2240                 rtm->rtm_flags |= RTM_F_CLONED;
2241
2242         if (dst) {
2243                 NLA_PUT(skb, RTA_DST, 16, dst);
2244                 rtm->rtm_dst_len = 128;
2245         } else if (rtm->rtm_dst_len)
2246                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2247 #ifdef CONFIG_IPV6_SUBTREES
2248         if (src) {
2249                 NLA_PUT(skb, RTA_SRC, 16, src);
2250                 rtm->rtm_src_len = 128;
2251         } else if (rtm->rtm_src_len)
2252                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2253 #endif
2254         if (iif) {
2255 #ifdef CONFIG_IPV6_MROUTE
2256                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2257                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2258                         if (err <= 0) {
2259                                 if (!nowait) {
2260                                         if (err == 0)
2261                                                 return 0;
2262                                         goto nla_put_failure;
2263                                 } else {
2264                                         if (err == -EMSGSIZE)
2265                                                 goto nla_put_failure;
2266                                 }
2267                         }
2268                 } else
2269 #endif
2270                         NLA_PUT_U32(skb, RTA_IIF, iif);
2271         } else if (dst) {
2272                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2273                 struct in6_addr saddr_buf;
2274                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2275                                        dst, 0, &saddr_buf) == 0)
2276                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2277         }
2278
2279         if (rtnetlink_put_metrics(skb, rt->dst.metrics) < 0)
2280                 goto nla_put_failure;
2281
2282         if (rt->dst.neighbour)
2283                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2284
2285         if (rt->dst.dev)
2286                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2287
2288         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2289
2290         if (!(rt->rt6i_flags & RTF_EXPIRES))
2291                 expires = 0;
2292         else if (rt->rt6i_expires - jiffies < INT_MAX)
2293                 expires = rt->rt6i_expires - jiffies;
2294         else
2295                 expires = INT_MAX;
2296
2297         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2298                                expires, rt->dst.error) < 0)
2299                 goto nla_put_failure;
2300
2301         return nlmsg_end(skb, nlh);
2302
2303 nla_put_failure:
2304         nlmsg_cancel(skb, nlh);
2305         return -EMSGSIZE;
2306 }
2307
2308 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2309 {
2310         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2311         int prefix;
2312
2313         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2314                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2315                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2316         } else
2317                 prefix = 0;
2318
2319         return rt6_fill_node(arg->net,
2320                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2321                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2322                      prefix, 0, NLM_F_MULTI);
2323 }
2324
2325 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2326 {
2327         struct net *net = sock_net(in_skb->sk);
2328         struct nlattr *tb[RTA_MAX+1];
2329         struct rt6_info *rt;
2330         struct sk_buff *skb;
2331         struct rtmsg *rtm;
2332         struct flowi fl;
2333         int err, iif = 0;
2334
2335         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2336         if (err < 0)
2337                 goto errout;
2338
2339         err = -EINVAL;
2340         memset(&fl, 0, sizeof(fl));
2341
2342         if (tb[RTA_SRC]) {
2343                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2344                         goto errout;
2345
2346                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2347         }
2348
2349         if (tb[RTA_DST]) {
2350                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2351                         goto errout;
2352
2353                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2354         }
2355
2356         if (tb[RTA_IIF])
2357                 iif = nla_get_u32(tb[RTA_IIF]);
2358
2359         if (tb[RTA_OIF])
2360                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2361
2362         if (iif) {
2363                 struct net_device *dev;
2364                 dev = __dev_get_by_index(net, iif);
2365                 if (!dev) {
2366                         err = -ENODEV;
2367                         goto errout;
2368                 }
2369         }
2370
2371         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2372         if (skb == NULL) {
2373                 err = -ENOBUFS;
2374                 goto errout;
2375         }
2376
2377         /* Reserve room for dummy headers, this skb can pass
2378            through good chunk of routing engine.
2379          */
2380         skb_reset_mac_header(skb);
2381         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2382
2383         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2384         skb_dst_set(skb, &rt->dst);
2385
2386         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2387                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2388                             nlh->nlmsg_seq, 0, 0, 0);
2389         if (err < 0) {
2390                 kfree_skb(skb);
2391                 goto errout;
2392         }
2393
2394         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2395 errout:
2396         return err;
2397 }
2398
2399 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2400 {
2401         struct sk_buff *skb;
2402         struct net *net = info->nl_net;
2403         u32 seq;
2404         int err;
2405
2406         err = -ENOBUFS;
2407         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2408
2409         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2410         if (skb == NULL)
2411                 goto errout;
2412
2413         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2414                                 event, info->pid, seq, 0, 0, 0);
2415         if (err < 0) {
2416                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2417                 WARN_ON(err == -EMSGSIZE);
2418                 kfree_skb(skb);
2419                 goto errout;
2420         }
2421         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2422                     info->nlh, gfp_any());
2423         return;
2424 errout:
2425         if (err < 0)
2426                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2427 }
2428
2429 static int ip6_route_dev_notify(struct notifier_block *this,
2430                                 unsigned long event, void *data)
2431 {
2432         struct net_device *dev = (struct net_device *)data;
2433         struct net *net = dev_net(dev);
2434
2435         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2436                 net->ipv6.ip6_null_entry->dst.dev = dev;
2437                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2438 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2439                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2440                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2441                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2442                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2443 #endif
2444         }
2445
2446         return NOTIFY_OK;
2447 }
2448
2449 /*
2450  *      /proc
2451  */
2452
2453 #ifdef CONFIG_PROC_FS
2454
2455 #define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)
2456
2457 struct rt6_proc_arg
2458 {
2459         char *buffer;
2460         int offset;
2461         int length;
2462         int skip;
2463         int len;
2464 };
2465
2466 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2467 {
2468         struct seq_file *m = p_arg;
2469
2470         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2471
2472 #ifdef CONFIG_IPV6_SUBTREES
2473         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2474 #else
2475         seq_puts(m, "00000000000000000000000000000000 00 ");
2476 #endif
2477
2478         if (rt->rt6i_nexthop) {
2479                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2480         } else {
2481                 seq_puts(m, "00000000000000000000000000000000");
2482         }
2483         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2484                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2485                    rt->dst.__use, rt->rt6i_flags,
2486                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2487         return 0;
2488 }
2489
2490 static int ipv6_route_show(struct seq_file *m, void *v)
2491 {
2492         struct net *net = (struct net *)m->private;
2493         fib6_clean_all(net, rt6_info_route, 0, m);
2494         return 0;
2495 }
2496
2497 static int ipv6_route_open(struct inode *inode, struct file *file)
2498 {
2499         return single_open_net(inode, file, ipv6_route_show);
2500 }
2501
2502 static const struct file_operations ipv6_route_proc_fops = {
2503         .owner          = THIS_MODULE,
2504         .open           = ipv6_route_open,
2505         .read           = seq_read,
2506         .llseek         = seq_lseek,
2507         .release        = single_release_net,
2508 };
2509
2510 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2511 {
2512         struct net *net = (struct net *)seq->private;
2513         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2514                    net->ipv6.rt6_stats->fib_nodes,
2515                    net->ipv6.rt6_stats->fib_route_nodes,
2516                    net->ipv6.rt6_stats->fib_rt_alloc,
2517                    net->ipv6.rt6_stats->fib_rt_entries,
2518                    net->ipv6.rt6_stats->fib_rt_cache,
2519                    atomic_read(&net->ipv6.ip6_dst_ops.entries),
2520                    net->ipv6.rt6_stats->fib_discarded_routes);
2521
2522         return 0;
2523 }
2524
2525 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2526 {
2527         return single_open_net(inode, file, rt6_stats_seq_show);
2528 }
2529
2530 static const struct file_operations rt6_stats_seq_fops = {
2531         .owner   = THIS_MODULE,
2532         .open    = rt6_stats_seq_open,
2533         .read    = seq_read,
2534         .llseek  = seq_lseek,
2535         .release = single_release_net,
2536 };
2537 #endif  /* CONFIG_PROC_FS */
2538
2539 #ifdef CONFIG_SYSCTL
2540
2541 static
2542 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2543                               void __user *buffer, size_t *lenp, loff_t *ppos)
2544 {
2545         struct net *net = current->nsproxy->net_ns;
2546         int delay = net->ipv6.sysctl.flush_delay;
2547         if (write) {
2548                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2549                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2550                 return 0;
2551         } else
2552                 return -EINVAL;
2553 }
2554
2555 ctl_table ipv6_route_table_template[] = {
2556         {
2557                 .procname       =       "flush",
2558                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2559                 .maxlen         =       sizeof(int),
2560                 .mode           =       0200,
2561                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2562         },
2563         {
2564                 .procname       =       "gc_thresh",
2565                 .data           =       &ip6_dst_ops_template.gc_thresh,
2566                 .maxlen         =       sizeof(int),
2567                 .mode           =       0644,
2568                 .proc_handler   =       proc_dointvec,
2569         },
2570         {
2571                 .procname       =       "max_size",
2572                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2573                 .maxlen         =       sizeof(int),
2574                 .mode           =       0644,
2575                 .proc_handler   =       proc_dointvec,
2576         },
2577         {
2578                 .procname       =       "gc_min_interval",
2579                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2580                 .maxlen         =       sizeof(int),
2581                 .mode           =       0644,
2582                 .proc_handler   =       proc_dointvec_jiffies,
2583         },
2584         {
2585                 .procname       =       "gc_timeout",
2586                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2587                 .maxlen         =       sizeof(int),
2588                 .mode           =       0644,
2589                 .proc_handler   =       proc_dointvec_jiffies,
2590         },
2591         {
2592                 .procname       =       "gc_interval",
2593                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2594                 .maxlen         =       sizeof(int),
2595                 .mode           =       0644,
2596                 .proc_handler   =       proc_dointvec_jiffies,
2597         },
2598         {
2599                 .procname       =       "gc_elasticity",
2600                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2601                 .maxlen         =       sizeof(int),
2602                 .mode           =       0644,
2603                 .proc_handler   =       proc_dointvec,
2604         },
2605         {
2606                 .procname       =       "mtu_expires",
2607                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2608                 .maxlen         =       sizeof(int),
2609                 .mode           =       0644,
2610                 .proc_handler   =       proc_dointvec_jiffies,
2611         },
2612         {
2613                 .procname       =       "min_adv_mss",
2614                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2615                 .maxlen         =       sizeof(int),
2616                 .mode           =       0644,
2617                 .proc_handler   =       proc_dointvec,
2618         },
2619         {
2620                 .procname       =       "gc_min_interval_ms",
2621                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2622                 .maxlen         =       sizeof(int),
2623                 .mode           =       0644,
2624                 .proc_handler   =       proc_dointvec_ms_jiffies,
2625         },
2626         { }
2627 };
2628
2629 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2630 {
2631         struct ctl_table *table;
2632
2633         table = kmemdup(ipv6_route_table_template,
2634                         sizeof(ipv6_route_table_template),
2635                         GFP_KERNEL);
2636
2637         if (table) {
2638                 table[0].data = &net->ipv6.sysctl.flush_delay;
2639                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2640                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2641                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2642                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2643                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2644                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2645                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2646                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2647                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2648         }
2649
2650         return table;
2651 }
2652 #endif
2653
2654 static int __net_init ip6_route_net_init(struct net *net)
2655 {
2656         int ret = -ENOMEM;
2657
2658         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2659                sizeof(net->ipv6.ip6_dst_ops));
2660
2661         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2662                                            sizeof(*net->ipv6.ip6_null_entry),
2663                                            GFP_KERNEL);
2664         if (!net->ipv6.ip6_null_entry)
2665                 goto out_ip6_dst_ops;
2666         net->ipv6.ip6_null_entry->dst.path =
2667                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2668         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2669
2670 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2671         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2672                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2673                                                GFP_KERNEL);
2674         if (!net->ipv6.ip6_prohibit_entry)
2675                 goto out_ip6_null_entry;
2676         net->ipv6.ip6_prohibit_entry->dst.path =
2677                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2678         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2679
2680         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2681                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2682                                                GFP_KERNEL);
2683         if (!net->ipv6.ip6_blk_hole_entry)
2684                 goto out_ip6_prohibit_entry;
2685         net->ipv6.ip6_blk_hole_entry->dst.path =
2686                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2687         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2688 #endif
2689
2690         net->ipv6.sysctl.flush_delay = 0;
2691         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2692         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2693         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2694         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2695         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2696         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2697         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2698
2699 #ifdef CONFIG_PROC_FS
2700         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2701         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2702 #endif
2703         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2704
2705         ret = 0;
2706 out:
2707         return ret;
2708
2709 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2710 out_ip6_prohibit_entry:
2711         kfree(net->ipv6.ip6_prohibit_entry);
2712 out_ip6_null_entry:
2713         kfree(net->ipv6.ip6_null_entry);
2714 #endif
2715 out_ip6_dst_ops:
2716         goto out;
2717 }
2718
2719 static void __net_exit ip6_route_net_exit(struct net *net)
2720 {
2721 #ifdef CONFIG_PROC_FS
2722         proc_net_remove(net, "ipv6_route");
2723         proc_net_remove(net, "rt6_stats");
2724 #endif
2725         kfree(net->ipv6.ip6_null_entry);
2726 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2727         kfree(net->ipv6.ip6_prohibit_entry);
2728         kfree(net->ipv6.ip6_blk_hole_entry);
2729 #endif
2730 }
2731
2732 static struct pernet_operations ip6_route_net_ops = {
2733         .init = ip6_route_net_init,
2734         .exit = ip6_route_net_exit,
2735 };
2736
2737 static struct notifier_block ip6_route_dev_notifier = {
2738         .notifier_call = ip6_route_dev_notify,
2739         .priority = 0,
2740 };
2741
2742 int __init ip6_route_init(void)
2743 {
2744         int ret;
2745
2746         ret = -ENOMEM;
2747         ip6_dst_ops_template.kmem_cachep =
2748                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2749                                   SLAB_HWCACHE_ALIGN, NULL);
2750         if (!ip6_dst_ops_template.kmem_cachep)
2751                 goto out;
2752
2753         ret = register_pernet_subsys(&ip6_route_net_ops);
2754         if (ret)
2755                 goto out_kmem_cache;
2756
2757         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2758
2759         /* Registering of the loopback is done before this portion of code,
2760          * the loopback reference in rt6_info will not be taken, do it
2761          * manually for init_net */
2762         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2763         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2764   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2765         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2766         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2767         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2768         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2769   #endif
2770         ret = fib6_init();
2771         if (ret)
2772                 goto out_register_subsys;
2773
2774         ret = xfrm6_init();
2775         if (ret)
2776                 goto out_fib6_init;
2777
2778         ret = fib6_rules_init();
2779         if (ret)
2780                 goto xfrm6_init;
2781
2782         ret = -ENOBUFS;
2783         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2784             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2785             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2786                 goto fib6_rules_init;
2787
2788         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2789         if (ret)
2790                 goto fib6_rules_init;
2791
2792 out:
2793         return ret;
2794
2795 fib6_rules_init:
2796         fib6_rules_cleanup();
2797 xfrm6_init:
2798         xfrm6_fini();
2799 out_fib6_init:
2800         fib6_gc_cleanup();
2801 out_register_subsys:
2802         unregister_pernet_subsys(&ip6_route_net_ops);
2803 out_kmem_cache:
2804         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2805         goto out;
2806 }
2807
2808 void ip6_route_cleanup(void)
2809 {
2810         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2811         fib6_rules_cleanup();
2812         xfrm6_fini();
2813         fib6_gc_cleanup();
2814         unregister_pernet_subsys(&ip6_route_net_ops);
2815         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2816 }