ipv6: Initialize the neighbour pointer of rt6_info on allocation
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60
61 #include <asm/uaccess.h>
62
63 #ifdef CONFIG_SYSCTL
64 #include <linux/sysctl.h>
65 #endif
66
/*
 * Forward declarations of file-local routines.  Several of them are
 * referenced by the dst_ops tables and the rt6_info templates that are
 * defined before the routines themselves.
 */
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68                                     const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int      ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void             ip6_dst_destroy(struct dst_entry *);
74 static void             ip6_dst_ifdown(struct dst_entry *,
75                                        struct net_device *dev, int how);
76 static int               ip6_dst_gc(struct dst_ops *ops);
77
78 static int              ip6_pkt_discard(struct sk_buff *skb);
79 static int              ip6_pkt_discard_out(struct sk_buff *skb);
80 static void             ip6_link_failure(struct sk_buff *skb);
81 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
82
/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85                                            const struct in6_addr *prefix, int prefixlen,
86                                            const struct in6_addr *gwaddr, int ifindex,
87                                            unsigned int pref);
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89                                            const struct in6_addr *prefix, int prefixlen,
90                                            const struct in6_addr *gwaddr, int ifindex);
91 #endif
92
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
94 {
95         struct rt6_info *rt = (struct rt6_info *) dst;
96         struct inet_peer *peer;
97         u32 *p = NULL;
98
99         if (!(rt->dst.flags & DST_HOST))
100                 return NULL;
101
102         peer = rt6_get_peer_create(rt);
103         if (peer) {
104                 u32 *old_p = __DST_METRICS_PTR(old);
105                 unsigned long prev, new;
106
107                 p = peer->metrics;
108                 if (inet_metrics_new(peer))
109                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
110
111                 new = (unsigned long) p;
112                 prev = cmpxchg(&dst->_metrics, old, new);
113
114                 if (prev != old) {
115                         p = __DST_METRICS_PTR(prev);
116                         if (prev & DST_METRICS_READ_ONLY)
117                                 p = NULL;
118                 }
119         }
120         return p;
121 }
122
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
124                                              struct sk_buff *skb,
125                                              const void *daddr)
126 {
127         struct in6_addr *p = &rt->rt6i_gateway;
128
129         if (!ipv6_addr_any(p))
130                 return (const void *) p;
131         else if (skb)
132                 return &ipv6_hdr(skb)->daddr;
133         return daddr;
134 }
135
136 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
137                                           struct sk_buff *skb,
138                                           const void *daddr)
139 {
140         struct rt6_info *rt = (struct rt6_info *) dst;
141         struct neighbour *n;
142
143         daddr = choose_neigh_daddr(rt, skb, daddr);
144         n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
145         if (n)
146                 return n;
147         return neigh_create(&nd_tbl, daddr, dst->dev);
148 }
149
150 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
151 {
152         struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
153         if (!n) {
154                 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
155                 if (IS_ERR(n))
156                         return PTR_ERR(n);
157         }
158         rt->n = n;
159
160         return 0;
161 }
162
163 static struct dst_ops ip6_dst_ops_template = {
164         .family                 =       AF_INET6,
165         .protocol               =       cpu_to_be16(ETH_P_IPV6),
166         .gc                     =       ip6_dst_gc,
167         .gc_thresh              =       1024,
168         .check                  =       ip6_dst_check,
169         .default_advmss         =       ip6_default_advmss,
170         .mtu                    =       ip6_mtu,
171         .cow_metrics            =       ipv6_cow_metrics,
172         .destroy                =       ip6_dst_destroy,
173         .ifdown                 =       ip6_dst_ifdown,
174         .negative_advice        =       ip6_negative_advice,
175         .link_failure           =       ip6_link_failure,
176         .update_pmtu            =       ip6_rt_update_pmtu,
177         .local_out              =       __ip6_local_out,
178         .neigh_lookup           =       ip6_neigh_lookup,
179 };
180
181 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
182 {
183         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
184
185         return mtu ? : dst->dev->mtu;
186 }
187
188 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
189 {
190 }
191
192 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
193                                          unsigned long old)
194 {
195         return NULL;
196 }
197
198 static struct dst_ops ip6_dst_blackhole_ops = {
199         .family                 =       AF_INET6,
200         .protocol               =       cpu_to_be16(ETH_P_IPV6),
201         .destroy                =       ip6_dst_destroy,
202         .check                  =       ip6_dst_check,
203         .mtu                    =       ip6_blackhole_mtu,
204         .default_advmss         =       ip6_default_advmss,
205         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
206         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
207         .neigh_lookup           =       ip6_neigh_lookup,
208 };
209
/*
 * Constant metrics array: every metric is zero except the hop limit slot
 * (index RTAX_HOPLIMIT - 1), which is 255.
 * NOTE(review): presumably installed as the read-only metrics of the
 * template routes; its users are outside this part of the file - confirm.
 */
210 static const u32 ip6_template_metrics[RTAX_MAX] = {
211         [RTAX_HOPLIMIT - 1] = 255,
212 };
213
214 static struct rt6_info ip6_null_entry_template = {
215         .dst = {
216                 .__refcnt       = ATOMIC_INIT(1),
217                 .__use          = 1,
218                 .obsolete       = -1,
219                 .error          = -ENETUNREACH,
220                 .input          = ip6_pkt_discard,
221                 .output         = ip6_pkt_discard_out,
222         },
223         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
224         .rt6i_protocol  = RTPROT_KERNEL,
225         .rt6i_metric    = ~(u32) 0,
226         .rt6i_ref       = ATOMIC_INIT(1),
227 };
228
229 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
230
231 static int ip6_pkt_prohibit(struct sk_buff *skb);
232 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
233
234 static struct rt6_info ip6_prohibit_entry_template = {
235         .dst = {
236                 .__refcnt       = ATOMIC_INIT(1),
237                 .__use          = 1,
238                 .obsolete       = -1,
239                 .error          = -EACCES,
240                 .input          = ip6_pkt_prohibit,
241                 .output         = ip6_pkt_prohibit_out,
242         },
243         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
244         .rt6i_protocol  = RTPROT_KERNEL,
245         .rt6i_metric    = ~(u32) 0,
246         .rt6i_ref       = ATOMIC_INIT(1),
247 };
248
249 static struct rt6_info ip6_blk_hole_entry_template = {
250         .dst = {
251                 .__refcnt       = ATOMIC_INIT(1),
252                 .__use          = 1,
253                 .obsolete       = -1,
254                 .error          = -EINVAL,
255                 .input          = dst_discard,
256                 .output         = dst_discard,
257         },
258         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
259         .rt6i_protocol  = RTPROT_KERNEL,
260         .rt6i_metric    = ~(u32) 0,
261         .rt6i_ref       = ATOMIC_INIT(1),
262 };
263
264 #endif
265
266 /* allocate dst with ip6_dst_ops */
267 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
268                                              struct net_device *dev,
269                                              int flags,
270                                              struct fib6_table *table)
271 {
272         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
273                                         0, 0, flags);
274
275         if (rt) {
276                 memset(&rt->n, 0,
277                        sizeof(*rt) - sizeof(struct dst_entry));
278                 rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
279         }
280         return rt;
281 }
282
283 static void ip6_dst_destroy(struct dst_entry *dst)
284 {
285         struct rt6_info *rt = (struct rt6_info *)dst;
286         struct inet6_dev *idev = rt->rt6i_idev;
287
288         if (rt->n)
289                 neigh_release(rt->n);
290
291         if (!(rt->dst.flags & DST_HOST))
292                 dst_destroy_metrics_generic(dst);
293
294         if (idev) {
295                 rt->rt6i_idev = NULL;
296                 in6_dev_put(idev);
297         }
298
299         if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
300                 dst_release(dst->from);
301
302         if (rt6_has_peer(rt)) {
303                 struct inet_peer *peer = rt6_peer_ptr(rt);
304                 inet_putpeer(peer);
305         }
306 }
307
308 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
309
310 static u32 rt6_peer_genid(void)
311 {
312         return atomic_read(&__rt6_peer_genid);
313 }
314
315 void rt6_bind_peer(struct rt6_info *rt, int create)
316 {
317         struct inet_peer_base *base;
318         struct inet_peer *peer;
319
320         base = inetpeer_base_ptr(rt->_rt6i_peer);
321         if (!base)
322                 return;
323
324         peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
325         if (peer) {
326                 if (!rt6_set_peer(rt, peer))
327                         inet_putpeer(peer);
328                 else
329                         rt->rt6i_peer_genid = rt6_peer_genid();
330         }
331 }
332
333 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
334                            int how)
335 {
336         struct rt6_info *rt = (struct rt6_info *)dst;
337         struct inet6_dev *idev = rt->rt6i_idev;
338         struct net_device *loopback_dev =
339                 dev_net(dev)->loopback_dev;
340
341         if (dev != loopback_dev) {
342                 if (idev && idev->dev == dev) {
343                         struct inet6_dev *loopback_idev =
344                                 in6_dev_get(loopback_dev);
345                         if (loopback_idev) {
346                                 rt->rt6i_idev = loopback_idev;
347                                 in6_dev_put(idev);
348                         }
349                 }
350                 if (rt->n && rt->n->dev == dev) {
351                         rt->n->dev = loopback_dev;
352                         dev_hold(loopback_dev);
353                         dev_put(dev);
354                 }
355         }
356 }
357
358 static bool rt6_check_expired(const struct rt6_info *rt)
359 {
360         struct rt6_info *ort = NULL;
361
362         if (rt->rt6i_flags & RTF_EXPIRES) {
363                 if (time_after(jiffies, rt->dst.expires))
364                         return true;
365         } else if (rt->dst.from) {
366                 ort = (struct rt6_info *) rt->dst.from;
367                 return (ort->rt6i_flags & RTF_EXPIRES) &&
368                         time_after(jiffies, ort->dst.expires);
369         }
370         return false;
371 }
372
373 static bool rt6_need_strict(const struct in6_addr *daddr)
374 {
375         return ipv6_addr_type(daddr) &
376                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
377 }
378
379 /*
380  *      Route lookup. Any table->tb6_lock is implied.
381  */
382
383 static inline struct rt6_info *rt6_device_match(struct net *net,
384                                                     struct rt6_info *rt,
385                                                     const struct in6_addr *saddr,
386                                                     int oif,
387                                                     int flags)
388 {
389         struct rt6_info *local = NULL;
390         struct rt6_info *sprt;
391
392         if (!oif && ipv6_addr_any(saddr))
393                 goto out;
394
395         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
396                 struct net_device *dev = sprt->dst.dev;
397
398                 if (oif) {
399                         if (dev->ifindex == oif)
400                                 return sprt;
401                         if (dev->flags & IFF_LOOPBACK) {
402                                 if (!sprt->rt6i_idev ||
403                                     sprt->rt6i_idev->dev->ifindex != oif) {
404                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
405                                                 continue;
406                                         if (local && (!oif ||
407                                                       local->rt6i_idev->dev->ifindex == oif))
408                                                 continue;
409                                 }
410                                 local = sprt;
411                         }
412                 } else {
413                         if (ipv6_chk_addr(net, saddr, dev,
414                                           flags & RT6_LOOKUP_F_IFACE))
415                                 return sprt;
416                 }
417         }
418
419         if (oif) {
420                 if (local)
421                         return local;
422
423                 if (flags & RT6_LOOKUP_F_IFACE)
424                         return net->ipv6.ip6_null_entry;
425         }
426 out:
427         return rt;
428 }
429
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the neighbour bound to @rt is not in a
 * NUD_VALID state and the interface's rtr_probe_interval has elapsed since
 * the neighbour was last updated, send a neighbour solicitation for it to
 * its solicited-node multicast address.
 *
 * @rt may be NULL, in which case nothing happens.  The original authors
 * noted that probing MUST be rate-limited to no more than one per minute.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;

        rcu_read_lock();
        neigh = rt ? rt->n : NULL;
        if (neigh && !(neigh->nud_state & NUD_VALID)) {
                bool due;

                read_lock_bh(&neigh->lock);
                /* re-check the state now that the lock is held */
                due = !(neigh->nud_state & NUD_VALID) &&
                      time_after(jiffies, neigh->updated +
                                 rt->rt6i_idev->cnf.rtr_probe_interval);
                if (due)
                        neigh->updated = jiffies;
                read_unlock_bh(&neigh->lock);

                if (due) {
                        struct in6_addr *target;
                        struct in6_addr mcaddr;

                        target = (struct in6_addr *)&neigh->primary_key;
                        addrconf_addr_solict_mult(target, &mcaddr);
                        ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
                }
        }
        rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
469
470 /*
471  * Default Router Selection (RFC 2461 6.3.6)
472  */
473 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
474 {
475         struct net_device *dev = rt->dst.dev;
476         if (!oif || dev->ifindex == oif)
477                 return 2;
478         if ((dev->flags & IFF_LOOPBACK) &&
479             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
480                 return 1;
481         return 0;
482 }
483
484 static inline int rt6_check_neigh(struct rt6_info *rt)
485 {
486         struct neighbour *neigh;
487         int m;
488
489         rcu_read_lock();
490         neigh = rt->n;
491         if (rt->rt6i_flags & RTF_NONEXTHOP ||
492             !(rt->rt6i_flags & RTF_GATEWAY))
493                 m = 1;
494         else if (neigh) {
495                 read_lock_bh(&neigh->lock);
496                 if (neigh->nud_state & NUD_VALID)
497                         m = 2;
498 #ifdef CONFIG_IPV6_ROUTER_PREF
499                 else if (neigh->nud_state & NUD_FAILED)
500                         m = 0;
501 #endif
502                 else
503                         m = 1;
504                 read_unlock_bh(&neigh->lock);
505         } else
506                 m = 0;
507         rcu_read_unlock();
508         return m;
509 }
510
511 static int rt6_score_route(struct rt6_info *rt, int oif,
512                            int strict)
513 {
514         int m, n;
515
516         m = rt6_check_dev(rt, oif);
517         if (!m && (strict & RT6_LOOKUP_F_IFACE))
518                 return -1;
519 #ifdef CONFIG_IPV6_ROUTER_PREF
520         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
521 #endif
522         n = rt6_check_neigh(rt);
523         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
524                 return -1;
525         return m;
526 }
527
528 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
529                                    int *mpri, struct rt6_info *match)
530 {
531         int m;
532
533         if (rt6_check_expired(rt))
534                 goto out;
535
536         m = rt6_score_route(rt, oif, strict);
537         if (m < 0)
538                 goto out;
539
540         if (m > *mpri) {
541                 if (strict & RT6_LOOKUP_F_REACHABLE)
542                         rt6_probe(match);
543                 *mpri = m;
544                 match = rt;
545         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
546                 rt6_probe(rt);
547         }
548
549 out:
550         return match;
551 }
552
553 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
554                                      struct rt6_info *rr_head,
555                                      u32 metric, int oif, int strict)
556 {
557         struct rt6_info *rt, *match;
558         int mpri = -1;
559
560         match = NULL;
561         for (rt = rr_head; rt && rt->rt6i_metric == metric;
562              rt = rt->dst.rt6_next)
563                 match = find_match(rt, oif, strict, &mpri, match);
564         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
565              rt = rt->dst.rt6_next)
566                 match = find_match(rt, oif, strict, &mpri, match);
567
568         return match;
569 }
570
571 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
572 {
573         struct rt6_info *match, *rt0;
574         struct net *net;
575
576         rt0 = fn->rr_ptr;
577         if (!rt0)
578                 fn->rr_ptr = rt0 = fn->leaf;
579
580         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
581
582         if (!match &&
583             (strict & RT6_LOOKUP_F_REACHABLE)) {
584                 struct rt6_info *next = rt0->dst.rt6_next;
585
586                 /* no entries matched; do round-robin */
587                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
588                         next = fn->leaf;
589
590                 if (next != rt0)
591                         fn->rr_ptr = next;
592         }
593
594         net = dev_net(rt0->dst.dev);
595         return match ? match : net->ipv6.ip6_null_entry;
596 }
597
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_route_rcv - handle a route information option
 * @dev:    device the option was received on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address for the advertised route
 *
 * Validates the option, then looks up the matching route-info route and
 *  - deletes it when the advertised lifetime is zero,
 *  - creates it when it does not exist and the lifetime is non-zero,
 *  - otherwise refreshes its preference bits,
 * and finally sets or clears the route's expiry from the lifetime.
 *
 * Returns 0 on success, -EINVAL if the option is malformed.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /*
         * Sanity check for prefix_len and length (RFC 4191, section 2.3).
         *
         * length counts 8-octet units including the fixed 8-byte header,
         * so length 1/2/3 leaves room for 0/8/16 prefix bytes (length == 3
         * is treated below as carrying a full in6_addr).  Hence:
         *   prefix_len > 64  requires length == 3
         *   prefix_len > 0   requires length >= 2
         * Anything looser would let ipv6_addr_prefix() read prefix bits
         * from beyond the end of the option.
         */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 3)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 2)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* copies only prefix_len bits, which the checks above bound */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* A zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (!addrconf_finite_timeout(lifetime))
                        rt6_clean_expires(rt);
                else
                        rt6_set_expires(rt, jiffies + HZ * lifetime);

                /* drop the reference obtained from the get/add helper */
                dst_release(&rt->dst);
        }
        return 0;
}
#endif
670
/*
 * BACKTRACK(__net, saddr) - climb the FIB tree when the current node
 * yielded only the null entry.
 *
 * This macro is NOT self-contained.  The function expanding it must have
 *   - a variable `rt` holding the route just selected,
 *   - a variable `fn` holding the current fib6_node,
 *   - a label `restart` that re-runs route selection on `fn`,
 *   - a label `out` reached when the top-level root has been hit.
 *
 * If rt is not __net's null entry, nothing happens.  Otherwise the loop
 * moves fn to its parent - or, when the parent has a subtree that is not
 * fn itself, to the result of a lookup of saddr in that subtree - until a
 * node flagged RTN_RTINFO is found (goto restart) or the node is flagged
 * RTN_TL_ROOT (goto out).
 */
671 #define BACKTRACK(__net, saddr)                 \
672 do { \
673         if (rt == __net->ipv6.ip6_null_entry) { \
674                 struct fib6_node *pn; \
675                 while (1) { \
676                         if (fn->fn_flags & RTN_TL_ROOT) \
677                                 goto out; \
678                         pn = fn->parent; \
679                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
680                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
681                         else \
682                                 fn = pn; \
683                         if (fn->fn_flags & RTN_RTINFO) \
684                                 goto restart; \
685                 } \
686         } \
687 } while (0)
688
/*
 * Per-table lookup callback handed to fib6_rule_lookup() by
 * ip6_route_lookup() and rt6_lookup().
 *
 * Under the table's read lock, finds the node for the flow's
 * destination/source, picks a route from its leaf list with
 * rt6_device_match() and backtracks towards the root while only the null
 * entry is found.  The returned route has been passed through dst_use();
 * it may be the namespace's null entry.
 */
689 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
690                                              struct fib6_table *table,
691                                              struct flowi6 *fl6, int flags)
692 {
693         struct fib6_node *fn;
694         struct rt6_info *rt;
695
696         read_lock_bh(&table->tb6_lock);
697         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
/* `restart` and `out` are jump targets used inside BACKTRACK(). */
698 restart:
699         rt = fn->leaf;
700         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
701         BACKTRACK(net, &fl6->saddr);
702 out:
703         dst_use(&rt->dst, jiffies);
704         read_unlock_bh(&table->tb6_lock);
705         return rt;
706
707 }
708
709 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
710                                     int flags)
711 {
712         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
713 }
714 EXPORT_SYMBOL_GPL(ip6_route_lookup);
715
716 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
717                             const struct in6_addr *saddr, int oif, int strict)
718 {
719         struct flowi6 fl6 = {
720                 .flowi6_oif = oif,
721                 .daddr = *daddr,
722         };
723         struct dst_entry *dst;
724         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
725
726         if (saddr) {
727                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
728                 flags |= RT6_LOOKUP_F_HAS_SADDR;
729         }
730
731         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
732         if (dst->error == 0)
733                 return (struct rt6_info *) dst;
734
735         dst_release(dst);
736
737         return NULL;
738 }
739
740 EXPORT_SYMBOL(rt6_lookup);
741
742 /* ip6_ins_rt is called with table->tb6_lock FREE (not held).
743    It takes a new route entry; if the addition fails for any reason, the
744    route is freed. In any case, if the caller does not hold a reference
745    to it, it may be destroyed.
746  */
747
748 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
749 {
750         int err;
751         struct fib6_table *table;
752
753         table = rt->rt6i_table;
754         write_lock_bh(&table->tb6_lock);
755         err = fib6_add(&table->tb6_root, rt, info);
756         write_unlock_bh(&table->tb6_lock);
757
758         return err;
759 }
760
761 int ip6_ins_rt(struct rt6_info *rt)
762 {
763         struct nl_info info = {
764                 .nl_net = dev_net(rt->dst.dev),
765         };
766         return __ip6_ins_rt(rt, &info);
767 }
768
769 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
770                                       const struct in6_addr *daddr,
771                                       const struct in6_addr *saddr)
772 {
773         struct rt6_info *rt;
774
775         /*
776          *      Clone the route.
777          */
778
779         rt = ip6_rt_copy(ort, daddr);
780
781         if (rt) {
782                 int attempts = !in_softirq();
783
784                 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
785                         if (ort->rt6i_dst.plen != 128 &&
786                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
787                                 rt->rt6i_flags |= RTF_ANYCAST;
788                         rt->rt6i_gateway = *daddr;
789                 }
790
791                 rt->rt6i_flags |= RTF_CACHE;
792
793 #ifdef CONFIG_IPV6_SUBTREES
794                 if (rt->rt6i_src.plen && saddr) {
795                         rt->rt6i_src.addr = *saddr;
796                         rt->rt6i_src.plen = 128;
797                 }
798 #endif
799
800         retry:
801                 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
802                         struct net *net = dev_net(rt->dst.dev);
803                         int saved_rt_min_interval =
804                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
805                         int saved_rt_elasticity =
806                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
807
808                         if (attempts-- > 0) {
809                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
810                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
811
812                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
813
814                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
815                                         saved_rt_elasticity;
816                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
817                                         saved_rt_min_interval;
818                                 goto retry;
819                         }
820
821                         net_warn_ratelimited("Neighbour table overflow\n");
822                         dst_free(&rt->dst);
823                         return NULL;
824                 }
825         }
826
827         return rt;
828 }
829
830 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
831                                         const struct in6_addr *daddr)
832 {
833         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
834
835         if (rt) {
836                 rt->rt6i_flags |= RTF_CACHE;
837                 rt->n = neigh_clone(ort->n);
838         }
839         return rt;
840 }
841
/*
 * Core per-table route resolution for interface index @oif and flow @fl6.
 *
 * Outline:
 *  1. Under the table's read lock, find the FIB node for the flow and let
 *     rt6_select() choose a route.  While the namespace's `forwarding`
 *     setting is 0 the first pass additionally demands
 *     RT6_LOOKUP_F_REACHABLE.  BACKTRACK() may jump to `restart` or `out`.
 *  2. If the result is the null entry or already an RTF_CACHE route,
 *     control goes to `out`: when the reachable requirement is still set
 *     it is cleared and the lookup is redone from `restart_2`; otherwise
 *     the route is held, the lock dropped and the route returned.
 *  3. Otherwise the route is held and the lock dropped.  A per-destination
 *     copy is made with rt6_alloc_cow() (no bound neighbour and not
 *     RTF_NONEXTHOP) or rt6_alloc_clone() (not a DST_HOST route); any
 *     other route is returned directly.
 *  4. The copy is inserted with ip6_ins_rt().  If allocation or insertion
 *     failed, the whole lookup is retried, for at most 3 attempts in
 *     total.
 *
 * The returned route always carries a reference taken with dst_hold() and
 * has lastuse/__use updated; it may be the namespace's null entry.
 */
842 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
843                                       struct flowi6 *fl6, int flags)
844 {
845         struct fib6_node *fn;
846         struct rt6_info *rt, *nrt;
847         int strict = 0;
848         int attempts = 3;
849         int err;
850         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
851
852         strict |= flags & RT6_LOOKUP_F_IFACE;
853
854 relookup:
855         read_lock_bh(&table->tb6_lock);
856
857 restart_2:
858         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
859
/* `restart` (and `out` below) are also jump targets of BACKTRACK(). */
860 restart:
861         rt = rt6_select(fn, oif, strict | reachable);
862
863         BACKTRACK(net, &fl6->saddr);
864         if (rt == net->ipv6.ip6_null_entry ||
865             rt->rt6i_flags & RTF_CACHE)
866                 goto out;
867
/* Keep rt alive across the unlocked section that follows. */
868         dst_hold(&rt->dst);
869         read_unlock_bh(&table->tb6_lock);
870
871         if (!rt->n && !(rt->rt6i_flags & RTF_NONEXTHOP))
872                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
873         else if (!(rt->dst.flags & DST_HOST))
874                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
875         else
876                 goto out2;
877
/* Swap the reference on the original for one on the copy (or null entry). */
878         dst_release(&rt->dst);
879         rt = nrt ? : net->ipv6.ip6_null_entry;
880
881         dst_hold(&rt->dst);
882         if (nrt) {
883                 err = ip6_ins_rt(nrt);
884                 if (!err)
885                         goto out2;
886         }
887
888         if (--attempts <= 0)
889                 goto out2;
890
891         /*
892          * Race condition! In the gap, when table->tb6_lock was
893          * released someone could insert this route.  Relookup.
894          */
895         dst_release(&rt->dst);
896         goto relookup;
897
/* Reached with the table read lock still held. */
898 out:
899         if (reachable) {
900                 reachable = 0;
901                 goto restart_2;
902         }
903         dst_hold(&rt->dst);
904         read_unlock_bh(&table->tb6_lock);
/* Reached with the lock released and a reference on rt held. */
905 out2:
906         rt->dst.lastuse = jiffies;
907         rt->dst.__use++;
908
909         return rt;
910 }
911
912 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
913                                             struct flowi6 *fl6, int flags)
914 {
915         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
916 }
917
918 static struct dst_entry *ip6_route_input_lookup(struct net *net,
919                                                 struct net_device *dev,
920                                                 struct flowi6 *fl6, int flags)
921 {
922         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
923                 flags |= RT6_LOOKUP_F_IFACE;
924
925         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
926 }
927
928 void ip6_route_input(struct sk_buff *skb)
929 {
930         const struct ipv6hdr *iph = ipv6_hdr(skb);
931         struct net *net = dev_net(skb->dev);
932         int flags = RT6_LOOKUP_F_HAS_SADDR;
933         struct flowi6 fl6 = {
934                 .flowi6_iif = skb->dev->ifindex,
935                 .daddr = iph->daddr,
936                 .saddr = iph->saddr,
937                 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
938                 .flowi6_mark = skb->mark,
939                 .flowi6_proto = iph->nexthdr,
940         };
941
942         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
943 }
944
945 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
946                                              struct flowi6 *fl6, int flags)
947 {
948         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
949 }
950
951 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
952                                     struct flowi6 *fl6)
953 {
954         int flags = 0;
955
956         fl6->flowi6_iif = net->loopback_dev->ifindex;
957
958         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
959                 flags |= RT6_LOOKUP_F_IFACE;
960
961         if (!ipv6_addr_any(&fl6->saddr))
962                 flags |= RT6_LOOKUP_F_HAS_SADDR;
963         else if (sk)
964                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
965
966         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
967 }
968
969 EXPORT_SYMBOL(ip6_route_output);
970
971 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
972 {
973         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
974         struct dst_entry *new = NULL;
975
976         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
977         if (rt) {
978                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
979                 rt6_init_peer(rt, net->ipv6.peers);
980
981                 new = &rt->dst;
982
983                 new->__use = 1;
984                 new->input = dst_discard;
985                 new->output = dst_discard;
986
987                 if (dst_metrics_read_only(&ort->dst))
988                         new->_metrics = ort->dst._metrics;
989                 else
990                         dst_copy_metrics(new, &ort->dst);
991                 rt->rt6i_idev = ort->rt6i_idev;
992                 if (rt->rt6i_idev)
993                         in6_dev_hold(rt->rt6i_idev);
994
995                 rt->rt6i_gateway = ort->rt6i_gateway;
996                 rt->rt6i_flags = ort->rt6i_flags;
997                 rt6_clean_expires(rt);
998                 rt->rt6i_metric = 0;
999
1000                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1001 #ifdef CONFIG_IPV6_SUBTREES
1002                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1003 #endif
1004
1005                 dst_free(new);
1006         }
1007
1008         dst_release(dst_orig);
1009         return new ? new : ERR_PTR(-ENOMEM);
1010 }
1011
1012 /*
1013  *      Destination cache support functions
1014  */
1015
1016 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1017 {
1018         struct rt6_info *rt;
1019
1020         rt = (struct rt6_info *) dst;
1021
1022         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1023                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1024                         if (!rt6_has_peer(rt))
1025                                 rt6_bind_peer(rt, 0);
1026                         rt->rt6i_peer_genid = rt6_peer_genid();
1027                 }
1028                 return dst;
1029         }
1030         return NULL;
1031 }
1032
1033 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1034 {
1035         struct rt6_info *rt = (struct rt6_info *) dst;
1036
1037         if (rt) {
1038                 if (rt->rt6i_flags & RTF_CACHE) {
1039                         if (rt6_check_expired(rt)) {
1040                                 ip6_del_rt(rt);
1041                                 dst = NULL;
1042                         }
1043                 } else {
1044                         dst_release(dst);
1045                         dst = NULL;
1046                 }
1047         }
1048         return dst;
1049 }
1050
1051 static void ip6_link_failure(struct sk_buff *skb)
1052 {
1053         struct rt6_info *rt;
1054
1055         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1056
1057         rt = (struct rt6_info *) skb_dst(skb);
1058         if (rt) {
1059                 if (rt->rt6i_flags & RTF_CACHE)
1060                         rt6_update_expires(rt, 0);
1061                 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1062                         rt->rt6i_node->fn_sernum = -1;
1063         }
1064 }
1065
1066 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1067 {
1068         struct rt6_info *rt6 = (struct rt6_info*)dst;
1069
1070         dst_confirm(dst);
1071         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1072                 struct net *net = dev_net(dst->dev);
1073
1074                 rt6->rt6i_flags |= RTF_MODIFIED;
1075                 if (mtu < IPV6_MIN_MTU) {
1076                         u32 features = dst_metric(dst, RTAX_FEATURES);
1077                         mtu = IPV6_MIN_MTU;
1078                         features |= RTAX_FEATURE_ALLFRAG;
1079                         dst_metric_set(dst, RTAX_FEATURES, features);
1080                 }
1081                 dst_metric_set(dst, RTAX_MTU, mtu);
1082                 rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
1083         }
1084 }
1085
1086 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1087                      int oif, u32 mark)
1088 {
1089         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1090         struct dst_entry *dst;
1091         struct flowi6 fl6;
1092
1093         memset(&fl6, 0, sizeof(fl6));
1094         fl6.flowi6_oif = oif;
1095         fl6.flowi6_mark = mark;
1096         fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS;
1097         fl6.daddr = iph->daddr;
1098         fl6.saddr = iph->saddr;
1099         fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
1100
1101         dst = ip6_route_output(net, NULL, &fl6);
1102         if (!dst->error)
1103                 ip6_rt_update_pmtu(dst, ntohl(mtu));
1104         dst_release(dst);
1105 }
1106 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1107
1108 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1109 {
1110         ip6_update_pmtu(skb, sock_net(sk), mtu,
1111                         sk->sk_bound_dev_if, sk->sk_mark);
1112 }
1113 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1114
1115 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1116 {
1117         struct net_device *dev = dst->dev;
1118         unsigned int mtu = dst_mtu(dst);
1119         struct net *net = dev_net(dev);
1120
1121         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1122
1123         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1124                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1125
1126         /*
1127          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1128          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1129          * IPV6_MAXPLEN is also valid and means: "any MSS,
1130          * rely only on pmtu discovery"
1131          */
1132         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1133                 mtu = IPV6_MAXPLEN;
1134         return mtu;
1135 }
1136
1137 static unsigned int ip6_mtu(const struct dst_entry *dst)
1138 {
1139         struct inet6_dev *idev;
1140         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1141
1142         if (mtu)
1143                 return mtu;
1144
1145         mtu = IPV6_MIN_MTU;
1146
1147         rcu_read_lock();
1148         idev = __in6_dev_get(dst->dev);
1149         if (idev)
1150                 mtu = idev->cnf.mtu6;
1151         rcu_read_unlock();
1152
1153         return mtu;
1154 }
1155
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by icmp6_dst_alloc().  Every access in this file is made with
 * icmp6_dst_lock held.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1158
1159 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1160                                   struct neighbour *neigh,
1161                                   struct flowi6 *fl6)
1162 {
1163         struct dst_entry *dst;
1164         struct rt6_info *rt;
1165         struct inet6_dev *idev = in6_dev_get(dev);
1166         struct net *net = dev_net(dev);
1167
1168         if (unlikely(!idev))
1169                 return ERR_PTR(-ENODEV);
1170
1171         rt = ip6_dst_alloc(net, dev, 0, NULL);
1172         if (unlikely(!rt)) {
1173                 in6_dev_put(idev);
1174                 dst = ERR_PTR(-ENOMEM);
1175                 goto out;
1176         }
1177
1178         if (neigh)
1179                 neigh_hold(neigh);
1180         else {
1181                 neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
1182                 if (IS_ERR(neigh)) {
1183                         in6_dev_put(idev);
1184                         dst_free(&rt->dst);
1185                         return ERR_CAST(neigh);
1186                 }
1187         }
1188
1189         rt->dst.flags |= DST_HOST;
1190         rt->dst.output  = ip6_output;
1191         rt->n = neigh;
1192         atomic_set(&rt->dst.__refcnt, 1);
1193         rt->rt6i_dst.addr = fl6->daddr;
1194         rt->rt6i_dst.plen = 128;
1195         rt->rt6i_idev     = idev;
1196         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1197
1198         spin_lock_bh(&icmp6_dst_lock);
1199         rt->dst.next = icmp6_dst_gc_list;
1200         icmp6_dst_gc_list = &rt->dst;
1201         spin_unlock_bh(&icmp6_dst_lock);
1202
1203         fib6_force_start_gc(net);
1204
1205         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1206
1207 out:
1208         return dst;
1209 }
1210
1211 int icmp6_dst_gc(void)
1212 {
1213         struct dst_entry *dst, **pprev;
1214         int more = 0;
1215
1216         spin_lock_bh(&icmp6_dst_lock);
1217         pprev = &icmp6_dst_gc_list;
1218
1219         while ((dst = *pprev) != NULL) {
1220                 if (!atomic_read(&dst->__refcnt)) {
1221                         *pprev = dst->next;
1222                         dst_free(dst);
1223                 } else {
1224                         pprev = &dst->next;
1225                         ++more;
1226                 }
1227         }
1228
1229         spin_unlock_bh(&icmp6_dst_lock);
1230
1231         return more;
1232 }
1233
1234 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1235                             void *arg)
1236 {
1237         struct dst_entry *dst, **pprev;
1238
1239         spin_lock_bh(&icmp6_dst_lock);
1240         pprev = &icmp6_dst_gc_list;
1241         while ((dst = *pprev) != NULL) {
1242                 struct rt6_info *rt = (struct rt6_info *) dst;
1243                 if (func(rt, arg)) {
1244                         *pprev = dst->next;
1245                         dst_free(dst);
1246                 } else {
1247                         pprev = &dst->next;
1248                 }
1249         }
1250         spin_unlock_bh(&icmp6_dst_lock);
1251 }
1252
1253 static int ip6_dst_gc(struct dst_ops *ops)
1254 {
1255         unsigned long now = jiffies;
1256         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1257         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1258         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1259         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1260         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1261         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1262         int entries;
1263
1264         entries = dst_entries_get_fast(ops);
1265         if (time_after(rt_last_gc + rt_min_interval, now) &&
1266             entries <= rt_max_size)
1267                 goto out;
1268
1269         net->ipv6.ip6_rt_gc_expire++;
1270         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1271         net->ipv6.ip6_rt_last_gc = now;
1272         entries = dst_entries_get_slow(ops);
1273         if (entries < ops->gc_thresh)
1274                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1275 out:
1276         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1277         return entries > rt_max_size;
1278 }
1279
1280 /* Clean host part of a prefix. Not necessary in radix tree,
1281    but results in cleaner routing tables.
1282
1283    Remove it only when all the things will work!
1284  */
1285
1286 int ip6_dst_hoplimit(struct dst_entry *dst)
1287 {
1288         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1289         if (hoplimit == 0) {
1290                 struct net_device *dev = dst->dev;
1291                 struct inet6_dev *idev;
1292
1293                 rcu_read_lock();
1294                 idev = __in6_dev_get(dev);
1295                 if (idev)
1296                         hoplimit = idev->cnf.hop_limit;
1297                 else
1298                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1299                 rcu_read_unlock();
1300         }
1301         return hoplimit;
1302 }
1303 EXPORT_SYMBOL(ip6_dst_hoplimit);
1304
1305 /*
1306  *
1307  */
1308
/*
 * Create a route from the configuration in @cfg and insert it into the
 * FIB table selected by cfg->fc_table.
 *
 * Returns the result of __ip6_ins_rt() once the route has been built, or
 * a negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM, -EHOSTUNREACH,
 * or the error from rt6_bind_neighbour()) when validation or allocation
 * fails.  On failure the device, inet6_dev and route references taken
 * here are dropped at the 'out' label.
 *
 * Note that cfg->fc_metric, cfg->fc_protocol and cfg->fc_nlinfo.nl_net
 * may be rewritten by this function.
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        /* Source-specific routes need subtree support. */
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        /* Take references on the requested device and its inet6_dev. */
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        /*
         * Netlink requests without NLM_F_CREATE first look for an
         * existing table; a missing table is still created, with a
         * warning.
         */
        err = -ENOBUFS;
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);

        if (!rt) {
                err = -ENOMEM;
                goto out;
        }

        rt->dst.obsolete = -1;

        if (cfg->fc_flags & RTF_EXPIRES)
                rt6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
        else
                rt6_clean_expires(rt);

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* Pick the input handler from the destination type and flags. */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
               rt->dst.flags |= DST_HOST;

        /* Non-host routes with metrics get a private metrics array. */
        if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
                u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
                if (!metrics) {
                        err = -ENOMEM;
                        goto out;
                }
                dst_init_metrics(&rt->dst, metrics, 0);
        }
#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags & IFF_LOOPBACK) &&
             !(addr_type & IPV6_ADDR_LOOPBACK) &&
             !(cfg->fc_flags & RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                rt->dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                rt->rt6i_gateway = *gw_addr;
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type & IPV6_ADDR_UNICAST))
                                goto out;

                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        /*
                         * err stays -EHOSTUNREACH unless the route to the
                         * gateway is itself not a gateway route.
                         */
                        err = -EHOSTUNREACH;
                        if (!grt)
                                goto out;
                        if (dev) {
                                if (dev != grt->dst.dev) {
                                        dst_release(&grt->dst);
                                        goto out;
                                }
                        } else {
                                /* No device given: adopt the gateway route's. */
                                dev = grt->dst.dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        if (!(grt->rt6i_flags & RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (!dev || (dev->flags & IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        /* A preferred source address must be valid on the output device. */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        err = -EINVAL;
                        goto out;
                }
                rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                err = rt6_bind_neighbour(rt, dev);
                if (err)
                        goto out;
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* Copy the netlink metric attributes, rejecting types above RTAX_MAX. */
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
                        }
                }
        }

        /* The dev/idev references taken above are handed to the route. */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
        return err;
}
1539
1540 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1541 {
1542         int err;
1543         struct fib6_table *table;
1544         struct net *net = dev_net(rt->dst.dev);
1545
1546         if (rt == net->ipv6.ip6_null_entry)
1547                 return -ENOENT;
1548
1549         table = rt->rt6i_table;
1550         write_lock_bh(&table->tb6_lock);
1551
1552         err = fib6_del(rt, info);
1553         dst_release(&rt->dst);
1554
1555         write_unlock_bh(&table->tb6_lock);
1556
1557         return err;
1558 }
1559
1560 int ip6_del_rt(struct rt6_info *rt)
1561 {
1562         struct nl_info info = {
1563                 .nl_net = dev_net(rt->dst.dev),
1564         };
1565         return __ip6_del_rt(rt, &info);
1566 }
1567
1568 static int ip6_route_del(struct fib6_config *cfg)
1569 {
1570         struct fib6_table *table;
1571         struct fib6_node *fn;
1572         struct rt6_info *rt;
1573         int err = -ESRCH;
1574
1575         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1576         if (!table)
1577                 return err;
1578
1579         read_lock_bh(&table->tb6_lock);
1580
1581         fn = fib6_locate(&table->tb6_root,
1582                          &cfg->fc_dst, cfg->fc_dst_len,
1583                          &cfg->fc_src, cfg->fc_src_len);
1584
1585         if (fn) {
1586                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1587                         if (cfg->fc_ifindex &&
1588                             (!rt->dst.dev ||
1589                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1590                                 continue;
1591                         if (cfg->fc_flags & RTF_GATEWAY &&
1592                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1593                                 continue;
1594                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1595                                 continue;
1596                         dst_hold(&rt->dst);
1597                         read_unlock_bh(&table->tb6_lock);
1598
1599                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1600                 }
1601         }
1602         read_unlock_bh(&table->tb6_lock);
1603
1604         return err;
1605 }
1606
1607 /*
1608  *      Handle redirects
1609  */
/*
 * Lookup key for redirect validation: a flowi6 extended with the address
 * of the gateway the redirect came from.  fl6 must remain the first
 * member, because __ip6_route_redirect() casts the struct flowi6 pointer
 * it receives back to struct ip6rd_flowi.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;
};
1614
/*
 * Per-table lookup callback used to validate a redirect.
 *
 * @fl6 is really the fl6 member of a struct ip6rd_flowi.  The routes of
 * the node matching the flow are scanned for an unexpired RTF_GATEWAY
 * route on the flow's output interface whose gateway equals the
 * redirect's gateway.  Returns that route, or the namespace's null entry
 * when none matches, with a reference held.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             int flags)
{
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
        struct rt6_info *rt;
        struct fib6_node *fn;

        /*
         * Get the "current" route for this destination and
         * check if the redirect has come from appropriate router.
         *
         * RFC 2461 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
                /*
                 * Current route is on-link; redirect is always invalid.
                 *
                 * Seems, previous statement is not true. It could
                 * be node, which looks for us as on-link (f.e. proxy ndisc)
                 * But then router serving it might decide, that we should
                 * know truth 8)8) --ANK (980726).
                 */
                if (rt6_check_expired(rt))
                        continue;
                if (!(rt->rt6i_flags & RTF_GATEWAY))
                        continue;
                if (fl6->flowi6_oif != rt->dst.dev->ifindex)
                        continue;
                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
                        continue;
                break;
        }

        if (!rt)
                rt = net->ipv6.ip6_null_entry;
        /*
         * BACKTRACK() is a macro defined outside this function.
         * NOTE(review): it appears to depend on the local names fn/rt and
         * on the labels 'restart' and 'out' -- confirm before renaming.
         */
        BACKTRACK(net, &fl6->saddr);
out:
        dst_hold(&rt->dst);

        read_unlock_bh(&table->tb6_lock);

        return rt;
};
1668
1669 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1670                                            const struct in6_addr *src,
1671                                            const struct in6_addr *gateway,
1672                                            struct net_device *dev)
1673 {
1674         int flags = RT6_LOOKUP_F_HAS_SADDR;
1675         struct net *net = dev_net(dev);
1676         struct ip6rd_flowi rdfl = {
1677                 .fl6 = {
1678                         .flowi6_oif = dev->ifindex,
1679                         .daddr = *dest,
1680                         .saddr = *src,
1681                 },
1682         };
1683
1684         rdfl.gateway = *gateway;
1685
1686         if (rt6_need_strict(dest))
1687                 flags |= RT6_LOOKUP_F_IFACE;
1688
1689         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1690                                                    flags, __ip6_route_redirect);
1691 }
1692
1693 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1694                   const struct in6_addr *saddr,
1695                   struct neighbour *neigh, u8 *lladdr, int on_link)
1696 {
1697         struct rt6_info *rt, *nrt = NULL;
1698         struct netevent_redirect netevent;
1699         struct net *net = dev_net(neigh->dev);
1700         struct neighbour *old_neigh;
1701
1702         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1703
1704         if (rt == net->ipv6.ip6_null_entry) {
1705                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1706                 goto out;
1707         }
1708
1709         /*
1710          *      We have finally decided to accept it.
1711          */
1712
1713         neigh_update(neigh, lladdr, NUD_STALE,
1714                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1715                      NEIGH_UPDATE_F_OVERRIDE|
1716                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1717                                      NEIGH_UPDATE_F_ISROUTER))
1718                      );
1719
1720         /*
1721          * Redirect received -> path was valid.
1722          * Look, redirects are sent only in response to data packets,
1723          * so that this nexthop apparently is reachable. --ANK
1724          */
1725         dst_confirm(&rt->dst);
1726
1727         /* Duplicate redirect: silently ignore. */
1728         old_neigh = rt->n;
1729         if (neigh == old_neigh)
1730                 goto out;
1731
1732         nrt = ip6_rt_copy(rt, dest);
1733         if (!nrt)
1734                 goto out;
1735
1736         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1737         if (on_link)
1738                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1739
1740         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1741         nrt->n = neigh_clone(neigh);
1742
1743         if (ip6_ins_rt(nrt))
1744                 goto out;
1745
1746         netevent.old = &rt->dst;
1747         netevent.old_neigh = old_neigh;
1748         netevent.new = &nrt->dst;
1749         netevent.new_neigh = neigh;
1750         netevent.daddr = dest;
1751         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1752
1753         if (rt->rt6i_flags & RTF_CACHE) {
1754                 ip6_del_rt(rt);
1755                 return;
1756         }
1757
1758 out:
1759         dst_release(&rt->dst);
1760 }
1761
1762 /*
1763  *      Misc support functions
1764  */
1765
1766 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1767                                     const struct in6_addr *dest)
1768 {
1769         struct net *net = dev_net(ort->dst.dev);
1770         struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
1771                                             ort->rt6i_table);
1772
1773         if (rt) {
1774                 rt->dst.input = ort->dst.input;
1775                 rt->dst.output = ort->dst.output;
1776                 rt->dst.flags |= DST_HOST;
1777
1778                 rt->rt6i_dst.addr = *dest;
1779                 rt->rt6i_dst.plen = 128;
1780                 dst_copy_metrics(&rt->dst, &ort->dst);
1781                 rt->dst.error = ort->dst.error;
1782                 rt->rt6i_idev = ort->rt6i_idev;
1783                 if (rt->rt6i_idev)
1784                         in6_dev_hold(rt->rt6i_idev);
1785                 rt->dst.lastuse = jiffies;
1786
1787                 rt->rt6i_gateway = ort->rt6i_gateway;
1788                 rt->rt6i_flags = ort->rt6i_flags;
1789                 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1790                     (RTF_DEFAULT | RTF_ADDRCONF))
1791                         rt6_set_from(rt, ort);
1792                 else
1793                         rt6_clean_expires(rt);
1794                 rt->rt6i_metric = 0;
1795
1796 #ifdef CONFIG_IPV6_SUBTREES
1797                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1798 #endif
1799                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1800                 rt->rt6i_table = ort->rt6i_table;
1801         }
1802         return rt;
1803 }
1804
1805 #ifdef CONFIG_IPV6_ROUTE_INFO
1806 static struct rt6_info *rt6_get_route_info(struct net *net,
1807                                            const struct in6_addr *prefix, int prefixlen,
1808                                            const struct in6_addr *gwaddr, int ifindex)
1809 {
1810         struct fib6_node *fn;
1811         struct rt6_info *rt = NULL;
1812         struct fib6_table *table;
1813
1814         table = fib6_get_table(net, RT6_TABLE_INFO);
1815         if (!table)
1816                 return NULL;
1817
1818         write_lock_bh(&table->tb6_lock);
1819         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1820         if (!fn)
1821                 goto out;
1822
1823         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1824                 if (rt->dst.dev->ifindex != ifindex)
1825                         continue;
1826                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1827                         continue;
1828                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1829                         continue;
1830                 dst_hold(&rt->dst);
1831                 break;
1832         }
1833 out:
1834         write_unlock_bh(&table->tb6_lock);
1835         return rt;
1836 }
1837
1838 static struct rt6_info *rt6_add_route_info(struct net *net,
1839                                            const struct in6_addr *prefix, int prefixlen,
1840                                            const struct in6_addr *gwaddr, int ifindex,
1841                                            unsigned int pref)
1842 {
1843         struct fib6_config cfg = {
1844                 .fc_table       = RT6_TABLE_INFO,
1845                 .fc_metric      = IP6_RT_PRIO_USER,
1846                 .fc_ifindex     = ifindex,
1847                 .fc_dst_len     = prefixlen,
1848                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1849                                   RTF_UP | RTF_PREF(pref),
1850                 .fc_nlinfo.pid = 0,
1851                 .fc_nlinfo.nlh = NULL,
1852                 .fc_nlinfo.nl_net = net,
1853         };
1854
1855         cfg.fc_dst = *prefix;
1856         cfg.fc_gateway = *gwaddr;
1857
1858         /* We should treat it as a default route if prefix length is 0. */
1859         if (!prefixlen)
1860                 cfg.fc_flags |= RTF_DEFAULT;
1861
1862         ip6_route_add(&cfg);
1863
1864         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1865 }
1866 #endif
1867
1868 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1869 {
1870         struct rt6_info *rt;
1871         struct fib6_table *table;
1872
1873         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1874         if (!table)
1875                 return NULL;
1876
1877         write_lock_bh(&table->tb6_lock);
1878         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1879                 if (dev == rt->dst.dev &&
1880                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1881                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1882                         break;
1883         }
1884         if (rt)
1885                 dst_hold(&rt->dst);
1886         write_unlock_bh(&table->tb6_lock);
1887         return rt;
1888 }
1889
1890 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1891                                      struct net_device *dev,
1892                                      unsigned int pref)
1893 {
1894         struct fib6_config cfg = {
1895                 .fc_table       = RT6_TABLE_DFLT,
1896                 .fc_metric      = IP6_RT_PRIO_USER,
1897                 .fc_ifindex     = dev->ifindex,
1898                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1899                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1900                 .fc_nlinfo.pid = 0,
1901                 .fc_nlinfo.nlh = NULL,
1902                 .fc_nlinfo.nl_net = dev_net(dev),
1903         };
1904
1905         cfg.fc_gateway = *gwaddr;
1906
1907         ip6_route_add(&cfg);
1908
1909         return rt6_get_dflt_router(gwaddr, dev);
1910 }
1911
1912 void rt6_purge_dflt_routers(struct net *net)
1913 {
1914         struct rt6_info *rt;
1915         struct fib6_table *table;
1916
1917         /* NOTE: Keep consistent with rt6_get_dflt_router */
1918         table = fib6_get_table(net, RT6_TABLE_DFLT);
1919         if (!table)
1920                 return;
1921
1922 restart:
1923         read_lock_bh(&table->tb6_lock);
1924         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1925                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1926                         dst_hold(&rt->dst);
1927                         read_unlock_bh(&table->tb6_lock);
1928                         ip6_del_rt(rt);
1929                         goto restart;
1930                 }
1931         }
1932         read_unlock_bh(&table->tb6_lock);
1933 }
1934
1935 static void rtmsg_to_fib6_config(struct net *net,
1936                                  struct in6_rtmsg *rtmsg,
1937                                  struct fib6_config *cfg)
1938 {
1939         memset(cfg, 0, sizeof(*cfg));
1940
1941         cfg->fc_table = RT6_TABLE_MAIN;
1942         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1943         cfg->fc_metric = rtmsg->rtmsg_metric;
1944         cfg->fc_expires = rtmsg->rtmsg_info;
1945         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1946         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1947         cfg->fc_flags = rtmsg->rtmsg_flags;
1948
1949         cfg->fc_nlinfo.nl_net = net;
1950
1951         cfg->fc_dst = rtmsg->rtmsg_dst;
1952         cfg->fc_src = rtmsg->rtmsg_src;
1953         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1954 }
1955
1956 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1957 {
1958         struct fib6_config cfg;
1959         struct in6_rtmsg rtmsg;
1960         int err;
1961
1962         switch(cmd) {
1963         case SIOCADDRT:         /* Add a route */
1964         case SIOCDELRT:         /* Delete a route */
1965                 if (!capable(CAP_NET_ADMIN))
1966                         return -EPERM;
1967                 err = copy_from_user(&rtmsg, arg,
1968                                      sizeof(struct in6_rtmsg));
1969                 if (err)
1970                         return -EFAULT;
1971
1972                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1973
1974                 rtnl_lock();
1975                 switch (cmd) {
1976                 case SIOCADDRT:
1977                         err = ip6_route_add(&cfg);
1978                         break;
1979                 case SIOCDELRT:
1980                         err = ip6_route_del(&cfg);
1981                         break;
1982                 default:
1983                         err = -EINVAL;
1984                 }
1985                 rtnl_unlock();
1986
1987                 return err;
1988         }
1989
1990         return -EINVAL;
1991 }
1992
1993 /*
1994  *      Drop the packet on the floor
1995  */
1996
1997 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1998 {
1999         int type;
2000         struct dst_entry *dst = skb_dst(skb);
2001         switch (ipstats_mib_noroutes) {
2002         case IPSTATS_MIB_INNOROUTES:
2003                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2004                 if (type == IPV6_ADDR_ANY) {
2005                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2006                                       IPSTATS_MIB_INADDRERRORS);
2007                         break;
2008                 }
2009                 /* FALLTHROUGH */
2010         case IPSTATS_MIB_OUTNOROUTES:
2011                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2012                               ipstats_mib_noroutes);
2013                 break;
2014         }
2015         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2016         kfree_skb(skb);
2017         return 0;
2018 }
2019
2020 static int ip6_pkt_discard(struct sk_buff *skb)
2021 {
2022         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2023 }
2024
2025 static int ip6_pkt_discard_out(struct sk_buff *skb)
2026 {
2027         skb->dev = skb_dst(skb)->dev;
2028         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2029 }
2030
2031 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2032
2033 static int ip6_pkt_prohibit(struct sk_buff *skb)
2034 {
2035         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2036 }
2037
2038 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2039 {
2040         skb->dev = skb_dst(skb)->dev;
2041         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2042 }
2043
2044 #endif
2045
2046 /*
2047  *      Allocate a dst for local (unicast / anycast) address.
2048  */
2049
2050 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2051                                     const struct in6_addr *addr,
2052                                     bool anycast)
2053 {
2054         struct net *net = dev_net(idev->dev);
2055         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2056         int err;
2057
2058         if (!rt) {
2059                 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2060                 return ERR_PTR(-ENOMEM);
2061         }
2062
2063         in6_dev_hold(idev);
2064
2065         rt->dst.flags |= DST_HOST;
2066         rt->dst.input = ip6_input;
2067         rt->dst.output = ip6_output;
2068         rt->rt6i_idev = idev;
2069         rt->dst.obsolete = -1;
2070
2071         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2072         if (anycast)
2073                 rt->rt6i_flags |= RTF_ANYCAST;
2074         else
2075                 rt->rt6i_flags |= RTF_LOCAL;
2076         err = rt6_bind_neighbour(rt, rt->dst.dev);
2077         if (err) {
2078                 dst_free(&rt->dst);
2079                 return ERR_PTR(err);
2080         }
2081
2082         rt->rt6i_dst.addr = *addr;
2083         rt->rt6i_dst.plen = 128;
2084         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2085
2086         atomic_set(&rt->dst.__refcnt, 1);
2087
2088         return rt;
2089 }
2090
2091 int ip6_route_get_saddr(struct net *net,
2092                         struct rt6_info *rt,
2093                         const struct in6_addr *daddr,
2094                         unsigned int prefs,
2095                         struct in6_addr *saddr)
2096 {
2097         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2098         int err = 0;
2099         if (rt->rt6i_prefsrc.plen)
2100                 *saddr = rt->rt6i_prefsrc.addr;
2101         else
2102                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2103                                          daddr, prefs, saddr);
2104         return err;
2105 }
2106
2107 /* remove deleted ip from prefsrc entries */
2108 struct arg_dev_net_ip {
2109         struct net_device *dev;
2110         struct net *net;
2111         struct in6_addr *addr;
2112 };
2113
2114 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2115 {
2116         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2117         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2118         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2119
2120         if (((void *)rt->dst.dev == dev || !dev) &&
2121             rt != net->ipv6.ip6_null_entry &&
2122             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2123                 /* remove prefsrc entry */
2124                 rt->rt6i_prefsrc.plen = 0;
2125         }
2126         return 0;
2127 }
2128
2129 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2130 {
2131         struct net *net = dev_net(ifp->idev->dev);
2132         struct arg_dev_net_ip adni = {
2133                 .dev = ifp->idev->dev,
2134                 .net = net,
2135                 .addr = &ifp->addr,
2136         };
2137         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2138 }
2139
2140 struct arg_dev_net {
2141         struct net_device *dev;
2142         struct net *net;
2143 };
2144
2145 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2146 {
2147         const struct arg_dev_net *adn = arg;
2148         const struct net_device *dev = adn->dev;
2149
2150         if ((rt->dst.dev == dev || !dev) &&
2151             rt != adn->net->ipv6.ip6_null_entry)
2152                 return -1;
2153
2154         return 0;
2155 }
2156
2157 void rt6_ifdown(struct net *net, struct net_device *dev)
2158 {
2159         struct arg_dev_net adn = {
2160                 .dev = dev,
2161                 .net = net,
2162         };
2163
2164         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2165         icmp6_clean_all(fib6_ifdown, &adn);
2166 }
2167
2168 struct rt6_mtu_change_arg {
2169         struct net_device *dev;
2170         unsigned int mtu;
2171 };
2172
2173 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2174 {
2175         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2176         struct inet6_dev *idev;
2177
2178         /* In IPv6 pmtu discovery is not optional,
2179            so that RTAX_MTU lock cannot disable it.
2180            We still use this lock to block changes
2181            caused by addrconf/ndisc.
2182         */
2183
2184         idev = __in6_dev_get(arg->dev);
2185         if (!idev)
2186                 return 0;
2187
2188         /* For administrative MTU increase, there is no way to discover
2189            IPv6 PMTU increase, so PMTU increase should be updated here.
2190            Since RFC 1981 doesn't include administrative MTU increase
2191            update PMTU increase is a MUST. (i.e. jumbo frame)
2192          */
2193         /*
2194            If new MTU is less than route PMTU, this new MTU will be the
2195            lowest MTU in the path, update the route PMTU to reflect PMTU
2196            decreases; if new MTU is greater than route PMTU, and the
2197            old MTU is the lowest MTU in the path, update the route PMTU
2198            to reflect the increase. In this case if the other nodes' MTU
2199            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2200            PMTU discouvery.
2201          */
2202         if (rt->dst.dev == arg->dev &&
2203             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2204             (dst_mtu(&rt->dst) >= arg->mtu ||
2205              (dst_mtu(&rt->dst) < arg->mtu &&
2206               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2207                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2208         }
2209         return 0;
2210 }
2211
2212 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2213 {
2214         struct rt6_mtu_change_arg arg = {
2215                 .dev = dev,
2216                 .mtu = mtu,
2217         };
2218
2219         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2220 }
2221
2222 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2223         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2224         [RTA_OIF]               = { .type = NLA_U32 },
2225         [RTA_IIF]               = { .type = NLA_U32 },
2226         [RTA_PRIORITY]          = { .type = NLA_U32 },
2227         [RTA_METRICS]           = { .type = NLA_NESTED },
2228 };
2229
2230 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2231                               struct fib6_config *cfg)
2232 {
2233         struct rtmsg *rtm;
2234         struct nlattr *tb[RTA_MAX+1];
2235         int err;
2236
2237         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2238         if (err < 0)
2239                 goto errout;
2240
2241         err = -EINVAL;
2242         rtm = nlmsg_data(nlh);
2243         memset(cfg, 0, sizeof(*cfg));
2244
2245         cfg->fc_table = rtm->rtm_table;
2246         cfg->fc_dst_len = rtm->rtm_dst_len;
2247         cfg->fc_src_len = rtm->rtm_src_len;
2248         cfg->fc_flags = RTF_UP;
2249         cfg->fc_protocol = rtm->rtm_protocol;
2250
2251         if (rtm->rtm_type == RTN_UNREACHABLE)
2252                 cfg->fc_flags |= RTF_REJECT;
2253
2254         if (rtm->rtm_type == RTN_LOCAL)
2255                 cfg->fc_flags |= RTF_LOCAL;
2256
2257         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2258         cfg->fc_nlinfo.nlh = nlh;
2259         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2260
2261         if (tb[RTA_GATEWAY]) {
2262                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2263                 cfg->fc_flags |= RTF_GATEWAY;
2264         }
2265
2266         if (tb[RTA_DST]) {
2267                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2268
2269                 if (nla_len(tb[RTA_DST]) < plen)
2270                         goto errout;
2271
2272                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2273         }
2274
2275         if (tb[RTA_SRC]) {
2276                 int plen = (rtm->rtm_src_len + 7) >> 3;
2277
2278                 if (nla_len(tb[RTA_SRC]) < plen)
2279                         goto errout;
2280
2281                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2282         }
2283
2284         if (tb[RTA_PREFSRC])
2285                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2286
2287         if (tb[RTA_OIF])
2288                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2289
2290         if (tb[RTA_PRIORITY])
2291                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2292
2293         if (tb[RTA_METRICS]) {
2294                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2295                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2296         }
2297
2298         if (tb[RTA_TABLE])
2299                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2300
2301         err = 0;
2302 errout:
2303         return err;
2304 }
2305
2306 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2307 {
2308         struct fib6_config cfg;
2309         int err;
2310
2311         err = rtm_to_fib6_config(skb, nlh, &cfg);
2312         if (err < 0)
2313                 return err;
2314
2315         return ip6_route_del(&cfg);
2316 }
2317
2318 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2319 {
2320         struct fib6_config cfg;
2321         int err;
2322
2323         err = rtm_to_fib6_config(skb, nlh, &cfg);
2324         if (err < 0)
2325                 return err;
2326
2327         return ip6_route_add(&cfg);
2328 }
2329
2330 static inline size_t rt6_nlmsg_size(void)
2331 {
2332         return NLMSG_ALIGN(sizeof(struct rtmsg))
2333                + nla_total_size(16) /* RTA_SRC */
2334                + nla_total_size(16) /* RTA_DST */
2335                + nla_total_size(16) /* RTA_GATEWAY */
2336                + nla_total_size(16) /* RTA_PREFSRC */
2337                + nla_total_size(4) /* RTA_TABLE */
2338                + nla_total_size(4) /* RTA_IIF */
2339                + nla_total_size(4) /* RTA_OIF */
2340                + nla_total_size(4) /* RTA_PRIORITY */
2341                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2342                + nla_total_size(sizeof(struct rta_cacheinfo));
2343 }
2344
2345 static int rt6_fill_node(struct net *net,
2346                          struct sk_buff *skb, struct rt6_info *rt,
2347                          struct in6_addr *dst, struct in6_addr *src,
2348                          int iif, int type, u32 pid, u32 seq,
2349                          int prefix, int nowait, unsigned int flags)
2350 {
2351         const struct inet_peer *peer;
2352         struct rtmsg *rtm;
2353         struct nlmsghdr *nlh;
2354         long expires;
2355         u32 table;
2356         struct neighbour *n;
2357         u32 ts, tsage;
2358
2359         if (prefix) {   /* user wants prefix routes only */
2360                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2361                         /* success since this is not a prefix route */
2362                         return 1;
2363                 }
2364         }
2365
2366         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2367         if (!nlh)
2368                 return -EMSGSIZE;
2369
2370         rtm = nlmsg_data(nlh);
2371         rtm->rtm_family = AF_INET6;
2372         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2373         rtm->rtm_src_len = rt->rt6i_src.plen;
2374         rtm->rtm_tos = 0;
2375         if (rt->rt6i_table)
2376                 table = rt->rt6i_table->tb6_id;
2377         else
2378                 table = RT6_TABLE_UNSPEC;
2379         rtm->rtm_table = table;
2380         if (nla_put_u32(skb, RTA_TABLE, table))
2381                 goto nla_put_failure;
2382         if (rt->rt6i_flags & RTF_REJECT)
2383                 rtm->rtm_type = RTN_UNREACHABLE;
2384         else if (rt->rt6i_flags & RTF_LOCAL)
2385                 rtm->rtm_type = RTN_LOCAL;
2386         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2387                 rtm->rtm_type = RTN_LOCAL;
2388         else
2389                 rtm->rtm_type = RTN_UNICAST;
2390         rtm->rtm_flags = 0;
2391         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2392         rtm->rtm_protocol = rt->rt6i_protocol;
2393         if (rt->rt6i_flags & RTF_DYNAMIC)
2394                 rtm->rtm_protocol = RTPROT_REDIRECT;
2395         else if (rt->rt6i_flags & RTF_ADDRCONF)
2396                 rtm->rtm_protocol = RTPROT_KERNEL;
2397         else if (rt->rt6i_flags & RTF_DEFAULT)
2398                 rtm->rtm_protocol = RTPROT_RA;
2399
2400         if (rt->rt6i_flags & RTF_CACHE)
2401                 rtm->rtm_flags |= RTM_F_CLONED;
2402
2403         if (dst) {
2404                 if (nla_put(skb, RTA_DST, 16, dst))
2405                         goto nla_put_failure;
2406                 rtm->rtm_dst_len = 128;
2407         } else if (rtm->rtm_dst_len)
2408                 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2409                         goto nla_put_failure;
2410 #ifdef CONFIG_IPV6_SUBTREES
2411         if (src) {
2412                 if (nla_put(skb, RTA_SRC, 16, src))
2413                         goto nla_put_failure;
2414                 rtm->rtm_src_len = 128;
2415         } else if (rtm->rtm_src_len &&
2416                    nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2417                 goto nla_put_failure;
2418 #endif
2419         if (iif) {
2420 #ifdef CONFIG_IPV6_MROUTE
2421                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2422                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2423                         if (err <= 0) {
2424                                 if (!nowait) {
2425                                         if (err == 0)
2426                                                 return 0;
2427                                         goto nla_put_failure;
2428                                 } else {
2429                                         if (err == -EMSGSIZE)
2430                                                 goto nla_put_failure;
2431                                 }
2432                         }
2433                 } else
2434 #endif
2435                         if (nla_put_u32(skb, RTA_IIF, iif))
2436                                 goto nla_put_failure;
2437         } else if (dst) {
2438                 struct in6_addr saddr_buf;
2439                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2440                     nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2441                         goto nla_put_failure;
2442         }
2443
2444         if (rt->rt6i_prefsrc.plen) {
2445                 struct in6_addr saddr_buf;
2446                 saddr_buf = rt->rt6i_prefsrc.addr;
2447                 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2448                         goto nla_put_failure;
2449         }
2450
2451         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2452                 goto nla_put_failure;
2453
2454         rcu_read_lock();
2455         n = rt->n;
2456         if (n) {
2457                 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2458                         rcu_read_unlock();
2459                         goto nla_put_failure;
2460                 }
2461         }
2462         rcu_read_unlock();
2463
2464         if (rt->dst.dev &&
2465             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2466                 goto nla_put_failure;
2467         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2468                 goto nla_put_failure;
2469         if (!(rt->rt6i_flags & RTF_EXPIRES))
2470                 expires = 0;
2471         else if (rt->dst.expires - jiffies < INT_MAX)
2472                 expires = rt->dst.expires - jiffies;
2473         else
2474                 expires = INT_MAX;
2475
2476         peer = NULL;
2477         if (rt6_has_peer(rt))
2478                 peer = rt6_peer_ptr(rt);
2479         ts = tsage = 0;
2480         if (peer && peer->tcp_ts_stamp) {
2481                 ts = peer->tcp_ts;
2482                 tsage = get_seconds() - peer->tcp_ts_stamp;
2483         }
2484
2485         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2486                                expires, rt->dst.error) < 0)
2487                 goto nla_put_failure;
2488
2489         return nlmsg_end(skb, nlh);
2490
2491 nla_put_failure:
2492         nlmsg_cancel(skb, nlh);
2493         return -EMSGSIZE;
2494 }
2495
2496 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2497 {
2498         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2499         int prefix;
2500
2501         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2502                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2503                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2504         } else
2505                 prefix = 0;
2506
2507         return rt6_fill_node(arg->net,
2508                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2509                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2510                      prefix, 0, NLM_F_MULTI);
2511 }
2512
2513 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2514 {
2515         struct net *net = sock_net(in_skb->sk);
2516         struct nlattr *tb[RTA_MAX+1];
2517         struct rt6_info *rt;
2518         struct sk_buff *skb;
2519         struct rtmsg *rtm;
2520         struct flowi6 fl6;
2521         int err, iif = 0, oif = 0;
2522
2523         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2524         if (err < 0)
2525                 goto errout;
2526
2527         err = -EINVAL;
2528         memset(&fl6, 0, sizeof(fl6));
2529
2530         if (tb[RTA_SRC]) {
2531                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2532                         goto errout;
2533
2534                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2535         }
2536
2537         if (tb[RTA_DST]) {
2538                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2539                         goto errout;
2540
2541                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2542         }
2543
2544         if (tb[RTA_IIF])
2545                 iif = nla_get_u32(tb[RTA_IIF]);
2546
2547         if (tb[RTA_OIF])
2548                 oif = nla_get_u32(tb[RTA_OIF]);
2549
2550         if (iif) {
2551                 struct net_device *dev;
2552                 int flags = 0;
2553
2554                 dev = __dev_get_by_index(net, iif);
2555                 if (!dev) {
2556                         err = -ENODEV;
2557                         goto errout;
2558                 }
2559
2560                 fl6.flowi6_iif = iif;
2561
2562                 if (!ipv6_addr_any(&fl6.saddr))
2563                         flags |= RT6_LOOKUP_F_HAS_SADDR;
2564
2565                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2566                                                                flags);
2567         } else {
2568                 fl6.flowi6_oif = oif;
2569
2570                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2571         }
2572
2573         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2574         if (!skb) {
2575                 dst_release(&rt->dst);
2576                 err = -ENOBUFS;
2577                 goto errout;
2578         }
2579
2580         /* Reserve room for dummy headers, this skb can pass
2581            through good chunk of routing engine.
2582          */
2583         skb_reset_mac_header(skb);
2584         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2585
2586         skb_dst_set(skb, &rt->dst);
2587
2588         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2589                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2590                             nlh->nlmsg_seq, 0, 0, 0);
2591         if (err < 0) {
2592                 kfree_skb(skb);
2593                 goto errout;
2594         }
2595
2596         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2597 errout:
2598         return err;
2599 }
2600
2601 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2602 {
2603         struct sk_buff *skb;
2604         struct net *net = info->nl_net;
2605         u32 seq;
2606         int err;
2607
2608         err = -ENOBUFS;
2609         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2610
2611         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2612         if (!skb)
2613                 goto errout;
2614
2615         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2616                                 event, info->pid, seq, 0, 0, 0);
2617         if (err < 0) {
2618                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2619                 WARN_ON(err == -EMSGSIZE);
2620                 kfree_skb(skb);
2621                 goto errout;
2622         }
2623         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2624                     info->nlh, gfp_any());
2625         return;
2626 errout:
2627         if (err < 0)
2628                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2629 }
2630
2631 static int ip6_route_dev_notify(struct notifier_block *this,
2632                                 unsigned long event, void *data)
2633 {
2634         struct net_device *dev = (struct net_device *)data;
2635         struct net *net = dev_net(dev);
2636
2637         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2638                 net->ipv6.ip6_null_entry->dst.dev = dev;
2639                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2640 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2641                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2642                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2643                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2644                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2645 #endif
2646         }
2647
2648         return NOTIFY_OK;
2649 }
2650
2651 /*
2652  *      /proc
2653  */
2654
2655 #ifdef CONFIG_PROC_FS
2656
/*
 * Buffer bookkeeping for a /proc read.
 * NOTE(review): no user of this structure is visible in this part of the
 * file; the field meanings below are inferred from the names only - confirm
 * before relying on them.
 */
struct rt6_proc_arg {
	char *buffer;	/* presumably the output buffer */
	int offset;	/* presumably the requested start offset */
	int length;	/* presumably the requested length */
	int skip;
	int len;
};
2665
2666 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2667 {
2668         struct seq_file *m = p_arg;
2669         struct neighbour *n;
2670
2671         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2672
2673 #ifdef CONFIG_IPV6_SUBTREES
2674         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2675 #else
2676         seq_puts(m, "00000000000000000000000000000000 00 ");
2677 #endif
2678         rcu_read_lock();
2679         n = rt->n;
2680         if (n) {
2681                 seq_printf(m, "%pi6", n->primary_key);
2682         } else {
2683                 seq_puts(m, "00000000000000000000000000000000");
2684         }
2685         rcu_read_unlock();
2686         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2687                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2688                    rt->dst.__use, rt->rt6i_flags,
2689                    rt->dst.dev ? rt->dst.dev->name : "");
2690         return 0;
2691 }
2692
2693 static int ipv6_route_show(struct seq_file *m, void *v)
2694 {
2695         struct net *net = (struct net *)m->private;
2696         fib6_clean_all_ro(net, rt6_info_route, 0, m);
2697         return 0;
2698 }
2699
2700 static int ipv6_route_open(struct inode *inode, struct file *file)
2701 {
2702         return single_open_net(inode, file, ipv6_route_show);
2703 }
2704
2705 static const struct file_operations ipv6_route_proc_fops = {
2706         .owner          = THIS_MODULE,
2707         .open           = ipv6_route_open,
2708         .read           = seq_read,
2709         .llseek         = seq_lseek,
2710         .release        = single_release_net,
2711 };
2712
2713 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2714 {
2715         struct net *net = (struct net *)seq->private;
2716         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2717                    net->ipv6.rt6_stats->fib_nodes,
2718                    net->ipv6.rt6_stats->fib_route_nodes,
2719                    net->ipv6.rt6_stats->fib_rt_alloc,
2720                    net->ipv6.rt6_stats->fib_rt_entries,
2721                    net->ipv6.rt6_stats->fib_rt_cache,
2722                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2723                    net->ipv6.rt6_stats->fib_discarded_routes);
2724
2725         return 0;
2726 }
2727
2728 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2729 {
2730         return single_open_net(inode, file, rt6_stats_seq_show);
2731 }
2732
2733 static const struct file_operations rt6_stats_seq_fops = {
2734         .owner   = THIS_MODULE,
2735         .open    = rt6_stats_seq_open,
2736         .read    = seq_read,
2737         .llseek  = seq_lseek,
2738         .release = single_release_net,
2739 };
2740 #endif  /* CONFIG_PROC_FS */
2741
2742 #ifdef CONFIG_SYSCTL
2743
2744 static
2745 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2746                               void __user *buffer, size_t *lenp, loff_t *ppos)
2747 {
2748         struct net *net;
2749         int delay;
2750         if (!write)
2751                 return -EINVAL;
2752
2753         net = (struct net *)ctl->extra1;
2754         delay = net->ipv6.sysctl.flush_delay;
2755         proc_dointvec(ctl, write, buffer, lenp, ppos);
2756         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2757         return 0;
2758 }
2759
2760 ctl_table ipv6_route_table_template[] = {
2761         {
2762                 .procname       =       "flush",
2763                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2764                 .maxlen         =       sizeof(int),
2765                 .mode           =       0200,
2766                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2767         },
2768         {
2769                 .procname       =       "gc_thresh",
2770                 .data           =       &ip6_dst_ops_template.gc_thresh,
2771                 .maxlen         =       sizeof(int),
2772                 .mode           =       0644,
2773                 .proc_handler   =       proc_dointvec,
2774         },
2775         {
2776                 .procname       =       "max_size",
2777                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2778                 .maxlen         =       sizeof(int),
2779                 .mode           =       0644,
2780                 .proc_handler   =       proc_dointvec,
2781         },
2782         {
2783                 .procname       =       "gc_min_interval",
2784                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2785                 .maxlen         =       sizeof(int),
2786                 .mode           =       0644,
2787                 .proc_handler   =       proc_dointvec_jiffies,
2788         },
2789         {
2790                 .procname       =       "gc_timeout",
2791                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2792                 .maxlen         =       sizeof(int),
2793                 .mode           =       0644,
2794                 .proc_handler   =       proc_dointvec_jiffies,
2795         },
2796         {
2797                 .procname       =       "gc_interval",
2798                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2799                 .maxlen         =       sizeof(int),
2800                 .mode           =       0644,
2801                 .proc_handler   =       proc_dointvec_jiffies,
2802         },
2803         {
2804                 .procname       =       "gc_elasticity",
2805                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2806                 .maxlen         =       sizeof(int),
2807                 .mode           =       0644,
2808                 .proc_handler   =       proc_dointvec,
2809         },
2810         {
2811                 .procname       =       "mtu_expires",
2812                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2813                 .maxlen         =       sizeof(int),
2814                 .mode           =       0644,
2815                 .proc_handler   =       proc_dointvec_jiffies,
2816         },
2817         {
2818                 .procname       =       "min_adv_mss",
2819                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2820                 .maxlen         =       sizeof(int),
2821                 .mode           =       0644,
2822                 .proc_handler   =       proc_dointvec,
2823         },
2824         {
2825                 .procname       =       "gc_min_interval_ms",
2826                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2827                 .maxlen         =       sizeof(int),
2828                 .mode           =       0644,
2829                 .proc_handler   =       proc_dointvec_ms_jiffies,
2830         },
2831         { }
2832 };
2833
2834 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2835 {
2836         struct ctl_table *table;
2837
2838         table = kmemdup(ipv6_route_table_template,
2839                         sizeof(ipv6_route_table_template),
2840                         GFP_KERNEL);
2841
2842         if (table) {
2843                 table[0].data = &net->ipv6.sysctl.flush_delay;
2844                 table[0].extra1 = net;
2845                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2846                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2847                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2848                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2849                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2850                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2851                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2852                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2853                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2854         }
2855
2856         return table;
2857 }
2858 #endif
2859
2860 static int __net_init ip6_route_net_init(struct net *net)
2861 {
2862         int ret = -ENOMEM;
2863
2864         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2865                sizeof(net->ipv6.ip6_dst_ops));
2866
2867         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2868                 goto out_ip6_dst_ops;
2869
2870         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2871                                            sizeof(*net->ipv6.ip6_null_entry),
2872                                            GFP_KERNEL);
2873         if (!net->ipv6.ip6_null_entry)
2874                 goto out_ip6_dst_entries;
2875         net->ipv6.ip6_null_entry->dst.path =
2876                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2877         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2878         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2879                          ip6_template_metrics, true);
2880
2881 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2882         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2883                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2884                                                GFP_KERNEL);
2885         if (!net->ipv6.ip6_prohibit_entry)
2886                 goto out_ip6_null_entry;
2887         net->ipv6.ip6_prohibit_entry->dst.path =
2888                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2889         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2890         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2891                          ip6_template_metrics, true);
2892
2893         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2894                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2895                                                GFP_KERNEL);
2896         if (!net->ipv6.ip6_blk_hole_entry)
2897                 goto out_ip6_prohibit_entry;
2898         net->ipv6.ip6_blk_hole_entry->dst.path =
2899                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2900         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2901         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2902                          ip6_template_metrics, true);
2903 #endif
2904
2905         net->ipv6.sysctl.flush_delay = 0;
2906         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2907         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2908         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2909         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2910         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2911         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2912         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2913
2914         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2915
2916         ret = 0;
2917 out:
2918         return ret;
2919
2920 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2921 out_ip6_prohibit_entry:
2922         kfree(net->ipv6.ip6_prohibit_entry);
2923 out_ip6_null_entry:
2924         kfree(net->ipv6.ip6_null_entry);
2925 #endif
2926 out_ip6_dst_entries:
2927         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2928 out_ip6_dst_ops:
2929         goto out;
2930 }
2931
2932 static void __net_exit ip6_route_net_exit(struct net *net)
2933 {
2934         kfree(net->ipv6.ip6_null_entry);
2935 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2936         kfree(net->ipv6.ip6_prohibit_entry);
2937         kfree(net->ipv6.ip6_blk_hole_entry);
2938 #endif
2939         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2940 }
2941
2942 static int __net_init ip6_route_net_init_late(struct net *net)
2943 {
2944 #ifdef CONFIG_PROC_FS
2945         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2946         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2947 #endif
2948         return 0;
2949 }
2950
2951 static void __net_exit ip6_route_net_exit_late(struct net *net)
2952 {
2953 #ifdef CONFIG_PROC_FS
2954         proc_net_remove(net, "ipv6_route");
2955         proc_net_remove(net, "rt6_stats");
2956 #endif
2957 }
2958
2959 static struct pernet_operations ip6_route_net_ops = {
2960         .init = ip6_route_net_init,
2961         .exit = ip6_route_net_exit,
2962 };
2963
2964 static int __net_init ipv6_inetpeer_init(struct net *net)
2965 {
2966         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2967
2968         if (!bp)
2969                 return -ENOMEM;
2970         inet_peer_base_init(bp);
2971         net->ipv6.peers = bp;
2972         return 0;
2973 }
2974
2975 static void __net_exit ipv6_inetpeer_exit(struct net *net)
2976 {
2977         struct inet_peer_base *bp = net->ipv6.peers;
2978
2979         net->ipv6.peers = NULL;
2980         inetpeer_invalidate_tree(bp);
2981         kfree(bp);
2982 }
2983
2984 static struct pernet_operations ipv6_inetpeer_ops = {
2985         .init   =       ipv6_inetpeer_init,
2986         .exit   =       ipv6_inetpeer_exit,
2987 };
2988
2989 static struct pernet_operations ip6_route_net_late_ops = {
2990         .init = ip6_route_net_init_late,
2991         .exit = ip6_route_net_exit_late,
2992 };
2993
2994 static struct notifier_block ip6_route_dev_notifier = {
2995         .notifier_call = ip6_route_dev_notify,
2996         .priority = 0,
2997 };
2998
2999 int __init ip6_route_init(void)
3000 {
3001         int ret;
3002
3003         ret = -ENOMEM;
3004         ip6_dst_ops_template.kmem_cachep =
3005                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3006                                   SLAB_HWCACHE_ALIGN, NULL);
3007         if (!ip6_dst_ops_template.kmem_cachep)
3008                 goto out;
3009
3010         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3011         if (ret)
3012                 goto out_kmem_cache;
3013
3014         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3015         if (ret)
3016                 goto out_dst_entries;
3017
3018         ret = register_pernet_subsys(&ip6_route_net_ops);
3019         if (ret)
3020                 goto out_register_inetpeer;
3021
3022         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3023
3024         /* Registering of the loopback is done before this portion of code,
3025          * the loopback reference in rt6_info will not be taken, do it
3026          * manually for init_net */
3027         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3028         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3029   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3030         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3031         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3032         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3033         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3034   #endif
3035         ret = fib6_init();
3036         if (ret)
3037                 goto out_register_subsys;
3038
3039         ret = xfrm6_init();
3040         if (ret)
3041                 goto out_fib6_init;
3042
3043         ret = fib6_rules_init();
3044         if (ret)
3045                 goto xfrm6_init;
3046
3047         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3048         if (ret)
3049                 goto fib6_rules_init;
3050
3051         ret = -ENOBUFS;
3052         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3053             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3054             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3055                 goto out_register_late_subsys;
3056
3057         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3058         if (ret)
3059                 goto out_register_late_subsys;
3060
3061 out:
3062         return ret;
3063
3064 out_register_late_subsys:
3065         unregister_pernet_subsys(&ip6_route_net_late_ops);
3066 fib6_rules_init:
3067         fib6_rules_cleanup();
3068 xfrm6_init:
3069         xfrm6_fini();
3070 out_fib6_init:
3071         fib6_gc_cleanup();
3072 out_register_subsys:
3073         unregister_pernet_subsys(&ip6_route_net_ops);
3074 out_register_inetpeer:
3075         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3076 out_dst_entries:
3077         dst_entries_destroy(&ip6_dst_blackhole_ops);
3078 out_kmem_cache:
3079         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3080         goto out;
3081 }
3082
3083 void ip6_route_cleanup(void)
3084 {
3085         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3086         unregister_pernet_subsys(&ip6_route_net_late_ops);
3087         fib6_rules_cleanup();
3088         xfrm6_fini();
3089         fib6_gc_cleanup();
3090         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3091         unregister_pernet_subsys(&ip6_route_net_ops);
3092         dst_entries_destroy(&ip6_dst_blackhole_ops);
3093         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3094 }