[IPV6]: ROUTE: Flag RTF_DEFAULT for Route Information for ::/0.
[pandora-kernel.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/config.h>
29 #include <linux/errno.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/init.h>
39 #include <linux/netlink.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug and route-selection constants.
 *
 * RT6_DEBUG selects the trace level: at 3 or above RDBG() and RT6_TRACE()
 * expand to printk() calls, otherwise both are no-ops.
 */
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
/*
 * When non-zero, ip6_route_input()/ip6_route_output() copy routes that
 * already have a next hop with rt6_alloc_clone(); when zero they use the
 * selected route as it is.
 */
75 #define CLONE_OFFLINK_ROUTE 0
76
/*
 * Flag bits for the "strict" argument of rt6_select()/rt6_score_route():
 *   RT6_SELECT_F_IFACE     - reject routes whose device does not match oif
 *   RT6_SELECT_F_REACHABLE - reject routes whose next hop is not NUD_VALID
 */
77 #define RT6_SELECT_F_IFACE      0x1
78 #define RT6_SELECT_F_REACHABLE  0x2
79
/*
 * Tunables for the IPv6 routing cache; time values are expressed in
 * multiples of HZ.
 */
/* Entry count above which ip6_dst_gc() always runs and reports pressure. */
80 static int ip6_rt_max_size = 4096;
/* Minimum spacing between ip6_dst_gc() runs while below ip6_rt_max_size. */
81 static int ip6_rt_gc_min_interval = HZ / 2;
/* ip6_dst_gc() resets its expiry to half of this once below gc_thresh. */
82 static int ip6_rt_gc_timeout = 60*HZ;
/* Non-static; not used in this part of the file - presumably the periodic GC period, TODO confirm. */
83 int ip6_rt_gc_interval = 30*HZ;
/* Shift by which ip6_dst_gc() decays its expiry on every call. */
84 static int ip6_rt_gc_elasticity = 9;
/* Not used in this part of the file - presumably the lifetime of a learned path MTU, TODO confirm. */
85 static int ip6_rt_mtu_expires = 10*60*HZ;
/* Lower bound applied by ipv6_advmss(). */
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
/*
 * Forward declarations for file-local helpers. Most of them are needed
 * because ip6_dst_ops and ip6_null_entry below refer to them before their
 * definitions.
 */
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
/* Helpers used by rt6_route_rcv(); only built with Route Information support. */
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
/*
 * Destination-cache operations for IPv6 routes. Every rt6_info is allocated
 * against this table (see ip6_dst_alloc()), so entry_size is the size of
 * struct rt6_info and the hooks are the ip6_* callbacks defined in this file.
 */
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
/*
 * Sentinel "no route" entry. rt6_device_match() and rt6_select() return it
 * when nothing usable is found, and it is the initial leaf of the routing
 * table root. It is a reject route on the loopback device: both input and
 * output go to the discard handlers and the error is -ENETUNREACH.
 */
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
/* All-ones metric: the largest value a u32 metric can take. */
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
/*
 * Root node of the IPv6 FIB. It starts out holding only ip6_null_entry and
 * is the tree passed to fib6_lookup()/fib6_add() throughout this file.
 */
143 struct fib6_node ip6_routing_table = {
144         .leaf           = &ip6_null_entry,
145         .fn_flags       = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
146 };
147
/*
 * Lookups in this file take rt6_lock for reading; ip6_ins_rt() and
 * ndisc_dst_alloc() take it for writing.
 */
148 /* Protects all the ip6 fib */
149
150 DEFINE_RWLOCK(rt6_lock);
151
152
153 /* allocate dst with ip6_dst_ops */
154 static __inline__ struct rt6_info *ip6_dst_alloc(void)
155 {
156         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
157 }
158
159 static void ip6_dst_destroy(struct dst_entry *dst)
160 {
161         struct rt6_info *rt = (struct rt6_info *)dst;
162         struct inet6_dev *idev = rt->rt6i_idev;
163
164         if (idev != NULL) {
165                 rt->rt6i_idev = NULL;
166                 in6_dev_put(idev);
167         }       
168 }
169
170 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
171                            int how)
172 {
173         struct rt6_info *rt = (struct rt6_info *)dst;
174         struct inet6_dev *idev = rt->rt6i_idev;
175
176         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
177                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
178                 if (loopback_idev != NULL) {
179                         rt->rt6i_idev = loopback_idev;
180                         in6_dev_put(idev);
181                 }
182         }
183 }
184
185 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
186 {
187         return (rt->rt6i_flags & RTF_EXPIRES &&
188                 time_after(jiffies, rt->rt6i_expires));
189 }
190
191 /*
192  *      Route lookup. Any rt6_lock is implied.
193  */
194
195 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
196                                                     int oif,
197                                                     int strict)
198 {
199         struct rt6_info *local = NULL;
200         struct rt6_info *sprt;
201
202         if (oif) {
203                 for (sprt = rt; sprt; sprt = sprt->u.next) {
204                         struct net_device *dev = sprt->rt6i_dev;
205                         if (dev->ifindex == oif)
206                                 return sprt;
207                         if (dev->flags & IFF_LOOPBACK) {
208                                 if (sprt->rt6i_idev == NULL ||
209                                     sprt->rt6i_idev->dev->ifindex != oif) {
210                                         if (strict && oif)
211                                                 continue;
212                                         if (local && (!oif || 
213                                                       local->rt6i_idev->dev->ifindex == oif))
214                                                 continue;
215                                 }
216                                 local = sprt;
217                         }
218                 }
219
220                 if (local)
221                         return local;
222
223                 if (strict)
224                         return &ip6_null_entry;
225         }
226         return rt;
227 }
228
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards the
 * next hop of @rt when that neighbour is not in a NUD_VALID state.
 *
 * Does nothing when @rt is NULL, has no next hop, or the neighbour is
 * already valid. Probes are rate-limited per neighbour through
 * neigh->updated and the interface's rtr_probe_interval setting.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        struct in6_addr mcaddr;
        struct in6_addr *target;
        int send_probe = 0;

        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        /*
         * neigh->updated is modified below, so the write side of the lock
         * is required; the read side would let two CPUs race on the
         * timestamp and both send a probe.
         */
        write_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                neigh->updated = jiffies;
                send_probe = 1;
        }
        write_unlock_bh(&neigh->lock);

        if (!send_probe)
                return;

        /* Solicit the neighbour on its solicited-node multicast address. */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
        return;
}
#endif
264
265 /*
266  * Default Router Selection (RFC 2461 6.3.6)
267  */
268 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
269 {
270         struct net_device *dev = rt->rt6i_dev;
271         if (!oif || dev->ifindex == oif)
272                 return 2;
273         if ((dev->flags & IFF_LOOPBACK) &&
274             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
275                 return 1;
276         return 0;
277 }
278
279 static int inline rt6_check_neigh(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt->rt6i_nexthop;
282         int m = 0;
283         if (neigh) {
284                 read_lock_bh(&neigh->lock);
285                 if (neigh->nud_state & NUD_VALID)
286                         m = 1;
287                 read_unlock_bh(&neigh->lock);
288         }
289         return m;
290 }
291
292 static int rt6_score_route(struct rt6_info *rt, int oif,
293                            int strict)
294 {
295         int m = rt6_check_dev(rt, oif);
296         if (!m && (strict & RT6_SELECT_F_IFACE))
297                 return -1;
298 #ifdef CONFIG_IPV6_ROUTER_PREF
299         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
300 #endif
301         if (rt6_check_neigh(rt))
302                 m |= 16;
303         else if (strict & RT6_SELECT_F_REACHABLE)
304                 return -1;
305         return m;
306 }
307
/*
 * Pick the best route among the entries at the front of *head that share
 * the first entry's metric.
 *
 * Expired entries are skipped; the others are rated by rt6_score_route()
 * and the highest score wins (the earlier entry wins a tie, because only a
 * strictly greater score replaces the current match). A route that scored
 * but is not, or is no longer, the best one is passed to rt6_probe().
 *
 * If nothing matched while RT6_SELECT_F_REACHABLE was requested, the first
 * entry is moved behind the last non-expired same-metric entry, so the
 * next call starts from a different route.
 *
 * Returns the selected route, or &ip6_null_entry when none qualified.
 * *head is dereferenced unconditionally, so it must not be NULL.
 */
308 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
309                                    int strict)
310 {
311         struct rt6_info *match = NULL, *last = NULL;
312         struct rt6_info *rt, *rt0 = *head;
313         u32 metric;
314         int mpri = -1;
315
316         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
317                   __FUNCTION__, head, head ? *head : NULL, oif);
318
/* Only the leading run of entries with the same metric as the head competes. */
319         for (rt = rt0, metric = rt0->rt6i_metric;
320              rt && rt->rt6i_metric == metric;
321              rt = rt->u.next) {
322                 int m;
323
324                 if (rt6_check_expired(rt))
325                         continue;
326
/* Remember the tail of the non-expired run for the rotation below. */
327                 last = rt;
328
329                 m = rt6_score_route(rt, oif, strict);
330                 if (m < 0)
331                         continue;
332
333                 if (m > mpri) {
/* The previous best (possibly NULL) loses its place and gets probed. */
334                         rt6_probe(match);
335                         match = rt;
336                         mpri = m;
337                 } else {
338                         rt6_probe(rt);
339                 }
340         }
341
342         if (!match &&
343             (strict & RT6_SELECT_F_REACHABLE) &&
344             last && last != rt0) {
345                 /* no entries matched; do round-robin */
/* Unlink rt0 from the front and re-insert it right after "last". */
346                 *head = rt0->u.next;
347                 rt0->u.next = last->u.next;
348                 last->u.next = rt0;
349         }
350
351         RT6_TRACE("%s() => %p, score=%d\n",
352                   __FUNCTION__, match, mpri);
353
354         return (match ? match : &ip6_null_entry);
355 }
356
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option taken from a Router Advertisement.
 *
 * @dev:    interface the advertisement was received on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address passed on to the route lookup/creation helpers
 *
 * Depending on the advertised lifetime the matching route is deleted
 * (lifetime 0), created, or refreshed with the new preference and expiry.
 *
 * Returns 0 once the option has been processed, or -EINVAL when the option
 * is too short or its length/prefix_len fields are inconsistent.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        u32 lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /*
         * Sanity check for prefix_len and length: the option length
         * (counted in units of 8 octets, see RFC 4191) must be large
         * enough to carry prefix_len bits of prefix.
         */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        /*
         * NOTE(review): an invalid preference is silently treated as
         * medium here; confirm this is the intended handling of the
         * reserved preference value.
         */
        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        /* The lifetime arrives in network byte order: convert to host order. */
        lifetime = ntohl(rinfo->lifetime);
        if (lifetime == 0xffffffff) {
                /* infinity */
        } else if (lifetime > 0x7fffffff/HZ) {
                /* Avoid arithmetic overflow */
                lifetime = 0x7fffffff/HZ - 1;
        }

        if (rinfo->length == 3)
                /* The option carries a full 128-bit prefix: use it in place. */
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

        /* A zero lifetime withdraws an existing route. */
        if (rt && !lifetime) {
                ip6_del_rt(rt, NULL, NULL, NULL);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                /* Existing route: replace only the preference bits. */
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (lifetime == 0xffffffff) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                /* Presumably drops the reference returned by the get/add helper - TODO confirm. */
                dst_release(&rt->u.dst);
        }
        return 0;
}
#endif
434
435 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
436                             int oif, int strict)
437 {
438         struct fib6_node *fn;
439         struct rt6_info *rt;
440
441         read_lock_bh(&rt6_lock);
442         fn = fib6_lookup(&ip6_routing_table, daddr, saddr);
443         rt = rt6_device_match(fn->leaf, oif, strict);
444         dst_hold(&rt->u.dst);
445         rt->u.dst.__use++;
446         read_unlock_bh(&rt6_lock);
447
448         rt->u.dst.lastuse = jiffies;
449         if (rt->u.dst.error == 0)
450                 return rt;
451         dst_release(&rt->u.dst);
452         return NULL;
453 }
454
455 /* ip6_ins_rt is called with FREE rt6_lock.
456    It takes new route entry, the addition fails by any reason the
457    route is freed. In any case, if caller does not hold it, it may
458    be destroyed.
459  */
460
461 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
462                 void *_rtattr, struct netlink_skb_parms *req)
463 {
464         int err;
465
466         write_lock_bh(&rt6_lock);
467         err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr, req);
468         write_unlock_bh(&rt6_lock);
469
470         return err;
471 }
472
473 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
474                                       struct in6_addr *saddr)
475 {
476         struct rt6_info *rt;
477
478         /*
479          *      Clone the route.
480          */
481
482         rt = ip6_rt_copy(ort);
483
484         if (rt) {
485                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
486                         if (rt->rt6i_dst.plen != 128 &&
487                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
488                                 rt->rt6i_flags |= RTF_ANYCAST;
489                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
490                 }
491
492                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
493                 rt->rt6i_dst.plen = 128;
494                 rt->rt6i_flags |= RTF_CACHE;
495                 rt->u.dst.flags |= DST_HOST;
496
497 #ifdef CONFIG_IPV6_SUBTREES
498                 if (rt->rt6i_src.plen && saddr) {
499                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
500                         rt->rt6i_src.plen = 128;
501                 }
502 #endif
503
504                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
505
506         }
507
508         return rt;
509 }
510
511 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
512 {
513         struct rt6_info *rt = ip6_rt_copy(ort);
514         if (rt) {
515                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
516                 rt->rt6i_dst.plen = 128;
517                 rt->rt6i_flags |= RTF_CACHE;
518                 if (rt->rt6i_flags & RTF_REJECT)
519                         rt->u.dst.error = ort->u.dst.error;
520                 rt->u.dst.flags |= DST_HOST;
521                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
522         }
523         return rt;
524 }
525
/*
 * Used inside ip6_route_input()/ip6_route_output() after route selection.
 * If the selection produced ip6_null_entry, walk up the tree from "fn":
 * reaching a root node jumps to the caller's "out" label, while the first
 * ancestor that carries route information jumps to "restart" to retry the
 * selection there. Relies on the caller's local variables "rt" and "fn"
 * and on its "out" and "restart" labels.
 */
526 #define BACKTRACK() \
527 if (rt == &ip6_null_entry) { \
528        while ((fn = fn->parent) != NULL) { \
529                 if (fn->fn_flags & RTN_ROOT) { \
530                         goto out; \
531                 } \
532                 if (fn->fn_flags & RTN_RTINFO) \
533                         goto restart; \
534         } \
535 }
536
537
/*
 * Resolve the route for a received packet and store it in skb->dst with a
 * reference held.
 *
 * The lookup first insists on a reachable next hop
 * (RT6_SELECT_F_REACHABLE); whenever that pass ends at the "out" label it
 * is repeated once without the requirement. Multicast and link-local
 * destinations additionally require a match on the receiving interface.
 *
 * When the selected route has no next hop and is not RTF_NONEXTHOP, a
 * per-destination copy is made with rt6_alloc_cow() and inserted into the
 * table. If the copy cannot be made or inserted, the lookup is redone, for
 * at most three attempts in total.
 */
538 void ip6_route_input(struct sk_buff *skb)
539 {
540         struct fib6_node *fn;
541         struct rt6_info *rt, *nrt;
542         int strict;
543         int attempts = 3;
544         int err;
545         int reachable = RT6_SELECT_F_REACHABLE;
546
547         strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
548
549 relookup:
550         read_lock_bh(&rt6_lock);
551
552 restart_2:
553         fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr,
554                          &skb->nh.ipv6h->saddr);
555
556 restart:
557         rt = rt6_select(&fn->leaf, skb->dev->ifindex, strict | reachable);
558         BACKTRACK();
/* Nothing found, or an existing cache entry: no copy is needed. */
559         if (rt == &ip6_null_entry ||
560             rt->rt6i_flags & RTF_CACHE)
561                 goto out;
562
/* Pin the route before dropping the lock; the copy is made unlocked. */
563         dst_hold(&rt->u.dst);
564         read_unlock_bh(&rt6_lock);
565
566         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
567                 nrt = rt6_alloc_cow(rt, &skb->nh.ipv6h->daddr, &skb->nh.ipv6h->saddr);
568         else {
569 #if CLONE_OFFLINK_ROUTE
570                 nrt = rt6_alloc_clone(rt, &skb->nh.ipv6h->daddr);
571 #else
/* Use the selected route directly; the reference taken above is kept. */
572                 goto out2;
573 #endif
574         }
575
576         dst_release(&rt->u.dst);
577         rt = nrt ? : &ip6_null_entry;
578
579         dst_hold(&rt->u.dst);
580         if (nrt) {
/* NOTE(review): passes NETLINK_CB() of a received data skb - confirm this is intended. */
581                 err = ip6_ins_rt(nrt, NULL, NULL, &NETLINK_CB(skb));
582                 if (!err)
583                         goto out2;
584         }
585
586         if (--attempts <= 0)
587                 goto out2;
588
589         /*
590          * Race condition! In the gap, when rt6_lock was
591          * released someone could insert this route.  Relookup.
592          */
593         dst_release(&rt->u.dst);
594         goto relookup;
595
596 out:
/* First pass required reachability: retry once without it (lock still held). */
597         if (reachable) {
598                 reachable = 0;
599                 goto restart_2;
600         }
601         dst_hold(&rt->u.dst);
602         read_unlock_bh(&rt6_lock);
603 out2:
604         rt->u.dst.lastuse = jiffies;
605         rt->u.dst.__use++;
606         skb->dst = (struct dst_entry *) rt;
607         return;
608 }
609
/*
 * Resolve the route for an outgoing flow described by @fl.
 *
 * Mirrors ip6_route_input(): the lookup first insists on a reachable next
 * hop and is repeated once without that requirement whenever the first
 * pass ends at the "out" label. Multicast and link-local destinations
 * require a match on fl->oif. A route without a next hop (and without
 * RTF_NONEXTHOP) is copied per destination with rt6_alloc_cow() and
 * inserted, redoing the lookup on failure for at most three attempts.
 *
 * @sk is not used in this function.
 *
 * Returns the dst entry of the chosen route (possibly ip6_null_entry's)
 * with a reference held; never NULL.
 */
610 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
611 {
612         struct fib6_node *fn;
613         struct rt6_info *rt, *nrt;
614         int strict;
615         int attempts = 3;
616         int err;
617         int reachable = RT6_SELECT_F_REACHABLE;
618
619         strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL) ? RT6_SELECT_F_IFACE : 0;
620
621 relookup:
622         read_lock_bh(&rt6_lock);
623
624 restart_2:
625         fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src);
626
627 restart:
628         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
629         BACKTRACK();
/* Nothing found, or an existing cache entry: no copy is needed. */
630         if (rt == &ip6_null_entry ||
631             rt->rt6i_flags & RTF_CACHE)
632                 goto out;
633
/* Pin the route before dropping the lock; the copy is made unlocked. */
634         dst_hold(&rt->u.dst);
635         read_unlock_bh(&rt6_lock);
636
637         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
638                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
639         else {
640 #if CLONE_OFFLINK_ROUTE
641                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
642 #else
/* Use the selected route directly; the reference taken above is kept. */
643                 goto out2;
644 #endif
645         }
646
647         dst_release(&rt->u.dst);
648         rt = nrt ? : &ip6_null_entry;
649
650         dst_hold(&rt->u.dst);
651         if (nrt) {
652                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
653                 if (!err)
654                         goto out2;
655         }
656
657         if (--attempts <= 0)
658                 goto out2;
659
660         /*
661          * Race condition! In the gap, when rt6_lock was
662          * released someone could insert this route.  Relookup.
663          */
664         dst_release(&rt->u.dst);
665         goto relookup;
666
667 out:
/* First pass required reachability: retry once without it (lock still held). */
668         if (reachable) {
669                 reachable = 0;
670                 goto restart_2;
671         }
672         dst_hold(&rt->u.dst);
673         read_unlock_bh(&rt6_lock);
674 out2:
675         rt->u.dst.lastuse = jiffies;
676         rt->u.dst.__use++;
677         return &rt->u.dst;
678 }
679
680
681 /*
682  *      Destination cache support functions
683  */
684
685 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
686 {
687         struct rt6_info *rt;
688
689         rt = (struct rt6_info *) dst;
690
691         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
692                 return dst;
693
694         return NULL;
695 }
696
697 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
698 {
699         struct rt6_info *rt = (struct rt6_info *) dst;
700
701         if (rt) {
702                 if (rt->rt6i_flags & RTF_CACHE)
703                         ip6_del_rt(rt, NULL, NULL, NULL);
704                 else
705                         dst_release(dst);
706         }
707         return NULL;
708 }
709
710 static void ip6_link_failure(struct sk_buff *skb)
711 {
712         struct rt6_info *rt;
713
714         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
715
716         rt = (struct rt6_info *) skb->dst;
717         if (rt) {
718                 if (rt->rt6i_flags&RTF_CACHE) {
719                         dst_set_expires(&rt->u.dst, 0);
720                         rt->rt6i_flags |= RTF_EXPIRES;
721                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
722                         rt->rt6i_node->fn_sernum = -1;
723         }
724 }
725
726 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
727 {
728         struct rt6_info *rt6 = (struct rt6_info*)dst;
729
730         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
731                 rt6->rt6i_flags |= RTF_MODIFIED;
732                 if (mtu < IPV6_MIN_MTU) {
733                         mtu = IPV6_MIN_MTU;
734                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
735                 }
736                 dst->metrics[RTAX_MTU-1] = mtu;
737         }
738 }
739
/*
 * Singly linked list (through dst->next) of the entries created by
 * ndisc_dst_alloc(); ndisc_dst_gc() frees those whose refcount is zero.
 */
740 /* Protected by rt6_lock.  */
741 static struct dst_entry *ndisc_dst_gc_list;
/* Forward declaration: ndisc_dst_alloc() uses it before its definition. */
742 static int ipv6_get_mtu(struct net_device *dev);
743
744 static inline unsigned int ipv6_advmss(unsigned int mtu)
745 {
746         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
747
748         if (mtu < ip6_rt_min_advmss)
749                 mtu = ip6_rt_min_advmss;
750
751         /*
752          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
753          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
754          * IPV6_MAXPLEN is also valid and means: "any MSS, 
755          * rely only on pmtu discovery"
756          */
757         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
758                 mtu = IPV6_MAXPLEN;
759         return mtu;
760 }
761
762 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
763                                   struct neighbour *neigh,
764                                   struct in6_addr *addr,
765                                   int (*output)(struct sk_buff *))
766 {
767         struct rt6_info *rt;
768         struct inet6_dev *idev = in6_dev_get(dev);
769
770         if (unlikely(idev == NULL))
771                 return NULL;
772
773         rt = ip6_dst_alloc();
774         if (unlikely(rt == NULL)) {
775                 in6_dev_put(idev);
776                 goto out;
777         }
778
779         dev_hold(dev);
780         if (neigh)
781                 neigh_hold(neigh);
782         else
783                 neigh = ndisc_get_neigh(dev, addr);
784
785         rt->rt6i_dev      = dev;
786         rt->rt6i_idev     = idev;
787         rt->rt6i_nexthop  = neigh;
788         atomic_set(&rt->u.dst.__refcnt, 1);
789         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
790         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
791         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
792         rt->u.dst.output  = output;
793
794 #if 0   /* there's no chance to use these for ndisc */
795         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
796                                 ? DST_HOST 
797                                 : 0;
798         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
799         rt->rt6i_dst.plen = 128;
800 #endif
801
802         write_lock_bh(&rt6_lock);
803         rt->u.dst.next = ndisc_dst_gc_list;
804         ndisc_dst_gc_list = &rt->u.dst;
805         write_unlock_bh(&rt6_lock);
806
807         fib6_force_start_gc();
808
809 out:
810         return (struct dst_entry *)rt;
811 }
812
813 int ndisc_dst_gc(int *more)
814 {
815         struct dst_entry *dst, *next, **pprev;
816         int freed;
817
818         next = NULL;
819         pprev = &ndisc_dst_gc_list;
820         freed = 0;
821         while ((dst = *pprev) != NULL) {
822                 if (!atomic_read(&dst->__refcnt)) {
823                         *pprev = dst->next;
824                         dst_free(dst);
825                         freed++;
826                 } else {
827                         pprev = &dst->next;
828                         (*more)++;
829                 }
830         }
831
832         return freed;
833 }
834
835 static int ip6_dst_gc(void)
836 {
837         static unsigned expire = 30*HZ;
838         static unsigned long last_gc;
839         unsigned long now = jiffies;
840
841         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
842             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
843                 goto out;
844
845         expire++;
846         fib6_run_gc(expire);
847         last_gc = now;
848         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
849                 expire = ip6_rt_gc_timeout>>1;
850
851 out:
852         expire -= expire>>ip6_rt_gc_elasticity;
853         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
854 }
855
856 /* Clean host part of a prefix. Not necessary in radix tree,
857    but results in cleaner routing tables.
858
859    Remove it only when all the things will work!
860  */
861
862 static int ipv6_get_mtu(struct net_device *dev)
863 {
864         int mtu = IPV6_MIN_MTU;
865         struct inet6_dev *idev;
866
867         idev = in6_dev_get(dev);
868         if (idev) {
869                 mtu = idev->cnf.mtu6;
870                 in6_dev_put(idev);
871         }
872         return mtu;
873 }
874
875 int ipv6_get_hoplimit(struct net_device *dev)
876 {
877         int hoplimit = ipv6_devconf.hop_limit;
878         struct inet6_dev *idev;
879
880         idev = in6_dev_get(dev);
881         if (idev) {
882                 hoplimit = idev->cnf.hop_limit;
883                 in6_dev_put(idev);
884         }
885         return hoplimit;
886 }
887
888 /*
889  *
890  */
891
892 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
893                 void *_rtattr, struct netlink_skb_parms *req)
894 {
895         int err;
896         struct rtmsg *r;
897         struct rtattr **rta;
898         struct rt6_info *rt = NULL;
899         struct net_device *dev = NULL;
900         struct inet6_dev *idev = NULL;
901         int addr_type;
902
903         rta = (struct rtattr **) _rtattr;
904
905         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
906                 return -EINVAL;
907 #ifndef CONFIG_IPV6_SUBTREES
908         if (rtmsg->rtmsg_src_len)
909                 return -EINVAL;
910 #endif
911         if (rtmsg->rtmsg_ifindex) {
912                 err = -ENODEV;
913                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
914                 if (!dev)
915                         goto out;
916                 idev = in6_dev_get(dev);
917                 if (!idev)
918                         goto out;
919         }
920
921         if (rtmsg->rtmsg_metric == 0)
922                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
923
924         rt = ip6_dst_alloc();
925
926         if (rt == NULL) {
927                 err = -ENOMEM;
928                 goto out;
929         }
930
931         rt->u.dst.obsolete = -1;
932         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
933         if (nlh && (r = NLMSG_DATA(nlh))) {
934                 rt->rt6i_protocol = r->rtm_protocol;
935         } else {
936                 rt->rt6i_protocol = RTPROT_BOOT;
937         }
938
939         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
940
941         if (addr_type & IPV6_ADDR_MULTICAST)
942                 rt->u.dst.input = ip6_mc_input;
943         else
944                 rt->u.dst.input = ip6_forward;
945
946         rt->u.dst.output = ip6_output;
947
948         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
949                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
950         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
951         if (rt->rt6i_dst.plen == 128)
952                rt->u.dst.flags = DST_HOST;
953
954 #ifdef CONFIG_IPV6_SUBTREES
955         ipv6_addr_prefix(&rt->rt6i_src.addr, 
956                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
957         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
958 #endif
959
960         rt->rt6i_metric = rtmsg->rtmsg_metric;
961
962         /* We cannot add true routes via loopback here,
963            they would result in kernel looping; promote them to reject routes
964          */
965         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
966             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
967                 /* hold loopback dev/idev if we haven't done so. */
968                 if (dev != &loopback_dev) {
969                         if (dev) {
970                                 dev_put(dev);
971                                 in6_dev_put(idev);
972                         }
973                         dev = &loopback_dev;
974                         dev_hold(dev);
975                         idev = in6_dev_get(dev);
976                         if (!idev) {
977                                 err = -ENODEV;
978                                 goto out;
979                         }
980                 }
981                 rt->u.dst.output = ip6_pkt_discard_out;
982                 rt->u.dst.input = ip6_pkt_discard;
983                 rt->u.dst.error = -ENETUNREACH;
984                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
985                 goto install_route;
986         }
987
988         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
989                 struct in6_addr *gw_addr;
990                 int gwa_type;
991
992                 gw_addr = &rtmsg->rtmsg_gateway;
993                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
994                 gwa_type = ipv6_addr_type(gw_addr);
995
996                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
997                         struct rt6_info *grt;
998
999                         /* IPv6 strictly inhibits using not link-local
1000                            addresses as nexthop address.
1001                            Otherwise, router will not able to send redirects.
1002                            It is very good, but in some (rare!) circumstances
1003                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1004                            some exceptions. --ANK
1005                          */
1006                         err = -EINVAL;
1007                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1008                                 goto out;
1009
1010                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1011
1012                         err = -EHOSTUNREACH;
1013                         if (grt == NULL)
1014                                 goto out;
1015                         if (dev) {
1016                                 if (dev != grt->rt6i_dev) {
1017                                         dst_release(&grt->u.dst);
1018                                         goto out;
1019                                 }
1020                         } else {
1021                                 dev = grt->rt6i_dev;
1022                                 idev = grt->rt6i_idev;
1023                                 dev_hold(dev);
1024                                 in6_dev_hold(grt->rt6i_idev);
1025                         }
1026                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1027                                 err = 0;
1028                         dst_release(&grt->u.dst);
1029
1030                         if (err)
1031                                 goto out;
1032                 }
1033                 err = -EINVAL;
1034                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1035                         goto out;
1036         }
1037
1038         err = -ENODEV;
1039         if (dev == NULL)
1040                 goto out;
1041
1042         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1043                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1044                 if (IS_ERR(rt->rt6i_nexthop)) {
1045                         err = PTR_ERR(rt->rt6i_nexthop);
1046                         rt->rt6i_nexthop = NULL;
1047                         goto out;
1048                 }
1049         }
1050
1051         rt->rt6i_flags = rtmsg->rtmsg_flags;
1052
1053 install_route:
1054         if (rta && rta[RTA_METRICS-1]) {
1055                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1056                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1057
1058                 while (RTA_OK(attr, attrlen)) {
1059                         unsigned flavor = attr->rta_type;
1060                         if (flavor) {
1061                                 if (flavor > RTAX_MAX) {
1062                                         err = -EINVAL;
1063                                         goto out;
1064                                 }
1065                                 rt->u.dst.metrics[flavor-1] =
1066                                         *(u32 *)RTA_DATA(attr);
1067                         }
1068                         attr = RTA_NEXT(attr, attrlen);
1069                 }
1070         }
1071
1072         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1073                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1074         if (!rt->u.dst.metrics[RTAX_MTU-1])
1075                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1076         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1077                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1078         rt->u.dst.dev = dev;
1079         rt->rt6i_idev = idev;
1080         return ip6_ins_rt(rt, nlh, _rtattr, req);
1081
1082 out:
1083         if (dev)
1084                 dev_put(dev);
1085         if (idev)
1086                 in6_dev_put(idev);
1087         if (rt)
1088                 dst_free((struct dst_entry *) rt);
1089         return err;
1090 }
1091
1092 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1093 {
1094         int err;
1095
1096         write_lock_bh(&rt6_lock);
1097
1098         err = fib6_del(rt, nlh, _rtattr, req);
1099         dst_release(&rt->u.dst);
1100
1101         write_unlock_bh(&rt6_lock);
1102
1103         return err;
1104 }
1105
1106 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1107 {
1108         struct fib6_node *fn;
1109         struct rt6_info *rt;
1110         int err = -ESRCH;
1111
1112         read_lock_bh(&rt6_lock);
1113
1114         fn = fib6_locate(&ip6_routing_table,
1115                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1116                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1117         
1118         if (fn) {
1119                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1120                         if (rtmsg->rtmsg_ifindex &&
1121                             (rt->rt6i_dev == NULL ||
1122                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1123                                 continue;
1124                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1125                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1126                                 continue;
1127                         if (rtmsg->rtmsg_metric &&
1128                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1129                                 continue;
1130                         dst_hold(&rt->u.dst);
1131                         read_unlock_bh(&rt6_lock);
1132
1133                         return ip6_del_rt(rt, nlh, _rtattr, req);
1134                 }
1135         }
1136         read_unlock_bh(&rt6_lock);
1137
1138         return err;
1139 }
1140
1141 /*
1142  *      Handle redirects
1143  */
1144 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1145                   struct neighbour *neigh, u8 *lladdr, int on_link)
1146 {
1147         struct rt6_info *rt, *nrt;
1148
1149         /* Locate old route to this destination. */
1150         rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1);
1151
1152         if (rt == NULL)
1153                 return;
1154
1155         if (neigh->dev != rt->rt6i_dev)
1156                 goto out;
1157
1158         /*
1159          * Current route is on-link; redirect is always invalid.
1160          * 
1161          * Seems, previous statement is not true. It could
1162          * be node, which looks for us as on-link (f.e. proxy ndisc)
1163          * But then router serving it might decide, that we should
1164          * know truth 8)8) --ANK (980726).
1165          */
1166         if (!(rt->rt6i_flags&RTF_GATEWAY))
1167                 goto out;
1168
1169         /*
1170          *      RFC 2461 specifies that redirects should only be
1171          *      accepted if they come from the nexthop to the target.
1172          *      Due to the way default routers are chosen, this notion
1173          *      is a bit fuzzy and one might need to check all default
1174          *      routers.
1175          */
1176         if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway)) {
1177                 if (rt->rt6i_flags & RTF_DEFAULT) {
1178                         struct rt6_info *rt1;
1179
1180                         read_lock(&rt6_lock);
1181                         for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) {
1182                                 if (ipv6_addr_equal(saddr, &rt1->rt6i_gateway)) {
1183                                         dst_hold(&rt1->u.dst);
1184                                         dst_release(&rt->u.dst);
1185                                         read_unlock(&rt6_lock);
1186                                         rt = rt1;
1187                                         goto source_ok;
1188                                 }
1189                         }
1190                         read_unlock(&rt6_lock);
1191                 }
1192                 if (net_ratelimit())
1193                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1194                                "for redirect target\n");
1195                 goto out;
1196         }
1197
1198 source_ok:
1199
1200         /*
1201          *      We have finally decided to accept it.
1202          */
1203
1204         neigh_update(neigh, lladdr, NUD_STALE, 
1205                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1206                      NEIGH_UPDATE_F_OVERRIDE|
1207                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1208                                      NEIGH_UPDATE_F_ISROUTER))
1209                      );
1210
1211         /*
1212          * Redirect received -> path was valid.
1213          * Look, redirects are sent only in response to data packets,
1214          * so that this nexthop apparently is reachable. --ANK
1215          */
1216         dst_confirm(&rt->u.dst);
1217
1218         /* Duplicate redirect: silently ignore. */
1219         if (neigh == rt->u.dst.neighbour)
1220                 goto out;
1221
1222         nrt = ip6_rt_copy(rt);
1223         if (nrt == NULL)
1224                 goto out;
1225
1226         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1227         if (on_link)
1228                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1229
1230         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1231         nrt->rt6i_dst.plen = 128;
1232         nrt->u.dst.flags |= DST_HOST;
1233
1234         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1235         nrt->rt6i_nexthop = neigh_clone(neigh);
1236         /* Reset pmtu, it may be better */
1237         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1238         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1239
1240         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1241                 goto out;
1242
1243         if (rt->rt6i_flags&RTF_CACHE) {
1244                 ip6_del_rt(rt, NULL, NULL, NULL);
1245                 return;
1246         }
1247
1248 out:
1249         dst_release(&rt->u.dst);
1250         return;
1251 }
1252
1253 /*
1254  *      Handle ICMP "packet too big" messages
1255  *      i.e. Path MTU discovery
1256  */
1257
1258 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1259                         struct net_device *dev, u32 pmtu)
1260 {
1261         struct rt6_info *rt, *nrt;
1262         int allfrag = 0;
1263
1264         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1265         if (rt == NULL)
1266                 return;
1267
1268         if (pmtu >= dst_mtu(&rt->u.dst))
1269                 goto out;
1270
1271         if (pmtu < IPV6_MIN_MTU) {
1272                 /*
1273                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1274                  * MTU (1280) and a fragment header should always be included
1275                  * after a node receiving Too Big message reporting PMTU is
1276                  * less than the IPv6 Minimum Link MTU.
1277                  */
1278                 pmtu = IPV6_MIN_MTU;
1279                 allfrag = 1;
1280         }
1281
1282         /* New mtu received -> path was valid.
1283            They are sent only in response to data packets,
1284            so that this nexthop apparently is reachable. --ANK
1285          */
1286         dst_confirm(&rt->u.dst);
1287
1288         /* Host route. If it is static, it would be better
1289            not to override it, but add new one, so that
1290            when cache entry will expire old pmtu
1291            would return automatically.
1292          */
1293         if (rt->rt6i_flags & RTF_CACHE) {
1294                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1295                 if (allfrag)
1296                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1297                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1298                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1299                 goto out;
1300         }
1301
1302         /* Network route.
1303            Two cases are possible:
1304            1. It is connected route. Action: COW
1305            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1306          */
1307         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1308                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1309         else
1310                 nrt = rt6_alloc_clone(rt, daddr);
1311
1312         if (nrt) {
1313                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1314                 if (allfrag)
1315                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1316
1317                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1318                  * happened within 5 mins, the recommended timer is 10 mins.
1319                  * Here this route expiration time is set to ip6_rt_mtu_expires
1320                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1321                  * and detecting PMTU increase will be automatically happened.
1322                  */
1323                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1324                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1325
1326                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1327         }
1328 out:
1329         dst_release(&rt->u.dst);
1330 }
1331
1332 /*
1333  *      Misc support functions
1334  */
1335
1336 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1337 {
1338         struct rt6_info *rt = ip6_dst_alloc();
1339
1340         if (rt) {
1341                 rt->u.dst.input = ort->u.dst.input;
1342                 rt->u.dst.output = ort->u.dst.output;
1343
1344                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1345                 rt->u.dst.dev = ort->u.dst.dev;
1346                 if (rt->u.dst.dev)
1347                         dev_hold(rt->u.dst.dev);
1348                 rt->rt6i_idev = ort->rt6i_idev;
1349                 if (rt->rt6i_idev)
1350                         in6_dev_hold(rt->rt6i_idev);
1351                 rt->u.dst.lastuse = jiffies;
1352                 rt->rt6i_expires = 0;
1353
1354                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1355                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1356                 rt->rt6i_metric = 0;
1357
1358                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1359 #ifdef CONFIG_IPV6_SUBTREES
1360                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1361 #endif
1362         }
1363         return rt;
1364 }
1365
1366 #ifdef CONFIG_IPV6_ROUTE_INFO
1367 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1368                                            struct in6_addr *gwaddr, int ifindex)
1369 {
1370         struct fib6_node *fn;
1371         struct rt6_info *rt = NULL;
1372
1373         write_lock_bh(&rt6_lock);
1374         fn = fib6_locate(&ip6_routing_table, prefix ,prefixlen, NULL, 0);
1375         if (!fn)
1376                 goto out;
1377
1378         for (rt = fn->leaf; rt; rt = rt->u.next) {
1379                 if (rt->rt6i_dev->ifindex != ifindex)
1380                         continue;
1381                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1382                         continue;
1383                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1384                         continue;
1385                 dst_hold(&rt->u.dst);
1386                 break;
1387         }
1388 out:
1389         write_unlock_bh(&rt6_lock);
1390         return rt;
1391 }
1392
1393 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1394                                            struct in6_addr *gwaddr, int ifindex,
1395                                            unsigned pref)
1396 {
1397         struct in6_rtmsg rtmsg;
1398
1399         memset(&rtmsg, 0, sizeof(rtmsg));
1400         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1401         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1402         rtmsg.rtmsg_dst_len = prefixlen;
1403         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1404         rtmsg.rtmsg_metric = 1024;
1405         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1406         /* We should treat it as a default route if prefix length is 0. */
1407         if (!prefixlen)
1408                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1409         rtmsg.rtmsg_ifindex = ifindex;
1410
1411         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1412
1413         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1414 }
1415 #endif
1416
1417 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1418 {       
1419         struct rt6_info *rt;
1420         struct fib6_node *fn;
1421
1422         fn = &ip6_routing_table;
1423
1424         write_lock_bh(&rt6_lock);
1425         for (rt = fn->leaf; rt; rt=rt->u.next) {
1426                 if (dev == rt->rt6i_dev &&
1427                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1428                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1429                         break;
1430         }
1431         if (rt)
1432                 dst_hold(&rt->u.dst);
1433         write_unlock_bh(&rt6_lock);
1434         return rt;
1435 }
1436
1437 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1438                                      struct net_device *dev,
1439                                      unsigned int pref)
1440 {
1441         struct in6_rtmsg rtmsg;
1442
1443         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1444         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1445         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1446         rtmsg.rtmsg_metric = 1024;
1447         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1448                             RTF_PREF(pref);
1449
1450         rtmsg.rtmsg_ifindex = dev->ifindex;
1451
1452         ip6_route_add(&rtmsg, NULL, NULL, NULL);
1453         return rt6_get_dflt_router(gwaddr, dev);
1454 }
1455
1456 void rt6_purge_dflt_routers(void)
1457 {
1458         struct rt6_info *rt;
1459
1460 restart:
1461         read_lock_bh(&rt6_lock);
1462         for (rt = ip6_routing_table.leaf; rt; rt = rt->u.next) {
1463                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1464                         dst_hold(&rt->u.dst);
1465
1466                         read_unlock_bh(&rt6_lock);
1467
1468                         ip6_del_rt(rt, NULL, NULL, NULL);
1469
1470                         goto restart;
1471                 }
1472         }
1473         read_unlock_bh(&rt6_lock);
1474 }
1475
1476 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1477 {
1478         struct in6_rtmsg rtmsg;
1479         int err;
1480
1481         switch(cmd) {
1482         case SIOCADDRT:         /* Add a route */
1483         case SIOCDELRT:         /* Delete a route */
1484                 if (!capable(CAP_NET_ADMIN))
1485                         return -EPERM;
1486                 err = copy_from_user(&rtmsg, arg,
1487                                      sizeof(struct in6_rtmsg));
1488                 if (err)
1489                         return -EFAULT;
1490                         
1491                 rtnl_lock();
1492                 switch (cmd) {
1493                 case SIOCADDRT:
1494                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL);
1495                         break;
1496                 case SIOCDELRT:
1497                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL);
1498                         break;
1499                 default:
1500                         err = -EINVAL;
1501                 }
1502                 rtnl_unlock();
1503
1504                 return err;
1505         };
1506
1507         return -EINVAL;
1508 }
1509
1510 /*
1511  *      Drop the packet on the floor
1512  */
1513
1514 static int ip6_pkt_discard(struct sk_buff *skb)
1515 {
1516         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1517         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1518         kfree_skb(skb);
1519         return 0;
1520 }
1521
1522 static int ip6_pkt_discard_out(struct sk_buff *skb)
1523 {
1524         skb->dev = skb->dst->dev;
1525         return ip6_pkt_discard(skb);
1526 }
1527
1528 /*
1529  *      Allocate a dst for local (unicast / anycast) address.
1530  */
1531
1532 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1533                                     const struct in6_addr *addr,
1534                                     int anycast)
1535 {
1536         struct rt6_info *rt = ip6_dst_alloc();
1537
1538         if (rt == NULL)
1539                 return ERR_PTR(-ENOMEM);
1540
1541         dev_hold(&loopback_dev);
1542         in6_dev_hold(idev);
1543
1544         rt->u.dst.flags = DST_HOST;
1545         rt->u.dst.input = ip6_input;
1546         rt->u.dst.output = ip6_output;
1547         rt->rt6i_dev = &loopback_dev;
1548         rt->rt6i_idev = idev;
1549         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1550         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1551         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1552         rt->u.dst.obsolete = -1;
1553
1554         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1555         if (anycast)
1556                 rt->rt6i_flags |= RTF_ANYCAST;
1557         else
1558                 rt->rt6i_flags |= RTF_LOCAL;
1559         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1560         if (rt->rt6i_nexthop == NULL) {
1561                 dst_free((struct dst_entry *) rt);
1562                 return ERR_PTR(-ENOMEM);
1563         }
1564
1565         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1566         rt->rt6i_dst.plen = 128;
1567
1568         atomic_set(&rt->u.dst.__refcnt, 1);
1569
1570         return rt;
1571 }
1572
1573 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1574 {
1575         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1576             rt != &ip6_null_entry) {
1577                 RT6_TRACE("deleted by ifdown %p\n", rt);
1578                 return -1;
1579         }
1580         return 0;
1581 }
1582
1583 void rt6_ifdown(struct net_device *dev)
1584 {
1585         write_lock_bh(&rt6_lock);
1586         fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev);
1587         write_unlock_bh(&rt6_lock);
1588 }
1589
/* Argument bundle rt6_mtu_change() hands to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU is changing */
	unsigned int mtu;	/* the new MTU */
};
1595
1596 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1597 {
1598         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1599         struct inet6_dev *idev;
1600
1601         /* In IPv6 pmtu discovery is not optional,
1602            so that RTAX_MTU lock cannot disable it.
1603            We still use this lock to block changes
1604            caused by addrconf/ndisc.
1605         */
1606
1607         idev = __in6_dev_get(arg->dev);
1608         if (idev == NULL)
1609                 return 0;
1610
1611         /* For administrative MTU increase, there is no way to discover
1612            IPv6 PMTU increase, so PMTU increase should be updated here.
1613            Since RFC 1981 doesn't include administrative MTU increase
1614            update PMTU increase is a MUST. (i.e. jumbo frame)
1615          */
1616         /*
1617            If new MTU is less than route PMTU, this new MTU will be the
1618            lowest MTU in the path, update the route PMTU to reflect PMTU
1619            decreases; if new MTU is greater than route PMTU, and the
1620            old MTU is the lowest MTU in the path, update the route PMTU
1621            to reflect the increase. In this case if the other nodes' MTU
1622            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1623            PMTU discouvery.
1624          */
1625         if (rt->rt6i_dev == arg->dev &&
1626             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1627             (dst_mtu(&rt->u.dst) > arg->mtu ||
1628              (dst_mtu(&rt->u.dst) < arg->mtu &&
1629               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1630                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1631         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1632         return 0;
1633 }
1634
1635 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1636 {
1637         struct rt6_mtu_change_arg arg;
1638
1639         arg.dev = dev;
1640         arg.mtu = mtu;
1641         read_lock_bh(&rt6_lock);
1642         fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg);
1643         read_unlock_bh(&rt6_lock);
1644 }
1645
1646 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1647                               struct in6_rtmsg *rtmsg)
1648 {
1649         memset(rtmsg, 0, sizeof(*rtmsg));
1650
1651         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1652         rtmsg->rtmsg_src_len = r->rtm_src_len;
1653         rtmsg->rtmsg_flags = RTF_UP;
1654         if (r->rtm_type == RTN_UNREACHABLE)
1655                 rtmsg->rtmsg_flags |= RTF_REJECT;
1656
1657         if (rta[RTA_GATEWAY-1]) {
1658                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1659                         return -EINVAL;
1660                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1661                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1662         }
1663         if (rta[RTA_DST-1]) {
1664                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1665                         return -EINVAL;
1666                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1667         }
1668         if (rta[RTA_SRC-1]) {
1669                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1670                         return -EINVAL;
1671                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1672         }
1673         if (rta[RTA_OIF-1]) {
1674                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1675                         return -EINVAL;
1676                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1677         }
1678         if (rta[RTA_PRIORITY-1]) {
1679                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1680                         return -EINVAL;
1681                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1682         }
1683         return 0;
1684 }
1685
1686 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1687 {
1688         struct rtmsg *r = NLMSG_DATA(nlh);
1689         struct in6_rtmsg rtmsg;
1690
1691         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1692                 return -EINVAL;
1693         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1694 }
1695
1696 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1697 {
1698         struct rtmsg *r = NLMSG_DATA(nlh);
1699         struct in6_rtmsg rtmsg;
1700
1701         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1702                 return -EINVAL;
1703         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb));
1704 }
1705
/*
 * Pairs an output skb with a netlink dump callback.
 * NOTE(review): presumably the argument for rt6_dump_route(); its use
 * lies outside this excerpt -- confirm.
 */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;
	struct netlink_callback *cb;
};
1711
1712 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1713                          struct in6_addr *dst, struct in6_addr *src,
1714                          int iif, int type, u32 pid, u32 seq,
1715                          int prefix, unsigned int flags)
1716 {
1717         struct rtmsg *rtm;
1718         struct nlmsghdr  *nlh;
1719         unsigned char    *b = skb->tail;
1720         struct rta_cacheinfo ci;
1721
1722         if (prefix) {   /* user wants prefix routes only */
1723                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1724                         /* success since this is not a prefix route */
1725                         return 1;
1726                 }
1727         }
1728
1729         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1730         rtm = NLMSG_DATA(nlh);
1731         rtm->rtm_family = AF_INET6;
1732         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1733         rtm->rtm_src_len = rt->rt6i_src.plen;
1734         rtm->rtm_tos = 0;
1735         rtm->rtm_table = RT_TABLE_MAIN;
1736         if (rt->rt6i_flags&RTF_REJECT)
1737                 rtm->rtm_type = RTN_UNREACHABLE;
1738         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1739                 rtm->rtm_type = RTN_LOCAL;
1740         else
1741                 rtm->rtm_type = RTN_UNICAST;
1742         rtm->rtm_flags = 0;
1743         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1744         rtm->rtm_protocol = rt->rt6i_protocol;
1745         if (rt->rt6i_flags&RTF_DYNAMIC)
1746                 rtm->rtm_protocol = RTPROT_REDIRECT;
1747         else if (rt->rt6i_flags & RTF_ADDRCONF)
1748                 rtm->rtm_protocol = RTPROT_KERNEL;
1749         else if (rt->rt6i_flags&RTF_DEFAULT)
1750                 rtm->rtm_protocol = RTPROT_RA;
1751
1752         if (rt->rt6i_flags&RTF_CACHE)
1753                 rtm->rtm_flags |= RTM_F_CLONED;
1754
1755         if (dst) {
1756                 RTA_PUT(skb, RTA_DST, 16, dst);
1757                 rtm->rtm_dst_len = 128;
1758         } else if (rtm->rtm_dst_len)
1759                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1760 #ifdef CONFIG_IPV6_SUBTREES
1761         if (src) {
1762                 RTA_PUT(skb, RTA_SRC, 16, src);
1763                 rtm->rtm_src_len = 128;
1764         } else if (rtm->rtm_src_len)
1765                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1766 #endif
1767         if (iif)
1768                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1769         else if (dst) {
1770                 struct in6_addr saddr_buf;
1771                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1772                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1773         }
1774         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1775                 goto rtattr_failure;
1776         if (rt->u.dst.neighbour)
1777                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1778         if (rt->u.dst.dev)
1779                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1780         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1781         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1782         if (rt->rt6i_expires)
1783                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1784         else
1785                 ci.rta_expires = 0;
1786         ci.rta_used = rt->u.dst.__use;
1787         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1788         ci.rta_error = rt->u.dst.error;
1789         ci.rta_id = 0;
1790         ci.rta_ts = 0;
1791         ci.rta_tsage = 0;
1792         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1793         nlh->nlmsg_len = skb->tail - b;
1794         return skb->len;
1795
1796 nlmsg_failure:
1797 rtattr_failure:
1798         skb_trim(skb, b - skb->data);
1799         return -1;
1800 }
1801
1802 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1803 {
1804         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1805         int prefix;
1806
1807         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1808                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1809                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1810         } else
1811                 prefix = 0;
1812
1813         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1814                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1815                      prefix, NLM_F_MULTI);
1816 }
1817
1818 static int fib6_dump_node(struct fib6_walker_t *w)
1819 {
1820         int res;
1821         struct rt6_info *rt;
1822
1823         for (rt = w->leaf; rt; rt = rt->u.next) {
1824                 res = rt6_dump_route(rt, w->args);
1825                 if (res < 0) {
1826                         /* Frame is full, suspend walking */
1827                         w->leaf = rt;
1828                         return 1;
1829                 }
1830                 BUG_TRAP(res!=0);
1831         }
1832         w->leaf = NULL;
1833         return 0;
1834 }
1835
1836 static void fib6_dump_end(struct netlink_callback *cb)
1837 {
1838         struct fib6_walker_t *w = (void*)cb->args[0];
1839
1840         if (w) {
1841                 cb->args[0] = 0;
1842                 fib6_walker_unlink(w);
1843                 kfree(w);
1844         }
1845         cb->done = (void*)cb->args[1];
1846         cb->args[1] = 0;
1847 }
1848
1849 static int fib6_dump_done(struct netlink_callback *cb)
1850 {
1851         fib6_dump_end(cb);
1852         return cb->done ? cb->done(cb) : 0;
1853 }
1854
1855 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
1856 {
1857         struct rt6_rtnl_dump_arg arg;
1858         struct fib6_walker_t *w;
1859         int res;
1860
1861         arg.skb = skb;
1862         arg.cb = cb;
1863
1864         w = (void*)cb->args[0];
1865         if (w == NULL) {
1866                 /* New dump:
1867                  * 
1868                  * 1. hook callback destructor.
1869                  */
1870                 cb->args[1] = (long)cb->done;
1871                 cb->done = fib6_dump_done;
1872
1873                 /*
1874                  * 2. allocate and initialize walker.
1875                  */
1876                 w = kmalloc(sizeof(*w), GFP_ATOMIC);
1877                 if (w == NULL)
1878                         return -ENOMEM;
1879                 RT6_TRACE("dump<%p", w);
1880                 memset(w, 0, sizeof(*w));
1881                 w->root = &ip6_routing_table;
1882                 w->func = fib6_dump_node;
1883                 w->args = &arg;
1884                 cb->args[0] = (long)w;
1885                 read_lock_bh(&rt6_lock);
1886                 res = fib6_walk(w);
1887                 read_unlock_bh(&rt6_lock);
1888         } else {
1889                 w->args = &arg;
1890                 read_lock_bh(&rt6_lock);
1891                 res = fib6_walk_continue(w);
1892                 read_unlock_bh(&rt6_lock);
1893         }
1894 #if RT6_DEBUG >= 3
1895         if (res <= 0 && skb->len == 0)
1896                 RT6_TRACE("%p>dump end\n", w);
1897 #endif
1898         res = res < 0 ? res : skb->len;
1899         /* res < 0 is an error. (really, impossible)
1900            res == 0 means that dump is complete, but skb still can contain data.
1901            res > 0 dump is not complete, but frame is full.
1902          */
1903         /* Destroy walker, if dump of this table is complete. */
1904         if (res <= 0)
1905                 fib6_dump_end(cb);
1906         return res;
1907 }
1908
1909 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
1910 {
1911         struct rtattr **rta = arg;
1912         int iif = 0;
1913         int err = -ENOBUFS;
1914         struct sk_buff *skb;
1915         struct flowi fl;
1916         struct rt6_info *rt;
1917
1918         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1919         if (skb == NULL)
1920                 goto out;
1921
1922         /* Reserve room for dummy headers, this skb can pass
1923            through good chunk of routing engine.
1924          */
1925         skb->mac.raw = skb->data;
1926         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
1927
1928         memset(&fl, 0, sizeof(fl));
1929         if (rta[RTA_SRC-1])
1930                 ipv6_addr_copy(&fl.fl6_src,
1931                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
1932         if (rta[RTA_DST-1])
1933                 ipv6_addr_copy(&fl.fl6_dst,
1934                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
1935
1936         if (rta[RTA_IIF-1])
1937                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
1938
1939         if (iif) {
1940                 struct net_device *dev;
1941                 dev = __dev_get_by_index(iif);
1942                 if (!dev) {
1943                         err = -ENODEV;
1944                         goto out_free;
1945                 }
1946         }
1947
1948         fl.oif = 0;
1949         if (rta[RTA_OIF-1])
1950                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1951
1952         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
1953
1954         skb->dst = &rt->u.dst;
1955
1956         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
1957         err = rt6_fill_node(skb, rt, 
1958                             &fl.fl6_dst, &fl.fl6_src,
1959                             iif,
1960                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
1961                             nlh->nlmsg_seq, 0, 0);
1962         if (err < 0) {
1963                 err = -EMSGSIZE;
1964                 goto out_free;
1965         }
1966
1967         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
1968         if (err > 0)
1969                 err = 0;
1970 out:
1971         return err;
1972 out_free:
1973         kfree_skb(skb);
1974         goto out;       
1975 }
1976
1977 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
1978                         struct netlink_skb_parms *req)
1979 {
1980         struct sk_buff *skb;
1981         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
1982         u32 pid = current->pid;
1983         u32 seq = 0;
1984
1985         if (req)
1986                 pid = req->pid;
1987         if (nlh)
1988                 seq = nlh->nlmsg_seq;
1989         
1990         skb = alloc_skb(size, gfp_any());
1991         if (!skb) {
1992                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
1993                 return;
1994         }
1995         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
1996                 kfree_skb(skb);
1997                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
1998                 return;
1999         }
2000         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2001         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2002 }
2003
2004 /*
2005  *      /proc
2006  */
2007
2008 #ifdef CONFIG_PROC_FS
2009
/*
 * Size in bytes of one /proc route record as written by
 * rt6_info_route(): three 32-digit hex addresses, two 4-byte
 * " xx " prefix-length fields, and 46 bytes for the four " %08x"
 * words, the " %8s" device name and the trailing newline.
 * NOTE(review): "%8s" is a minimum width, so a device name longer
 * than 8 characters yields a longer record -- confirm callers cope.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Cursor state shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg {
        char *buffer;   /* output buffer records are printed into */
        int offset;     /* read offset requested by the caller */
        int length;     /* output stops once len reaches this */
        int skip;       /* records skipped so far to reach offset */
        int len;        /* bytes written to buffer so far */
};
2020
2021 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2022 {
2023         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2024         int i;
2025
2026         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2027                 arg->skip++;
2028                 return 0;
2029         }
2030
2031         if (arg->len >= arg->length)
2032                 return 0;
2033
2034         for (i=0; i<16; i++) {
2035                 sprintf(arg->buffer + arg->len, "%02x",
2036                         rt->rt6i_dst.addr.s6_addr[i]);
2037                 arg->len += 2;
2038         }
2039         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2040                             rt->rt6i_dst.plen);
2041
2042 #ifdef CONFIG_IPV6_SUBTREES
2043         for (i=0; i<16; i++) {
2044                 sprintf(arg->buffer + arg->len, "%02x",
2045                         rt->rt6i_src.addr.s6_addr[i]);
2046                 arg->len += 2;
2047         }
2048         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2049                             rt->rt6i_src.plen);
2050 #else
2051         sprintf(arg->buffer + arg->len,
2052                 "00000000000000000000000000000000 00 ");
2053         arg->len += 36;
2054 #endif
2055
2056         if (rt->rt6i_nexthop) {
2057                 for (i=0; i<16; i++) {
2058                         sprintf(arg->buffer + arg->len, "%02x",
2059                                 rt->rt6i_nexthop->primary_key[i]);
2060                         arg->len += 2;
2061                 }
2062         } else {
2063                 sprintf(arg->buffer + arg->len,
2064                         "00000000000000000000000000000000");
2065                 arg->len += 32;
2066         }
2067         arg->len += sprintf(arg->buffer + arg->len,
2068                             " %08x %08x %08x %08x %8s\n",
2069                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2070                             rt->u.dst.__use, rt->rt6i_flags, 
2071                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2072         return 0;
2073 }
2074
2075 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2076 {
2077         struct rt6_proc_arg arg;
2078         arg.buffer = buffer;
2079         arg.offset = offset;
2080         arg.length = length;
2081         arg.skip = 0;
2082         arg.len = 0;
2083
2084         read_lock_bh(&rt6_lock);
2085         fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg);
2086         read_unlock_bh(&rt6_lock);
2087
2088         *start = buffer;
2089         if (offset)
2090                 *start += offset % RT6_INFO_LEN;
2091
2092         arg.len -= offset % RT6_INFO_LEN;
2093
2094         if (arg.len > length)
2095                 arg.len = length;
2096         if (arg.len < 0)
2097                 arg.len = 0;
2098
2099         return arg.len;
2100 }
2101
2102 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2103 {
2104         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2105                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2106                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2107                       rt6_stats.fib_rt_cache,
2108                       atomic_read(&ip6_dst_ops.entries),
2109                       rt6_stats.fib_discarded_routes);
2110
2111         return 0;
2112 }
2113
2114 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2115 {
2116         return single_open(file, rt6_stats_seq_show, NULL);
2117 }
2118
2119 static struct file_operations rt6_stats_seq_fops = {
2120         .owner   = THIS_MODULE,
2121         .open    = rt6_stats_seq_open,
2122         .read    = seq_read,
2123         .llseek  = seq_lseek,
2124         .release = single_release,
2125 };
2126 #endif  /* CONFIG_PROC_FS */
2127
2128 #ifdef CONFIG_SYSCTL
2129
2130 static int flush_delay;
2131
2132 static
2133 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2134                               void __user *buffer, size_t *lenp, loff_t *ppos)
2135 {
2136         if (write) {
2137                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2138                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2139                 return 0;
2140         } else
2141                 return -EINVAL;
2142 }
2143
2144 ctl_table ipv6_route_table[] = {
2145         {
2146                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2147                 .procname       =       "flush",
2148                 .data           =       &flush_delay,
2149                 .maxlen         =       sizeof(int),
2150                 .mode           =       0200,
2151                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2152         },
2153         {
2154                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2155                 .procname       =       "gc_thresh",
2156                 .data           =       &ip6_dst_ops.gc_thresh,
2157                 .maxlen         =       sizeof(int),
2158                 .mode           =       0644,
2159                 .proc_handler   =       &proc_dointvec,
2160         },
2161         {
2162                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2163                 .procname       =       "max_size",
2164                 .data           =       &ip6_rt_max_size,
2165                 .maxlen         =       sizeof(int),
2166                 .mode           =       0644,
2167                 .proc_handler   =       &proc_dointvec,
2168         },
2169         {
2170                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2171                 .procname       =       "gc_min_interval",
2172                 .data           =       &ip6_rt_gc_min_interval,
2173                 .maxlen         =       sizeof(int),
2174                 .mode           =       0644,
2175                 .proc_handler   =       &proc_dointvec_jiffies,
2176                 .strategy       =       &sysctl_jiffies,
2177         },
2178         {
2179                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2180                 .procname       =       "gc_timeout",
2181                 .data           =       &ip6_rt_gc_timeout,
2182                 .maxlen         =       sizeof(int),
2183                 .mode           =       0644,
2184                 .proc_handler   =       &proc_dointvec_jiffies,
2185                 .strategy       =       &sysctl_jiffies,
2186         },
2187         {
2188                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2189                 .procname       =       "gc_interval",
2190                 .data           =       &ip6_rt_gc_interval,
2191                 .maxlen         =       sizeof(int),
2192                 .mode           =       0644,
2193                 .proc_handler   =       &proc_dointvec_jiffies,
2194                 .strategy       =       &sysctl_jiffies,
2195         },
2196         {
2197                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2198                 .procname       =       "gc_elasticity",
2199                 .data           =       &ip6_rt_gc_elasticity,
2200                 .maxlen         =       sizeof(int),
2201                 .mode           =       0644,
2202                 .proc_handler   =       &proc_dointvec_jiffies,
2203                 .strategy       =       &sysctl_jiffies,
2204         },
2205         {
2206                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2207                 .procname       =       "mtu_expires",
2208                 .data           =       &ip6_rt_mtu_expires,
2209                 .maxlen         =       sizeof(int),
2210                 .mode           =       0644,
2211                 .proc_handler   =       &proc_dointvec_jiffies,
2212                 .strategy       =       &sysctl_jiffies,
2213         },
2214         {
2215                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2216                 .procname       =       "min_adv_mss",
2217                 .data           =       &ip6_rt_min_advmss,
2218                 .maxlen         =       sizeof(int),
2219                 .mode           =       0644,
2220                 .proc_handler   =       &proc_dointvec_jiffies,
2221                 .strategy       =       &sysctl_jiffies,
2222         },
2223         {
2224                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2225                 .procname       =       "gc_min_interval_ms",
2226                 .data           =       &ip6_rt_gc_min_interval,
2227                 .maxlen         =       sizeof(int),
2228                 .mode           =       0644,
2229                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2230                 .strategy       =       &sysctl_ms_jiffies,
2231         },
2232         { .ctl_name = 0 }
2233 };
2234
2235 #endif
2236
2237 void __init ip6_route_init(void)
2238 {
2239         struct proc_dir_entry *p;
2240
2241         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2242                                                      sizeof(struct rt6_info),
2243                                                      0, SLAB_HWCACHE_ALIGN,
2244                                                      NULL, NULL);
2245         if (!ip6_dst_ops.kmem_cachep)
2246                 panic("cannot create ip6_dst_cache");
2247
2248         fib6_init();
2249 #ifdef  CONFIG_PROC_FS
2250         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2251         if (p)
2252                 p->owner = THIS_MODULE;
2253
2254         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2255 #endif
2256 #ifdef CONFIG_XFRM
2257         xfrm6_init();
2258 #endif
2259 }
2260
2261 void ip6_route_cleanup(void)
2262 {
2263 #ifdef CONFIG_PROC_FS
2264         proc_net_remove("ipv6_route");
2265         proc_net_remove("rt6_stats");
2266 #endif
2267 #ifdef CONFIG_XFRM
2268         xfrm6_fini();
2269 #endif
2270         rt6_ifdown(NULL);
2271         fib6_gc_cleanup();
2272         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2273 }