[IPv4]: FIB configuration using struct fib_config
[pandora-kernel.git] net/ipv4/fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Version:     $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9  *
10  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  */
17
18 #include <asm/uaccess.h>
19 #include <asm/system.h>
20 #include <linux/bitops.h>
21 #include <linux/types.h>
22 #include <linux/kernel.h>
23 #include <linux/jiffies.h>
24 #include <linux/mm.h>
25 #include <linux/string.h>
26 #include <linux/socket.h>
27 #include <linux/sockios.h>
28 #include <linux/errno.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/inetdevice.h>
32 #include <linux/netdevice.h>
33 #include <linux/if_arp.h>
34 #include <linux/proc_fs.h>
35 #include <linux/skbuff.h>
36 #include <linux/init.h>
37
38 #include <net/arp.h>
39 #include <net/ip.h>
40 #include <net/protocol.h>
41 #include <net/route.h>
42 #include <net/tcp.h>
43 #include <net/sock.h>
44 #include <net/ip_fib.h>
45 #include <net/ip_mp_alg.h>
46 #include <net/netlink.h>
47 #include <net/nexthop.h>
48
49 #include "fib_lookup.h"
50
51 #define FSprintk(a...)
52
53 static DEFINE_SPINLOCK(fib_info_lock);
54 static struct hlist_head *fib_info_hash;
55 static struct hlist_head *fib_info_laddrhash;
56 static unsigned int fib_hash_size;
57 static unsigned int fib_info_cnt;
58
59 #define DEVINDEX_HASHBITS 8
60 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
61 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
62
63 #ifdef CONFIG_IP_ROUTE_MULTIPATH
64
65 static DEFINE_SPINLOCK(fib_multipath_lock);
66
67 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
68 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
69
70 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
71 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
72
73 #else /* CONFIG_IP_ROUTE_MULTIPATH */
74
75 /* Hope that gcc will optimize this to get rid of the dummy loop */
76
77 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
78 for (nhsel=0; nhsel < 1; nhsel++)
79
80 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
81 for (nhsel=0; nhsel < 1; nhsel++)
82
83 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
84
85 #define endfor_nexthops(fi) }
86
87
88 static const struct 
89 {
90         int     error;
91         u8      scope;
92 } fib_props[RTA_MAX + 1] = {
93         {
94                 .error  = 0,
95                 .scope  = RT_SCOPE_NOWHERE,
96         },      /* RTN_UNSPEC */
97         {
98                 .error  = 0,
99                 .scope  = RT_SCOPE_UNIVERSE,
100         },      /* RTN_UNICAST */
101         {
102                 .error  = 0,
103                 .scope  = RT_SCOPE_HOST,
104         },      /* RTN_LOCAL */
105         {
106                 .error  = 0,
107                 .scope  = RT_SCOPE_LINK,
108         },      /* RTN_BROADCAST */
109         {
110                 .error  = 0,
111                 .scope  = RT_SCOPE_LINK,
112         },      /* RTN_ANYCAST */
113         {
114                 .error  = 0,
115                 .scope  = RT_SCOPE_UNIVERSE,
116         },      /* RTN_MULTICAST */
117         {
118                 .error  = -EINVAL,
119                 .scope  = RT_SCOPE_UNIVERSE,
120         },      /* RTN_BLACKHOLE */
121         {
122                 .error  = -EHOSTUNREACH,
123                 .scope  = RT_SCOPE_UNIVERSE,
124         },      /* RTN_UNREACHABLE */
125         {
126                 .error  = -EACCES,
127                 .scope  = RT_SCOPE_UNIVERSE,
128         },      /* RTN_PROHIBIT */
129         {
130                 .error  = -EAGAIN,
131                 .scope  = RT_SCOPE_UNIVERSE,
132         },      /* RTN_THROW */
133         {
134                 .error  = -EINVAL,
135                 .scope  = RT_SCOPE_NOWHERE,
136         },      /* RTN_NAT */
137         {
138                 .error  = -EINVAL,
139                 .scope  = RT_SCOPE_NOWHERE,
140         },      /* RTN_XRESOLVE */
141 };
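/*
 * Editorial sketch (not part of the kernel build): a userspace mirror of how
 * the fib_props[] table above is consulted.  For "reject" route types the
 * error field is what fib_semantic_match() hands back to the lookup, and the
 * scope field feeds the sanity check in fib_create_info()
 * (fib_props[type].scope > cfg->fc_scope => -EINVAL).  The demo_* names and
 * the subset of entries below are illustrative only.
 */
#include <errno.h>
#include <stdio.h>

/* mirror of RT_SCOPE_*: larger value == narrower scope */
enum { DEMO_SCOPE_UNIVERSE = 0, DEMO_SCOPE_LINK = 253,
       DEMO_SCOPE_HOST = 254, DEMO_SCOPE_NOWHERE = 255 };

struct demo_prop { int error; unsigned char scope; };

static const struct demo_prop demo_props[] = {
	[1] = { 0,             DEMO_SCOPE_UNIVERSE },	/* RTN_UNICAST     */
	[2] = { 0,             DEMO_SCOPE_HOST     },	/* RTN_LOCAL       */
	[6] = { -EINVAL,       DEMO_SCOPE_UNIVERSE },	/* RTN_BLACKHOLE   */
	[7] = { -EHOSTUNREACH, DEMO_SCOPE_UNIVERSE },	/* RTN_UNREACHABLE */
};

int main(void)
{
	int type = 7;					/* RTN_UNREACHABLE */
	int requested_scope = DEMO_SCOPE_UNIVERSE;	/* e.g. cfg->fc_scope */

	if (demo_props[type].scope > requested_scope)
		printf("rejected at create time (-EINVAL)\n");
	else
		printf("lookup result for this type: error %d\n",
		       demo_props[type].error);
	return 0;
}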
142
143
144 /* Release a nexthop info record */
145
146 void free_fib_info(struct fib_info *fi)
147 {
148         if (fi->fib_dead == 0) {
149                 printk("Freeing alive fib_info %p\n", fi);
150                 return;
151         }
152         change_nexthops(fi) {
153                 if (nh->nh_dev)
154                         dev_put(nh->nh_dev);
155                 nh->nh_dev = NULL;
156         } endfor_nexthops(fi);
157         fib_info_cnt--;
158         kfree(fi);
159 }
160
161 void fib_release_info(struct fib_info *fi)
162 {
163         spin_lock_bh(&fib_info_lock);
164         if (fi && --fi->fib_treeref == 0) {
165                 hlist_del(&fi->fib_hash);
166                 if (fi->fib_prefsrc)
167                         hlist_del(&fi->fib_lhash);
168                 change_nexthops(fi) {
169                         if (!nh->nh_dev)
170                                 continue;
171                         hlist_del(&nh->nh_hash);
172                 } endfor_nexthops(fi)
173                 fi->fib_dead = 1;
174                 fib_info_put(fi);
175         }
176         spin_unlock_bh(&fib_info_lock);
177 }
178
179 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
180 {
181         const struct fib_nh *onh = ofi->fib_nh;
182
183         for_nexthops(fi) {
184                 if (nh->nh_oif != onh->nh_oif ||
185                     nh->nh_gw  != onh->nh_gw ||
186                     nh->nh_scope != onh->nh_scope ||
187 #ifdef CONFIG_IP_ROUTE_MULTIPATH
188                     nh->nh_weight != onh->nh_weight ||
189 #endif
190 #ifdef CONFIG_NET_CLS_ROUTE
191                     nh->nh_tclassid != onh->nh_tclassid ||
192 #endif
193                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
194                         return -1;
195                 onh++;
196         } endfor_nexthops(fi);
197         return 0;
198 }
199
200 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
201 {
202         unsigned int mask = (fib_hash_size - 1);
203         unsigned int val = fi->fib_nhs;
204
205         val ^= fi->fib_protocol;
206         val ^= fi->fib_prefsrc;
207         val ^= fi->fib_priority;
208
209         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
210 }
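/*
 * Editorial sketch (not part of the kernel build): a standalone mirror of
 * fib_info_hashfn() above, handy for seeing how the xor-fold spreads
 * (nhs, protocol, prefsrc, priority) tuples over a power-of-two table.
 * The demo_* names and the sample values are made up for illustration.
 */
#include <stdio.h>

static unsigned int demo_fib_info_hashfn(unsigned int nhs, unsigned int protocol,
					 unsigned int prefsrc, unsigned int priority,
					 unsigned int hash_size)
{
	unsigned int mask = hash_size - 1;	/* hash_size must be a power of two */
	unsigned int val = nhs;

	val ^= protocol;
	val ^= prefsrc;
	val ^= priority;

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

int main(void)
{
	/* one nexthop, protocol 3 (RTPROT_BOOT), prefsrc 10.0.0.1, no metric */
	printf("bucket = %u\n",
	       demo_fib_info_hashfn(1, 3, 0x0a000001, 0, 256));
	return 0;
}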
211
212 static struct fib_info *fib_find_info(const struct fib_info *nfi)
213 {
214         struct hlist_head *head;
215         struct hlist_node *node;
216         struct fib_info *fi;
217         unsigned int hash;
218
219         hash = fib_info_hashfn(nfi);
220         head = &fib_info_hash[hash];
221
222         hlist_for_each_entry(fi, node, head, fib_hash) {
223                 if (fi->fib_nhs != nfi->fib_nhs)
224                         continue;
225                 if (nfi->fib_protocol == fi->fib_protocol &&
226                     nfi->fib_prefsrc == fi->fib_prefsrc &&
227                     nfi->fib_priority == fi->fib_priority &&
228                     memcmp(nfi->fib_metrics, fi->fib_metrics,
229                            sizeof(fi->fib_metrics)) == 0 &&
230                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
231                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
232                         return fi;
233         }
234
235         return NULL;
236 }
237
238 static inline unsigned int fib_devindex_hashfn(unsigned int val)
239 {
240         unsigned int mask = DEVINDEX_HASHSIZE - 1;
241
242         return (val ^
243                 (val >> DEVINDEX_HASHBITS) ^
244                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
245 }
246
247 /* Check that the gateway is already configured.
248    Used only by the redirect accept routine.
249  */
250
251 int ip_fib_check_default(u32 gw, struct net_device *dev)
252 {
253         struct hlist_head *head;
254         struct hlist_node *node;
255         struct fib_nh *nh;
256         unsigned int hash;
257
258         spin_lock(&fib_info_lock);
259
260         hash = fib_devindex_hashfn(dev->ifindex);
261         head = &fib_info_devhash[hash];
262         hlist_for_each_entry(nh, node, head, nh_hash) {
263                 if (nh->nh_dev == dev &&
264                     nh->nh_gw == gw &&
265                     !(nh->nh_flags&RTNH_F_DEAD)) {
266                         spin_unlock(&fib_info_lock);
267                         return 0;
268                 }
269         }
270
271         spin_unlock(&fib_info_lock);
272
273         return -1;
274 }
275
276 void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
277                int dst_len, u32 tb_id, struct nl_info *info)
278 {
279         struct sk_buff *skb;
280         int payload = sizeof(struct rtmsg) + 256;
281         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
282         int err = -ENOBUFS;
283
284         skb = nlmsg_new(nlmsg_total_size(payload), GFP_KERNEL);
285         if (skb == NULL)
286                 goto errout;
287
288         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
289                             fa->fa_type, fa->fa_scope, &key, dst_len,
290                             fa->fa_tos, fa->fa_info, 0);
291         if (err < 0) {
292                 kfree_skb(skb);
293                 goto errout;
294         }
295
296         err = rtnl_notify(skb, info->pid, RTNLGRP_IPV4_ROUTE,
297                           info->nlh, GFP_KERNEL);
298 errout:
299         if (err < 0)
300                 rtnl_set_sk_err(RTNLGRP_IPV4_ROUTE, err);
301 }
302
303 /* Return the first fib alias matching TOS with
304  * priority less than or equal to PRIO.
305  */
306 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
307 {
308         if (fah) {
309                 struct fib_alias *fa;
310                 list_for_each_entry(fa, fah, fa_list) {
311                         if (fa->fa_tos > tos)
312                                 continue;
313                         if (fa->fa_info->fib_priority >= prio ||
314                             fa->fa_tos < tos)
315                                 return fa;
316                 }
317         }
318         return NULL;
319 }
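/*
 * Editorial note: as written, the walk above returns the first alias whose
 * fa_tos is <= the requested tos and which either has a strictly smaller tos
 * or a fib_priority >= the requested prio; callers that keep the alias list
 * ordered can therefore use the result both as a match and as an insertion
 * point.  (This is an interpretation of the loop above, not an authoritative
 * statement about every caller.)
 */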
320
321 int fib_detect_death(struct fib_info *fi, int order,
322                      struct fib_info **last_resort, int *last_idx, int *dflt)
323 {
324         struct neighbour *n;
325         int state = NUD_NONE;
326
327         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
328         if (n) {
329                 state = n->nud_state;
330                 neigh_release(n);
331         }
332         if (state==NUD_REACHABLE)
333                 return 0;
334         if ((state&NUD_VALID) && order != *dflt)
335                 return 0;
336         if ((state&NUD_VALID) ||
337             (*last_idx<0 && order > *dflt)) {
338                 *last_resort = fi;
339                 *last_idx = order;
340         }
341         return 1;
342 }
343
344 #ifdef CONFIG_IP_ROUTE_MULTIPATH
345
346 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
347 {
348         int nhs = 0;
349
350         while (rtnh_ok(rtnh, remaining)) {
351                 nhs++;
352                 rtnh = rtnh_next(rtnh, &remaining);
353         }
354
355         /* leftover implies invalid nexthop configuration, discard it */
356         return remaining > 0 ? 0 : nhs;
357 }
358
359 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
360                        int remaining, struct fib_config *cfg)
361 {
362         change_nexthops(fi) {
363                 int attrlen;
364
365                 if (!rtnh_ok(rtnh, remaining))
366                         return -EINVAL;
367
368                 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
369                 nh->nh_oif = rtnh->rtnh_ifindex;
370                 nh->nh_weight = rtnh->rtnh_hops + 1;
371
372                 attrlen = rtnh_attrlen(rtnh);
373                 if (attrlen > 0) {
374                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
375
376                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
377                         nh->nh_gw = nla ? nla_get_u32(nla) : 0;
378 #ifdef CONFIG_NET_CLS_ROUTE
379                         nla = nla_find(attrs, attrlen, RTA_FLOW);
380                         nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
381 #endif
382                 }
383
384                 rtnh = rtnh_next(rtnh, &remaining);
385         } endfor_nexthops(fi);
386
387         return 0;
388 }
389
390 #endif
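/*
 * Editorial note on the RTA_MULTIPATH layout consumed above: the attribute
 * payload is a sequence of struct rtnexthop records, each optionally followed
 * by its own nested attributes (e.g. RTA_GATEWAY, RTA_FLOW), with rtnh_len
 * covering the record plus its attributes:
 *
 *   | rtnexthop #0 | RTA_GATEWAY ... | rtnexthop #1 | RTA_GATEWAY ... | ...
 *   |<------- rtnh_len ------------->|<------- rtnh_len ------------->|
 *
 * fib_count_nexthops() walks this sequence with rtnh_ok()/rtnh_next() to size
 * the fib_info, and fib_get_nhs() walks it again to fill each fib_nh.
 */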
391
392 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
393 {
394 #ifdef CONFIG_IP_ROUTE_MULTIPATH
395         struct rtnexthop *rtnh;
396         int remaining;
397 #endif
398
399         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
400                 return 1;
401
402         if (cfg->fc_oif || cfg->fc_gw) {
403                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
404                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
405                         return 0;
406                 return 1;
407         }
408
409 #ifdef CONFIG_IP_ROUTE_MULTIPATH
410         if (cfg->fc_mp == NULL)
411                 return 0;
412
413         rtnh = cfg->fc_mp;
414         remaining = cfg->fc_mp_len;
415         
416         for_nexthops(fi) {
417                 int attrlen;
418
419                 if (!rtnh_ok(rtnh, remaining))
420                         return -EINVAL;
421
422                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
423                         return 1;
424
425                 attrlen = rtnh_attrlen(rtnh);
426                 if (attrlen > 0) {
427                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
428
429                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
430                         if (nla && nla_get_u32(nla) != nh->nh_gw)
431                                 return 1;
432 #ifdef CONFIG_NET_CLS_ROUTE
433                         nla = nla_find(attrs, attrlen, RTA_FLOW);
434                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
435                                 return 1;
436 #endif
437                 }
438
439                 rtnh = rtnh_next(rtnh, &remaining);
440         } endfor_nexthops(fi);
441 #endif
442         return 0;
443 }
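/*
 * Editorial note: fib_nh_match() returns 0 when the nexthop description in
 * cfg is compatible with fi (i.e. "this is the route the caller meant"),
 * 1 when it is not, and -EINVAL when a supplied RTA_MULTIPATH blob is
 * malformed; route deletion uses it to pick the matching alias.
 */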
444
445
446 /*
447    Picture
448    -------
449
450    The semantics of nexthops are very messy for historical reasons.
451    We have to take into account that:
452    a) the gateway can actually be a local interface address,
453       so that a gatewayed route is direct.
454    b) the gateway must be an on-link address, possibly
455       described not by an ifaddr but by a direct route.
456    c) if both gateway and interface are specified, they must not
457       contradict each other.
458    d) if we use tunnel routes, the gateway may not be on-link.
459
460    Attempting to reconcile all of these (alas, self-contradictory) conditions
461    results in pretty ugly and hairy code with obscure logic.
462
463    I chose to generalize it instead, so that the size of the code
464    does not increase in practice, but it becomes
465    much more general.
466    Every prefix is assigned a "scope" value: "host" is a local address,
467    "link" is a direct route,
468    [ ... "site" ... "interior" ... ]
469    and "universe" is a true gateway route with global meaning.
470
471    Every prefix refers to a set of "nexthop"s (gw, oif),
472    where gw must have narrower scope. This recursion stops
473    when gw has LOCAL scope or when the "nexthop" is declared ONLINK,
474    which means that gw is forced to be on-link.
475
476    The code is still hairy, but now it is apparently logically
477    consistent and very flexible. E.g., as a by-product it allows
478    independent exterior and interior routing processes
479    to coexist in peace.
480
481    Normally it looks like the following:
482
483    {universe prefix}  -> (gw, oif) [scope link]
484                           |
485                           |-> {link prefix} -> (gw, oif) [scope local]
486                                                 |
487                                                 |-> {local prefix} (terminal node)
488  */
489
490 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
491                         struct fib_nh *nh)
492 {
493         int err;
494
495         if (nh->nh_gw) {
496                 struct fib_result res;
497
498 #ifdef CONFIG_IP_ROUTE_PERVASIVE
499                 if (nh->nh_flags&RTNH_F_PERVASIVE)
500                         return 0;
501 #endif
502                 if (nh->nh_flags&RTNH_F_ONLINK) {
503                         struct net_device *dev;
504
505                         if (cfg->fc_scope >= RT_SCOPE_LINK)
506                                 return -EINVAL;
507                         if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
508                                 return -EINVAL;
509                         if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
510                                 return -ENODEV;
511                         if (!(dev->flags&IFF_UP))
512                                 return -ENETDOWN;
513                         nh->nh_dev = dev;
514                         dev_hold(dev);
515                         nh->nh_scope = RT_SCOPE_LINK;
516                         return 0;
517                 }
518                 {
519                         struct flowi fl = {
520                                 .nl_u = {
521                                         .ip4_u = {
522                                                 .daddr = nh->nh_gw,
523                                                 .scope = cfg->fc_scope + 1,
524                                         },
525                                 },
526                                 .oif = nh->nh_oif,
527                         };
528
529                         /* It is not necessary, but requires a bit of thinking */
530                         if (fl.fl4_scope < RT_SCOPE_LINK)
531                                 fl.fl4_scope = RT_SCOPE_LINK;
532                         if ((err = fib_lookup(&fl, &res)) != 0)
533                                 return err;
534                 }
535                 err = -EINVAL;
536                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
537                         goto out;
538                 nh->nh_scope = res.scope;
539                 nh->nh_oif = FIB_RES_OIF(res);
540                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
541                         goto out;
542                 dev_hold(nh->nh_dev);
543                 err = -ENETDOWN;
544                 if (!(nh->nh_dev->flags & IFF_UP))
545                         goto out;
546                 err = 0;
547 out:
548                 fib_res_put(&res);
549                 return err;
550         } else {
551                 struct in_device *in_dev;
552
553                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
554                         return -EINVAL;
555
556                 in_dev = inetdev_by_index(nh->nh_oif);
557                 if (in_dev == NULL)
558                         return -ENODEV;
559                 if (!(in_dev->dev->flags&IFF_UP)) {
560                         in_dev_put(in_dev);
561                         return -ENETDOWN;
562                 }
563                 nh->nh_dev = in_dev->dev;
564                 dev_hold(nh->nh_dev);
565                 nh->nh_scope = RT_SCOPE_HOST;
566                 in_dev_put(in_dev);
567         }
568         return 0;
569 }
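/*
 * Editorial note: this is where the scope recursion described in the
 * "Picture" comment is enforced.  For a plain gateway the nexthop is resolved
 * with fl4_scope = cfg->fc_scope + 1 (but at least RT_SCOPE_LINK), i.e. the
 * gateway must be reachable by a strictly narrower-scoped route; RTNH_F_ONLINK
 * short-circuits that and pins the gateway to the nominated device instead.
 */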
570
571 static inline unsigned int fib_laddr_hashfn(u32 val)
572 {
573         unsigned int mask = (fib_hash_size - 1);
574
575         return (val ^ (val >> 7) ^ (val >> 14)) & mask;
576 }
577
578 static struct hlist_head *fib_hash_alloc(int bytes)
579 {
580         if (bytes <= PAGE_SIZE)
581                 return kmalloc(bytes, GFP_KERNEL);
582         else
583                 return (struct hlist_head *)
584                         __get_free_pages(GFP_KERNEL, get_order(bytes));
585 }
586
587 static void fib_hash_free(struct hlist_head *hash, int bytes)
588 {
589         if (!hash)
590                 return;
591
592         if (bytes <= PAGE_SIZE)
593                 kfree(hash);
594         else
595                 free_pages((unsigned long) hash, get_order(bytes));
596 }
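/*
 * Editorial sketch (not part of the kernel build): fib_hash_alloc() uses
 * kmalloc() for requests that fit in one page and falls back to the page
 * allocator above that, so fib_hash_free() must mirror the same threshold.
 * The helper below is a userspace approximation of get_order(): the smallest
 * order such that (page_size << order) covers `bytes`.  demo_* names are
 * made up for illustration.
 */
#include <stdio.h>

static int demo_get_order(unsigned long bytes, unsigned long page_size)
{
	unsigned long pages = (bytes + page_size - 1) / page_size;
	int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	unsigned long page = 4096;

	printf("order(4096)=%d order(4097)=%d order(32768)=%d\n",
	       demo_get_order(4096, page), demo_get_order(4097, page),
	       demo_get_order(32768, page));
	return 0;
}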
597
598 static void fib_hash_move(struct hlist_head *new_info_hash,
599                           struct hlist_head *new_laddrhash,
600                           unsigned int new_size)
601 {
602         struct hlist_head *old_info_hash, *old_laddrhash;
603         unsigned int old_size = fib_hash_size;
604         unsigned int i, bytes;
605
606         spin_lock_bh(&fib_info_lock);
607         old_info_hash = fib_info_hash;
608         old_laddrhash = fib_info_laddrhash;
609         fib_hash_size = new_size;
610
611         for (i = 0; i < old_size; i++) {
612                 struct hlist_head *head = &fib_info_hash[i];
613                 struct hlist_node *node, *n;
614                 struct fib_info *fi;
615
616                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
617                         struct hlist_head *dest;
618                         unsigned int new_hash;
619
620                         hlist_del(&fi->fib_hash);
621
622                         new_hash = fib_info_hashfn(fi);
623                         dest = &new_info_hash[new_hash];
624                         hlist_add_head(&fi->fib_hash, dest);
625                 }
626         }
627         fib_info_hash = new_info_hash;
628
629         for (i = 0; i < old_size; i++) {
630                 struct hlist_head *lhead = &fib_info_laddrhash[i];
631                 struct hlist_node *node, *n;
632                 struct fib_info *fi;
633
634                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
635                         struct hlist_head *ldest;
636                         unsigned int new_hash;
637
638                         hlist_del(&fi->fib_lhash);
639
640                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
641                         ldest = &new_laddrhash[new_hash];
642                         hlist_add_head(&fi->fib_lhash, ldest);
643                 }
644         }
645         fib_info_laddrhash = new_laddrhash;
646
647         spin_unlock_bh(&fib_info_lock);
648
649         bytes = old_size * sizeof(struct hlist_head *);
650         fib_hash_free(old_info_hash, bytes);
651         fib_hash_free(old_laddrhash, bytes);
652 }
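/*
 * Editorial note: fib_hash_move() is the resize step used by
 * fib_create_info() when fib_info_cnt reaches fib_hash_size.  Both hash
 * tables are rebuilt bucket by bucket with fib_info_lock held, and the old
 * arrays are freed only after the new ones have been published.
 */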
653
654 struct fib_info *fib_create_info(struct fib_config *cfg)
655 {
656         int err;
657         struct fib_info *fi = NULL;
658         struct fib_info *ofi;
659         int nhs = 1;
660
661         /* Fast check to catch the most weird cases */
662         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
663                 goto err_inval;
664
665 #ifdef CONFIG_IP_ROUTE_MULTIPATH
666         if (cfg->fc_mp) {
667                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
668                 if (nhs == 0)
669                         goto err_inval;
670         }
671 #endif
672 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
673         if (cfg->fc_mp_alg) {
674                 if (cfg->fc_mp_alg < IP_MP_ALG_NONE ||
675                     cfg->fc_mp_alg > IP_MP_ALG_MAX)
676                         goto err_inval;
677         }
678 #endif
679
680         err = -ENOBUFS;
681         if (fib_info_cnt >= fib_hash_size) {
682                 unsigned int new_size = fib_hash_size << 1;
683                 struct hlist_head *new_info_hash;
684                 struct hlist_head *new_laddrhash;
685                 unsigned int bytes;
686
687                 if (!new_size)
688                         new_size = 1;
689                 bytes = new_size * sizeof(struct hlist_head *);
690                 new_info_hash = fib_hash_alloc(bytes);
691                 new_laddrhash = fib_hash_alloc(bytes);
692                 if (!new_info_hash || !new_laddrhash) {
693                         fib_hash_free(new_info_hash, bytes);
694                         fib_hash_free(new_laddrhash, bytes);
695                 } else {
696                         memset(new_info_hash, 0, bytes);
697                         memset(new_laddrhash, 0, bytes);
698
699                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
700                 }
701
702                 if (!fib_hash_size)
703                         goto failure;
704         }
705
706         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
707         if (fi == NULL)
708                 goto failure;
709         fib_info_cnt++;
710
711         fi->fib_protocol = cfg->fc_protocol;
712         fi->fib_flags = cfg->fc_flags;
713         fi->fib_priority = cfg->fc_priority;
714         fi->fib_prefsrc = cfg->fc_prefsrc;
715
716         fi->fib_nhs = nhs;
717         change_nexthops(fi) {
718                 nh->nh_parent = fi;
719         } endfor_nexthops(fi)
720
721         if (cfg->fc_mx) {
722                 struct nlattr *nla;
723                 int remaining;
724
725                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
726                         int type = nla->nla_type;
727
728                         if (type) {
729                                 if (type > RTAX_MAX)
730                                         goto err_inval;
731                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
732                         }
733                 }
734         }
735
736         if (cfg->fc_mp) {
737 #ifdef CONFIG_IP_ROUTE_MULTIPATH
738                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
739                 if (err != 0)
740                         goto failure;
741                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
742                         goto err_inval;
743                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
744                         goto err_inval;
745 #ifdef CONFIG_NET_CLS_ROUTE
746                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
747                         goto err_inval;
748 #endif
749 #else
750                 goto err_inval;
751 #endif
752         } else {
753                 struct fib_nh *nh = fi->fib_nh;
754
755                 nh->nh_oif = cfg->fc_oif;
756                 nh->nh_gw = cfg->fc_gw;
757                 nh->nh_flags = cfg->fc_flags;
758 #ifdef CONFIG_NET_CLS_ROUTE
759                 nh->nh_tclassid = cfg->fc_flow;
760 #endif
761 #ifdef CONFIG_IP_ROUTE_MULTIPATH
762                 nh->nh_weight = 1;
763 #endif
764         }
765
766 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
767         fi->fib_mp_alg = cfg->fc_mp_alg;
768 #endif
769
770         if (fib_props[cfg->fc_type].error) {
771                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
772                         goto err_inval;
773                 goto link_it;
774         }
775
776         if (cfg->fc_scope > RT_SCOPE_HOST)
777                 goto err_inval;
778
779         if (cfg->fc_scope == RT_SCOPE_HOST) {
780                 struct fib_nh *nh = fi->fib_nh;
781
782                 /* Local address is added. */
783                 if (nhs != 1 || nh->nh_gw)
784                         goto err_inval;
785                 nh->nh_scope = RT_SCOPE_NOWHERE;
786                 nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
787                 err = -ENODEV;
788                 if (nh->nh_dev == NULL)
789                         goto failure;
790         } else {
791                 change_nexthops(fi) {
792                         if ((err = fib_check_nh(cfg, fi, nh)) != 0)
793                                 goto failure;
794                 } endfor_nexthops(fi)
795         }
796
797         if (fi->fib_prefsrc) {
798                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
799                     fi->fib_prefsrc != cfg->fc_dst)
800                         if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
801                                 goto err_inval;
802         }
803
804 link_it:
805         if ((ofi = fib_find_info(fi)) != NULL) {
806                 fi->fib_dead = 1;
807                 free_fib_info(fi);
808                 ofi->fib_treeref++;
809                 return ofi;
810         }
811
812         fi->fib_treeref++;
813         atomic_inc(&fi->fib_clntref);
814         spin_lock_bh(&fib_info_lock);
815         hlist_add_head(&fi->fib_hash,
816                        &fib_info_hash[fib_info_hashfn(fi)]);
817         if (fi->fib_prefsrc) {
818                 struct hlist_head *head;
819
820                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
821                 hlist_add_head(&fi->fib_lhash, head);
822         }
823         change_nexthops(fi) {
824                 struct hlist_head *head;
825                 unsigned int hash;
826
827                 if (!nh->nh_dev)
828                         continue;
829                 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
830                 head = &fib_info_devhash[hash];
831                 hlist_add_head(&nh->nh_hash, head);
832         } endfor_nexthops(fi)
833         spin_unlock_bh(&fib_info_lock);
834         return fi;
835
836 err_inval:
837         err = -EINVAL;
838
839 failure:
840         if (fi) {
841                 fi->fib_dead = 1;
842                 free_fib_info(fi);
843         }
844
845         return ERR_PTR(err);
846 }
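/*
 * Editorial sketch: roughly how a userspace request maps onto the
 * struct fib_config fields consumed above (illustrative, not a complete
 * list of fields).  For something like
 *
 *     ip route add 10.0.0.0/8 via 192.168.1.1 dev eth0 metric 10
 *
 * the rtnetlink handler fills approximately:
 *
 *     cfg->fc_type     = RTN_UNICAST;
 *     cfg->fc_scope    = RT_SCOPE_UNIVERSE;
 *     cfg->fc_dst      = 10.0.0.0 (RTA_DST), prefix length 8;
 *     cfg->fc_gw       = 192.168.1.1 (RTA_GATEWAY);
 *     cfg->fc_oif      = ifindex of eth0 (RTA_OIF);
 *     cfg->fc_priority = 10 (RTA_PRIORITY);
 *
 * and a multipath route would instead carry its nexthops in
 * cfg->fc_mp / cfg->fc_mp_len as an RTA_MULTIPATH blob.
 */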
847
848 /* Note! fib_semantic_match intentionally uses RCU list functions. */
849 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
850                        struct fib_result *res, __u32 zone, __u32 mask, 
851                         int prefixlen)
852 {
853         struct fib_alias *fa;
854         int nh_sel = 0;
855
856         list_for_each_entry_rcu(fa, head, fa_list) {
857                 int err;
858
859                 if (fa->fa_tos &&
860                     fa->fa_tos != flp->fl4_tos)
861                         continue;
862
863                 if (fa->fa_scope < flp->fl4_scope)
864                         continue;
865
866                 fa->fa_state |= FA_S_ACCESSED;
867
868                 err = fib_props[fa->fa_type].error;
869                 if (err == 0) {
870                         struct fib_info *fi = fa->fa_info;
871
872                         if (fi->fib_flags & RTNH_F_DEAD)
873                                 continue;
874
875                         switch (fa->fa_type) {
876                         case RTN_UNICAST:
877                         case RTN_LOCAL:
878                         case RTN_BROADCAST:
879                         case RTN_ANYCAST:
880                         case RTN_MULTICAST:
881                                 for_nexthops(fi) {
882                                         if (nh->nh_flags&RTNH_F_DEAD)
883                                                 continue;
884                                         if (!flp->oif || flp->oif == nh->nh_oif)
885                                                 break;
886                                 }
887 #ifdef CONFIG_IP_ROUTE_MULTIPATH
888                                 if (nhsel < fi->fib_nhs) {
889                                         nh_sel = nhsel;
890                                         goto out_fill_res;
891                                 }
892 #else
893                                 if (nhsel < 1) {
894                                         goto out_fill_res;
895                                 }
896 #endif
897                                 endfor_nexthops(fi);
898                                 continue;
899
900                         default:
901                                 printk(KERN_DEBUG "impossible 102\n");
902                                 return -EINVAL;
903                         };
904                 }
905                 return err;
906         }
907         return 1;
908
909 out_fill_res:
910         res->prefixlen = prefixlen;
911         res->nh_sel = nh_sel;
912         res->type = fa->fa_type;
913         res->scope = fa->fa_scope;
914         res->fi = fa->fa_info;
915 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
916         res->netmask = mask;
917         res->network = zone &
918                 (0xFFFFFFFF >> (32 - prefixlen));
919 #endif
920         atomic_inc(&res->fi->fib_clntref);
921         return 0;
922 }
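/*
 * Editorial note on the return convention above: 0 means a usable alias was
 * found and *res has been filled (with a reference taken on res->fi via
 * fib_clntref); a positive value (1) means no alias matched and the caller
 * should keep looking at shorter prefixes; a negative value is the semantic
 * error from fib_props[] (e.g. -EHOSTUNREACH for an RTN_UNREACHABLE route)
 * and is propagated back to the original lookup.
 */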
923
924 /* Find appropriate source address to this destination */
925
926 u32 __fib_res_prefsrc(struct fib_result *res)
927 {
928         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
929 }
930
931 int
932 fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
933               u32 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
934               struct fib_info *fi, unsigned int flags)
935 {
936         struct rtmsg *rtm;
937         struct nlmsghdr  *nlh;
938         unsigned char    *b = skb->tail;
939
940         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*rtm), flags);
941         rtm = NLMSG_DATA(nlh);
942         rtm->rtm_family = AF_INET;
943         rtm->rtm_dst_len = dst_len;
944         rtm->rtm_src_len = 0;
945         rtm->rtm_tos = tos;
946         rtm->rtm_table = tb_id;
947         RTA_PUT_U32(skb, RTA_TABLE, tb_id);
948         rtm->rtm_type = type;
949         rtm->rtm_flags = fi->fib_flags;
950         rtm->rtm_scope = scope;
951         if (rtm->rtm_dst_len)
952                 RTA_PUT(skb, RTA_DST, 4, dst);
953         rtm->rtm_protocol = fi->fib_protocol;
954         if (fi->fib_priority)
955                 RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
956         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
957                 goto rtattr_failure;
958         if (fi->fib_prefsrc)
959                 RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
960         if (fi->fib_nhs == 1) {
961                 if (fi->fib_nh->nh_gw)
962                         RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
963                 if (fi->fib_nh->nh_oif)
964                         RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
965 #ifdef CONFIG_NET_CLS_ROUTE
966                 if (fi->fib_nh[0].nh_tclassid)
967                         RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
968 #endif
969         }
970 #ifdef CONFIG_IP_ROUTE_MULTIPATH
971         if (fi->fib_nhs > 1) {
972                 struct rtnexthop *nhp;
973                 struct rtattr *mp_head;
974                 if (skb_tailroom(skb) <= RTA_SPACE(0))
975                         goto rtattr_failure;
976                 mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
977
978                 for_nexthops(fi) {
979                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
980                                 goto rtattr_failure;
981                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
982                         nhp->rtnh_flags = nh->nh_flags & 0xFF;
983                         nhp->rtnh_hops = nh->nh_weight-1;
984                         nhp->rtnh_ifindex = nh->nh_oif;
985                         if (nh->nh_gw)
986                                 RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
987 #ifdef CONFIG_NET_CLS_ROUTE
988                         if (nh->nh_tclassid)
989                                 RTA_PUT(skb, RTA_FLOW, 4, &nh->nh_tclassid);
990 #endif
991                         nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
992                 } endfor_nexthops(fi);
993                 mp_head->rta_type = RTA_MULTIPATH;
994                 mp_head->rta_len = skb->tail - (u8*)mp_head;
995         }
996 #endif
997         nlh->nlmsg_len = skb->tail - b;
998         return skb->len;
999
1000 nlmsg_failure:
1001 rtattr_failure:
1002         skb_trim(skb, b - skb->data);
1003         return -1;
1004 }
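/*
 * Editorial note: the message built above is a struct rtmsg header followed
 * by route attributes: RTA_TABLE, RTA_DST, RTA_PRIORITY, metrics and
 * RTA_PREFSRC where applicable, then either RTA_GATEWAY/RTA_OIF/RTA_FLOW for
 * a single-nexthop route or one RTA_MULTIPATH attribute wrapping a
 * struct rtnexthop per nexthop.  On overflow the partially written message
 * is trimmed away and -1 is returned to the caller.
 */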
1005
1006 /*
1007    Update FIB if:
1008    - a local address disappeared -> we must delete all the entries
1009      referring to it.
1010    - a device went down -> we must shut down all nexthops going through it.
1011  */
1012
1013 int fib_sync_down(u32 local, struct net_device *dev, int force)
1014 {
1015         int ret = 0;
1016         int scope = RT_SCOPE_NOWHERE;
1017         
1018         if (force)
1019                 scope = -1;
1020
1021         if (local && fib_info_laddrhash) {
1022                 unsigned int hash = fib_laddr_hashfn(local);
1023                 struct hlist_head *head = &fib_info_laddrhash[hash];
1024                 struct hlist_node *node;
1025                 struct fib_info *fi;
1026
1027                 hlist_for_each_entry(fi, node, head, fib_lhash) {
1028                         if (fi->fib_prefsrc == local) {
1029                                 fi->fib_flags |= RTNH_F_DEAD;
1030                                 ret++;
1031                         }
1032                 }
1033         }
1034
1035         if (dev) {
1036                 struct fib_info *prev_fi = NULL;
1037                 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1038                 struct hlist_head *head = &fib_info_devhash[hash];
1039                 struct hlist_node *node;
1040                 struct fib_nh *nh;
1041
1042                 hlist_for_each_entry(nh, node, head, nh_hash) {
1043                         struct fib_info *fi = nh->nh_parent;
1044                         int dead;
1045
1046                         BUG_ON(!fi->fib_nhs);
1047                         if (nh->nh_dev != dev || fi == prev_fi)
1048                                 continue;
1049                         prev_fi = fi;
1050                         dead = 0;
1051                         change_nexthops(fi) {
1052                                 if (nh->nh_flags&RTNH_F_DEAD)
1053                                         dead++;
1054                                 else if (nh->nh_dev == dev &&
1055                                          nh->nh_scope != scope) {
1056                                         nh->nh_flags |= RTNH_F_DEAD;
1057 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1058                                         spin_lock_bh(&fib_multipath_lock);
1059                                         fi->fib_power -= nh->nh_power;
1060                                         nh->nh_power = 0;
1061                                         spin_unlock_bh(&fib_multipath_lock);
1062 #endif
1063                                         dead++;
1064                                 }
1065 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1066                                 if (force > 1 && nh->nh_dev == dev) {
1067                                         dead = fi->fib_nhs;
1068                                         break;
1069                                 }
1070 #endif
1071                         } endfor_nexthops(fi)
1072                         if (dead == fi->fib_nhs) {
1073                                 fi->fib_flags |= RTNH_F_DEAD;
1074                                 ret++;
1075                         }
1076                 }
1077         }
1078
1079         return ret;
1080 }
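/*
 * Editorial note: the return value is the number of fib_info entries newly
 * marked RTNH_F_DEAD, which the caller uses to decide whether a table flush
 * is needed.  With force == 0 only nexthops whose scope differs from
 * RT_SCOPE_NOWHERE are killed; force != 0 widens that to every scope, and
 * force > 1 (device unregistration) kills the whole fib_info as soon as any
 * of its nexthops uses the departing device.
 */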
1081
1082 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1083
1084 /*
1085    A dead device goes up. We wake up its dead nexthops.
1086    This only makes sense for multipath routes.
1087  */
1088
1089 int fib_sync_up(struct net_device *dev)
1090 {
1091         struct fib_info *prev_fi;
1092         unsigned int hash;
1093         struct hlist_head *head;
1094         struct hlist_node *node;
1095         struct fib_nh *nh;
1096         int ret;
1097
1098         if (!(dev->flags&IFF_UP))
1099                 return 0;
1100
1101         prev_fi = NULL;
1102         hash = fib_devindex_hashfn(dev->ifindex);
1103         head = &fib_info_devhash[hash];
1104         ret = 0;
1105
1106         hlist_for_each_entry(nh, node, head, nh_hash) {
1107                 struct fib_info *fi = nh->nh_parent;
1108                 int alive;
1109
1110                 BUG_ON(!fi->fib_nhs);
1111                 if (nh->nh_dev != dev || fi == prev_fi)
1112                         continue;
1113
1114                 prev_fi = fi;
1115                 alive = 0;
1116                 change_nexthops(fi) {
1117                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1118                                 alive++;
1119                                 continue;
1120                         }
1121                         if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1122                                 continue;
1123                         if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1124                                 continue;
1125                         alive++;
1126                         spin_lock_bh(&fib_multipath_lock);
1127                         nh->nh_power = 0;
1128                         nh->nh_flags &= ~RTNH_F_DEAD;
1129                         spin_unlock_bh(&fib_multipath_lock);
1130                 } endfor_nexthops(fi)
1131
1132                 if (alive > 0) {
1133                         fi->fib_flags &= ~RTNH_F_DEAD;
1134                         ret++;
1135                 }
1136         }
1137
1138         return ret;
1139 }
1140
1141 /*
1142    The algorithm is suboptimal, but it provides a really
1143    fair weighted route distribution.
1144  */
1145
1146 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1147 {
1148         struct fib_info *fi = res->fi;
1149         int w;
1150
1151         spin_lock_bh(&fib_multipath_lock);
1152         if (fi->fib_power <= 0) {
1153                 int power = 0;
1154                 change_nexthops(fi) {
1155                         if (!(nh->nh_flags&RTNH_F_DEAD)) {
1156                                 power += nh->nh_weight;
1157                                 nh->nh_power = nh->nh_weight;
1158                         }
1159                 } endfor_nexthops(fi);
1160                 fi->fib_power = power;
1161                 if (power <= 0) {
1162                         spin_unlock_bh(&fib_multipath_lock);
1163                         /* Race condition: route has just become dead. */
1164                         res->nh_sel = 0;
1165                         return;
1166                 }
1167         }
1168
1169
1170         /* w should be a random number in [0..fi->fib_power-1];
1171            using jiffies for it is a pretty poor approximation.
1172          */
1173
1174         w = jiffies % fi->fib_power;
1175
1176         change_nexthops(fi) {
1177                 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1178                         if ((w -= nh->nh_power) <= 0) {
1179                                 nh->nh_power--;
1180                                 fi->fib_power--;
1181                                 res->nh_sel = nhsel;
1182                                 spin_unlock_bh(&fib_multipath_lock);
1183                                 return;
1184                         }
1185                 }
1186         } endfor_nexthops(fi);
1187
1188         /* Race condition: route has just become dead. */
1189         res->nh_sel = 0;
1190         spin_unlock_bh(&fib_multipath_lock);
1191 }
1192 #endif
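/*
 * Editorial sketch (not part of the kernel build): a userspace
 * re-implementation of the weighted selection in fib_select_multipath(),
 * with locking omitted and the RTNH_F_DEAD bookkeeping reduced to a plain
 * flag.  It demonstrates that over one "power" refill cycle each live
 * nexthop is chosen exactly weight times, so long-run traffic follows the
 * configured weights.  All demo_* names are made up for illustration.
 */
#include <stdio.h>

#define DEMO_NHS 3

struct demo_nh { int weight; int power; int dead; };

/* Refill "power" from the weights when exhausted, then spend one unit of
 * power on the selected nexthop, mirroring fib_select_multipath(). */
static int demo_select(struct demo_nh *nh, int n, int *fib_power,
		       unsigned long rnd)
{
	int w, i;

	if (*fib_power <= 0) {
		int power = 0;

		for (i = 0; i < n; i++)
			if (!nh[i].dead) {
				power += nh[i].weight;
				nh[i].power = nh[i].weight;
			}
		*fib_power = power;
		if (power <= 0)
			return 0;	/* all nexthops dead */
	}

	w = (int)(rnd % *fib_power);
	for (i = 0; i < n; i++) {
		if (!nh[i].dead && nh[i].power) {
			if ((w -= nh[i].power) <= 0) {
				nh[i].power--;
				(*fib_power)--;
				return i;
			}
		}
	}
	return 0;
}

int main(void)
{
	struct demo_nh nh[DEMO_NHS] = { { 1, 0, 0 }, { 2, 0, 0 }, { 1, 0, 0 } };
	int fib_power = 0, hits[DEMO_NHS] = { 0, 0, 0 };
	unsigned long t;

	for (t = 0; t < 4000; t++)
		hits[demo_select(nh, DEMO_NHS, &fib_power, t)]++;

	printf("hits: %d %d %d (expect ~1:2:1)\n", hits[0], hits[1], hits[2]);
	return 0;
}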