Merge branch 'irq-final-for-linus-v2' of git://git.kernel.org/pub/scm/linux/kernel...
[pandora-kernel.git] / net / ipv4 / fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *              This program is free software; you can redistribute it and/or
11  *              modify it under the terms of the GNU General Public License
12  *              as published by the Free Software Foundation; either version
13  *              2 of the License, or (at your option) any later version.
14  */
15
16 #include <asm/uaccess.h>
17 #include <asm/system.h>
18 #include <linux/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/jiffies.h>
22 #include <linux/mm.h>
23 #include <linux/string.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/in.h>
28 #include <linux/inet.h>
29 #include <linux/inetdevice.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/proc_fs.h>
33 #include <linux/skbuff.h>
34 #include <linux/init.h>
35 #include <linux/slab.h>
36
37 #include <net/arp.h>
38 #include <net/ip.h>
39 #include <net/protocol.h>
40 #include <net/route.h>
41 #include <net/tcp.h>
42 #include <net/sock.h>
43 #include <net/ip_fib.h>
44 #include <net/netlink.h>
45 #include <net/nexthop.h>
46
47 #include "fib_lookup.h"
48
49 static DEFINE_SPINLOCK(fib_info_lock);
50 static struct hlist_head *fib_info_hash;
51 static struct hlist_head *fib_info_laddrhash;
52 static unsigned int fib_info_hash_size;
53 static unsigned int fib_info_cnt;
54
55 #define DEVINDEX_HASHBITS 8
56 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
57 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
58
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

/*
 * Open a block that visits every nexthop of @fi read-only.  Inside the
 * loop body "nh" is the current nexthop and "nhsel" its index.  The block
 * must be closed with endfor_nexthops().
 */
#define for_nexthops(fi) {                                              \
        int nhsel = 0;                                                  \
        const struct fib_nh *nh = (fi)->fib_nh;                         \
        for (; nhsel < (fi)->fib_nhs; nh++, nhsel++)

/*
 * Same walk as for_nexthops(), but the current nexthop is exposed as the
 * writable pointer "nexthop_nh".
 */
#define change_nexthops(fi) {                                           \
        int nhsel = 0;                                                  \
        struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);    \
        for (; nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/*
 * Without multipath only the first nexthop is visited, so the loop runs
 * exactly once.
 * Hope, that gcc will optimize it to get rid of dummy loop
 */

#define for_nexthops(fi) {                                              \
        const struct fib_nh *nh = (fi)->fib_nh;                         \
        int nhsel;                                                      \
        for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {                                           \
        struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);    \
        int nhsel;                                                      \
        for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Close the block opened by for_nexthops() or change_nexthops(). */
#define endfor_nexthops(fi) }
91
92
93 const struct fib_prop fib_props[RTN_MAX + 1] = {
94         [RTN_UNSPEC] = {
95                 .error  = 0,
96                 .scope  = RT_SCOPE_NOWHERE,
97         },
98         [RTN_UNICAST] = {
99                 .error  = 0,
100                 .scope  = RT_SCOPE_UNIVERSE,
101         },
102         [RTN_LOCAL] = {
103                 .error  = 0,
104                 .scope  = RT_SCOPE_HOST,
105         },
106         [RTN_BROADCAST] = {
107                 .error  = 0,
108                 .scope  = RT_SCOPE_LINK,
109         },
110         [RTN_ANYCAST] = {
111                 .error  = 0,
112                 .scope  = RT_SCOPE_LINK,
113         },
114         [RTN_MULTICAST] = {
115                 .error  = 0,
116                 .scope  = RT_SCOPE_UNIVERSE,
117         },
118         [RTN_BLACKHOLE] = {
119                 .error  = -EINVAL,
120                 .scope  = RT_SCOPE_UNIVERSE,
121         },
122         [RTN_UNREACHABLE] = {
123                 .error  = -EHOSTUNREACH,
124                 .scope  = RT_SCOPE_UNIVERSE,
125         },
126         [RTN_PROHIBIT] = {
127                 .error  = -EACCES,
128                 .scope  = RT_SCOPE_UNIVERSE,
129         },
130         [RTN_THROW] = {
131                 .error  = -EAGAIN,
132                 .scope  = RT_SCOPE_UNIVERSE,
133         },
134         [RTN_NAT] = {
135                 .error  = -EINVAL,
136                 .scope  = RT_SCOPE_NOWHERE,
137         },
138         [RTN_XRESOLVE] = {
139                 .error  = -EINVAL,
140                 .scope  = RT_SCOPE_NOWHERE,
141         },
142 };
143
144
145 /* Release a nexthop info record */
146
147 static void free_fib_info_rcu(struct rcu_head *head)
148 {
149         struct fib_info *fi = container_of(head, struct fib_info, rcu);
150
151         if (fi->fib_metrics != (u32 *) dst_default_metrics)
152                 kfree(fi->fib_metrics);
153         kfree(fi);
154 }
155
156 void free_fib_info(struct fib_info *fi)
157 {
158         if (fi->fib_dead == 0) {
159                 pr_warning("Freeing alive fib_info %p\n", fi);
160                 return;
161         }
162         change_nexthops(fi) {
163                 if (nexthop_nh->nh_dev)
164                         dev_put(nexthop_nh->nh_dev);
165                 nexthop_nh->nh_dev = NULL;
166         } endfor_nexthops(fi);
167         fib_info_cnt--;
168         release_net(fi->fib_net);
169         call_rcu(&fi->rcu, free_fib_info_rcu);
170 }
171
172 void fib_release_info(struct fib_info *fi)
173 {
174         spin_lock_bh(&fib_info_lock);
175         if (fi && --fi->fib_treeref == 0) {
176                 hlist_del(&fi->fib_hash);
177                 if (fi->fib_prefsrc)
178                         hlist_del(&fi->fib_lhash);
179                 change_nexthops(fi) {
180                         if (!nexthop_nh->nh_dev)
181                                 continue;
182                         hlist_del(&nexthop_nh->nh_hash);
183                 } endfor_nexthops(fi)
184                 fi->fib_dead = 1;
185                 fib_info_put(fi);
186         }
187         spin_unlock_bh(&fib_info_lock);
188 }
189
190 static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
191 {
192         const struct fib_nh *onh = ofi->fib_nh;
193
194         for_nexthops(fi) {
195                 if (nh->nh_oif != onh->nh_oif ||
196                     nh->nh_gw  != onh->nh_gw ||
197                     nh->nh_scope != onh->nh_scope ||
198 #ifdef CONFIG_IP_ROUTE_MULTIPATH
199                     nh->nh_weight != onh->nh_weight ||
200 #endif
201 #ifdef CONFIG_IP_ROUTE_CLASSID
202                     nh->nh_tclassid != onh->nh_tclassid ||
203 #endif
204                     ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
205                         return -1;
206                 onh++;
207         } endfor_nexthops(fi);
208         return 0;
209 }
210
211 static inline unsigned int fib_devindex_hashfn(unsigned int val)
212 {
213         unsigned int mask = DEVINDEX_HASHSIZE - 1;
214
215         return (val ^
216                 (val >> DEVINDEX_HASHBITS) ^
217                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
218 }
219
220 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
221 {
222         unsigned int mask = (fib_info_hash_size - 1);
223         unsigned int val = fi->fib_nhs;
224
225         val ^= (fi->fib_protocol << 8) | fi->fib_scope;
226         val ^= (__force u32)fi->fib_prefsrc;
227         val ^= fi->fib_priority;
228         for_nexthops(fi) {
229                 val ^= fib_devindex_hashfn(nh->nh_oif);
230         } endfor_nexthops(fi)
231
232         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
233 }
234
235 static struct fib_info *fib_find_info(const struct fib_info *nfi)
236 {
237         struct hlist_head *head;
238         struct hlist_node *node;
239         struct fib_info *fi;
240         unsigned int hash;
241
242         hash = fib_info_hashfn(nfi);
243         head = &fib_info_hash[hash];
244
245         hlist_for_each_entry(fi, node, head, fib_hash) {
246                 if (!net_eq(fi->fib_net, nfi->fib_net))
247                         continue;
248                 if (fi->fib_nhs != nfi->fib_nhs)
249                         continue;
250                 if (nfi->fib_protocol == fi->fib_protocol &&
251                     nfi->fib_scope == fi->fib_scope &&
252                     nfi->fib_prefsrc == fi->fib_prefsrc &&
253                     nfi->fib_priority == fi->fib_priority &&
254                     memcmp(nfi->fib_metrics, fi->fib_metrics,
255                            sizeof(u32) * RTAX_MAX) == 0 &&
256                     ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
257                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
258                         return fi;
259         }
260
261         return NULL;
262 }
263
264 /* Check, that the gateway is already configured.
265  * Used only by redirect accept routine.
266  */
267 int ip_fib_check_default(__be32 gw, struct net_device *dev)
268 {
269         struct hlist_head *head;
270         struct hlist_node *node;
271         struct fib_nh *nh;
272         unsigned int hash;
273
274         spin_lock(&fib_info_lock);
275
276         hash = fib_devindex_hashfn(dev->ifindex);
277         head = &fib_info_devhash[hash];
278         hlist_for_each_entry(nh, node, head, nh_hash) {
279                 if (nh->nh_dev == dev &&
280                     nh->nh_gw == gw &&
281                     !(nh->nh_flags & RTNH_F_DEAD)) {
282                         spin_unlock(&fib_info_lock);
283                         return 0;
284                 }
285         }
286
287         spin_unlock(&fib_info_lock);
288
289         return -1;
290 }
291
292 static inline size_t fib_nlmsg_size(struct fib_info *fi)
293 {
294         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
295                          + nla_total_size(4) /* RTA_TABLE */
296                          + nla_total_size(4) /* RTA_DST */
297                          + nla_total_size(4) /* RTA_PRIORITY */
298                          + nla_total_size(4); /* RTA_PREFSRC */
299
300         /* space for nested metrics */
301         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
302
303         if (fi->fib_nhs) {
304                 /* Also handles the special case fib_nhs == 1 */
305
306                 /* each nexthop is packed in an attribute */
307                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
308
309                 /* may contain flow and gateway attribute */
310                 nhsize += 2 * nla_total_size(4);
311
312                 /* all nexthops are packed in a nested attribute */
313                 payload += nla_total_size(fi->fib_nhs * nhsize);
314         }
315
316         return payload;
317 }
318
319 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
320                int dst_len, u32 tb_id, struct nl_info *info,
321                unsigned int nlm_flags)
322 {
323         struct sk_buff *skb;
324         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
325         int err = -ENOBUFS;
326
327         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
328         if (skb == NULL)
329                 goto errout;
330
331         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
332                             fa->fa_type, key, dst_len,
333                             fa->fa_tos, fa->fa_info, nlm_flags);
334         if (err < 0) {
335                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
336                 WARN_ON(err == -EMSGSIZE);
337                 kfree_skb(skb);
338                 goto errout;
339         }
340         rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
341                     info->nlh, GFP_KERNEL);
342         return;
343 errout:
344         if (err < 0)
345                 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
346 }
347
348 /* Return the first fib alias matching TOS with
349  * priority less than or equal to PRIO.
350  */
351 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
352 {
353         if (fah) {
354                 struct fib_alias *fa;
355                 list_for_each_entry(fa, fah, fa_list) {
356                         if (fa->fa_tos > tos)
357                                 continue;
358                         if (fa->fa_info->fib_priority >= prio ||
359                             fa->fa_tos < tos)
360                                 return fa;
361                 }
362         }
363         return NULL;
364 }
365
366 int fib_detect_death(struct fib_info *fi, int order,
367                      struct fib_info **last_resort, int *last_idx, int dflt)
368 {
369         struct neighbour *n;
370         int state = NUD_NONE;
371
372         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
373         if (n) {
374                 state = n->nud_state;
375                 neigh_release(n);
376         }
377         if (state == NUD_REACHABLE)
378                 return 0;
379         if ((state & NUD_VALID) && order != dflt)
380                 return 0;
381         if ((state & NUD_VALID) ||
382             (*last_idx < 0 && order > dflt)) {
383                 *last_resort = fi;
384                 *last_idx = order;
385         }
386         return 1;
387 }
388
389 #ifdef CONFIG_IP_ROUTE_MULTIPATH
390
/*
 * Count the rtnexthop entries in a buffer of @remaining bytes starting
 * at @rtnh.  Returns 0 when bytes are left over after the last entry.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
        int nhs;

        for (nhs = 0; rtnh_ok(rtnh, remaining); nhs++)
                rtnh = rtnh_next(rtnh, &remaining);

        /* leftover implies invalid nexthop configuration, discard it */
        if (remaining > 0)
                return 0;

        return nhs;
}
403
404 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
405                        int remaining, struct fib_config *cfg)
406 {
407         change_nexthops(fi) {
408                 int attrlen;
409
410                 if (!rtnh_ok(rtnh, remaining))
411                         return -EINVAL;
412
413                 nexthop_nh->nh_flags =
414                         (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
415                 nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
416                 nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
417
418                 attrlen = rtnh_attrlen(rtnh);
419                 if (attrlen > 0) {
420                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
421
422                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
423                         nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
424 #ifdef CONFIG_IP_ROUTE_CLASSID
425                         nla = nla_find(attrs, attrlen, RTA_FLOW);
426                         nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
427 #endif
428                 }
429
430                 rtnh = rtnh_next(rtnh, &remaining);
431         } endfor_nexthops(fi);
432
433         return 0;
434 }
435
436 #endif
437
438 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
439 {
440 #ifdef CONFIG_IP_ROUTE_MULTIPATH
441         struct rtnexthop *rtnh;
442         int remaining;
443 #endif
444
445         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
446                 return 1;
447
448         if (cfg->fc_oif || cfg->fc_gw) {
449                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
450                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
451                         return 0;
452                 return 1;
453         }
454
455 #ifdef CONFIG_IP_ROUTE_MULTIPATH
456         if (cfg->fc_mp == NULL)
457                 return 0;
458
459         rtnh = cfg->fc_mp;
460         remaining = cfg->fc_mp_len;
461
462         for_nexthops(fi) {
463                 int attrlen;
464
465                 if (!rtnh_ok(rtnh, remaining))
466                         return -EINVAL;
467
468                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
469                         return 1;
470
471                 attrlen = rtnh_attrlen(rtnh);
472                 if (attrlen < 0) {
473                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
474
475                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
476                         if (nla && nla_get_be32(nla) != nh->nh_gw)
477                                 return 1;
478 #ifdef CONFIG_IP_ROUTE_CLASSID
479                         nla = nla_find(attrs, attrlen, RTA_FLOW);
480                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
481                                 return 1;
482 #endif
483                 }
484
485                 rtnh = rtnh_next(rtnh, &remaining);
486         } endfor_nexthops(fi);
487 #endif
488         return 0;
489 }
490
491
492 /*
493  * Picture
494  * -------
495  *
496  * Semantics of nexthop is very messy by historical reasons.
497  * We have to take into account, that:
498  * a) gateway can be actually local interface address,
499  *    so that gatewayed route is direct.
500  * b) gateway must be on-link address, possibly
501  *    described not by an ifaddr, but also by a direct route.
502  * c) If both gateway and interface are specified, they should not
503  *    contradict.
504  * d) If we use tunnel routes, gateway could be not on-link.
505  *
506  * Attempt to reconcile all of these (alas, self-contradictory) conditions
507  * results in pretty ugly and hairy code with obscure logic.
508  *
 * I chose to generalize it instead, so that the size
 * of the code practically does not increase, but it becomes
 * much more general.
512  * Every prefix is assigned a "scope" value: "host" is local address,
513  * "link" is direct route,
514  * [ ... "site" ... "interior" ... ]
515  * and "universe" is true gateway route with global meaning.
516  *
517  * Every prefix refers to a set of "nexthop"s (gw, oif),
518  * where gw must have narrower scope. This recursion stops
519  * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
520  * which means that gw is forced to be on link.
521  *
 * Code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes
 * to co-exist in peace.
526  *
527  * Normally it looks as following.
528  *
529  * {universe prefix}  -> (gw, oif) [scope link]
530  *                |
531  *                |-> {link prefix} -> (gw, oif) [scope local]
532  *                                      |
533  *                                      |-> {local prefix} (terminal node)
534  */
535 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
536                         struct fib_nh *nh)
537 {
538         int err;
539         struct net *net;
540         struct net_device *dev;
541
542         net = cfg->fc_nlinfo.nl_net;
543         if (nh->nh_gw) {
544                 struct fib_result res;
545
546                 if (nh->nh_flags & RTNH_F_ONLINK) {
547
548                         if (cfg->fc_scope >= RT_SCOPE_LINK)
549                                 return -EINVAL;
550                         if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
551                                 return -EINVAL;
552                         dev = __dev_get_by_index(net, nh->nh_oif);
553                         if (!dev)
554                                 return -ENODEV;
555                         if (!(dev->flags & IFF_UP))
556                                 return -ENETDOWN;
557                         nh->nh_dev = dev;
558                         dev_hold(dev);
559                         nh->nh_scope = RT_SCOPE_LINK;
560                         return 0;
561                 }
562                 rcu_read_lock();
563                 {
564                         struct flowi4 fl4 = {
565                                 .daddr = nh->nh_gw,
566                                 .flowi4_scope = cfg->fc_scope + 1,
567                                 .flowi4_oif = nh->nh_oif,
568                         };
569
570                         /* It is not necessary, but requires a bit of thinking */
571                         if (fl4.flowi4_scope < RT_SCOPE_LINK)
572                                 fl4.flowi4_scope = RT_SCOPE_LINK;
573                         err = fib_lookup(net, &fl4, &res);
574                         if (err) {
575                                 rcu_read_unlock();
576                                 return err;
577                         }
578                 }
579                 err = -EINVAL;
580                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
581                         goto out;
582                 nh->nh_scope = res.scope;
583                 nh->nh_oif = FIB_RES_OIF(res);
584                 nh->nh_dev = dev = FIB_RES_DEV(res);
585                 if (!dev)
586                         goto out;
587                 dev_hold(dev);
588                 err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
589         } else {
590                 struct in_device *in_dev;
591
592                 if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
593                         return -EINVAL;
594
595                 rcu_read_lock();
596                 err = -ENODEV;
597                 in_dev = inetdev_by_index(net, nh->nh_oif);
598                 if (in_dev == NULL)
599                         goto out;
600                 err = -ENETDOWN;
601                 if (!(in_dev->dev->flags & IFF_UP))
602                         goto out;
603                 nh->nh_dev = in_dev->dev;
604                 dev_hold(nh->nh_dev);
605                 nh->nh_scope = RT_SCOPE_HOST;
606                 err = 0;
607         }
608 out:
609         rcu_read_unlock();
610         return err;
611 }
612
613 static inline unsigned int fib_laddr_hashfn(__be32 val)
614 {
615         unsigned int mask = (fib_info_hash_size - 1);
616
617         return ((__force u32)val ^
618                 ((__force u32)val >> 7) ^
619                 ((__force u32)val >> 14)) & mask;
620 }
621
622 static struct hlist_head *fib_info_hash_alloc(int bytes)
623 {
624         if (bytes <= PAGE_SIZE)
625                 return kzalloc(bytes, GFP_KERNEL);
626         else
627                 return (struct hlist_head *)
628                         __get_free_pages(GFP_KERNEL | __GFP_ZERO,
629                                          get_order(bytes));
630 }
631
632 static void fib_info_hash_free(struct hlist_head *hash, int bytes)
633 {
634         if (!hash)
635                 return;
636
637         if (bytes <= PAGE_SIZE)
638                 kfree(hash);
639         else
640                 free_pages((unsigned long) hash, get_order(bytes));
641 }
642
643 static void fib_info_hash_move(struct hlist_head *new_info_hash,
644                                struct hlist_head *new_laddrhash,
645                                unsigned int new_size)
646 {
647         struct hlist_head *old_info_hash, *old_laddrhash;
648         unsigned int old_size = fib_info_hash_size;
649         unsigned int i, bytes;
650
651         spin_lock_bh(&fib_info_lock);
652         old_info_hash = fib_info_hash;
653         old_laddrhash = fib_info_laddrhash;
654         fib_info_hash_size = new_size;
655
656         for (i = 0; i < old_size; i++) {
657                 struct hlist_head *head = &fib_info_hash[i];
658                 struct hlist_node *node, *n;
659                 struct fib_info *fi;
660
661                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
662                         struct hlist_head *dest;
663                         unsigned int new_hash;
664
665                         hlist_del(&fi->fib_hash);
666
667                         new_hash = fib_info_hashfn(fi);
668                         dest = &new_info_hash[new_hash];
669                         hlist_add_head(&fi->fib_hash, dest);
670                 }
671         }
672         fib_info_hash = new_info_hash;
673
674         for (i = 0; i < old_size; i++) {
675                 struct hlist_head *lhead = &fib_info_laddrhash[i];
676                 struct hlist_node *node, *n;
677                 struct fib_info *fi;
678
679                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
680                         struct hlist_head *ldest;
681                         unsigned int new_hash;
682
683                         hlist_del(&fi->fib_lhash);
684
685                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
686                         ldest = &new_laddrhash[new_hash];
687                         hlist_add_head(&fi->fib_lhash, ldest);
688                 }
689         }
690         fib_info_laddrhash = new_laddrhash;
691
692         spin_unlock_bh(&fib_info_lock);
693
694         bytes = old_size * sizeof(struct hlist_head *);
695         fib_info_hash_free(old_info_hash, bytes);
696         fib_info_hash_free(old_laddrhash, bytes);
697 }
698
699 __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
700 {
701         nh->nh_saddr = inet_select_addr(nh->nh_dev,
702                                         nh->nh_gw,
703                                         nh->nh_parent->fib_scope);
704         nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
705
706         return nh->nh_saddr;
707 }
708
709 struct fib_info *fib_create_info(struct fib_config *cfg)
710 {
711         int err;
712         struct fib_info *fi = NULL;
713         struct fib_info *ofi;
714         int nhs = 1;
715         struct net *net = cfg->fc_nlinfo.nl_net;
716
717         if (cfg->fc_type > RTN_MAX)
718                 goto err_inval;
719
720         /* Fast check to catch the most weird cases */
721         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
722                 goto err_inval;
723
724 #ifdef CONFIG_IP_ROUTE_MULTIPATH
725         if (cfg->fc_mp) {
726                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
727                 if (nhs == 0)
728                         goto err_inval;
729         }
730 #endif
731
732         err = -ENOBUFS;
733         if (fib_info_cnt >= fib_info_hash_size) {
734                 unsigned int new_size = fib_info_hash_size << 1;
735                 struct hlist_head *new_info_hash;
736                 struct hlist_head *new_laddrhash;
737                 unsigned int bytes;
738
739                 if (!new_size)
740                         new_size = 1;
741                 bytes = new_size * sizeof(struct hlist_head *);
742                 new_info_hash = fib_info_hash_alloc(bytes);
743                 new_laddrhash = fib_info_hash_alloc(bytes);
744                 if (!new_info_hash || !new_laddrhash) {
745                         fib_info_hash_free(new_info_hash, bytes);
746                         fib_info_hash_free(new_laddrhash, bytes);
747                 } else
748                         fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
749
750                 if (!fib_info_hash_size)
751                         goto failure;
752         }
753
754         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
755         if (fi == NULL)
756                 goto failure;
757         if (cfg->fc_mx) {
758                 fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
759                 if (!fi->fib_metrics)
760                         goto failure;
761         } else
762                 fi->fib_metrics = (u32 *) dst_default_metrics;
763         fib_info_cnt++;
764
765         fi->fib_net = hold_net(net);
766         fi->fib_protocol = cfg->fc_protocol;
767         fi->fib_scope = cfg->fc_scope;
768         fi->fib_flags = cfg->fc_flags;
769         fi->fib_priority = cfg->fc_priority;
770         fi->fib_prefsrc = cfg->fc_prefsrc;
771
772         fi->fib_nhs = nhs;
773         change_nexthops(fi) {
774                 nexthop_nh->nh_parent = fi;
775         } endfor_nexthops(fi)
776
777         if (cfg->fc_mx) {
778                 struct nlattr *nla;
779                 int remaining;
780
781                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
782                         int type = nla_type(nla);
783
784                         if (type) {
785                                 if (type > RTAX_MAX)
786                                         goto err_inval;
787                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
788                         }
789                 }
790         }
791
792         if (cfg->fc_mp) {
793 #ifdef CONFIG_IP_ROUTE_MULTIPATH
794                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
795                 if (err != 0)
796                         goto failure;
797                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
798                         goto err_inval;
799                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
800                         goto err_inval;
801 #ifdef CONFIG_IP_ROUTE_CLASSID
802                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
803                         goto err_inval;
804 #endif
805 #else
806                 goto err_inval;
807 #endif
808         } else {
809                 struct fib_nh *nh = fi->fib_nh;
810
811                 nh->nh_oif = cfg->fc_oif;
812                 nh->nh_gw = cfg->fc_gw;
813                 nh->nh_flags = cfg->fc_flags;
814 #ifdef CONFIG_IP_ROUTE_CLASSID
815                 nh->nh_tclassid = cfg->fc_flow;
816 #endif
817 #ifdef CONFIG_IP_ROUTE_MULTIPATH
818                 nh->nh_weight = 1;
819 #endif
820         }
821
822         if (fib_props[cfg->fc_type].error) {
823                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
824                         goto err_inval;
825                 goto link_it;
826         } else {
827                 switch (cfg->fc_type) {
828                 case RTN_UNICAST:
829                 case RTN_LOCAL:
830                 case RTN_BROADCAST:
831                 case RTN_ANYCAST:
832                 case RTN_MULTICAST:
833                         break;
834                 default:
835                         goto err_inval;
836                 }
837         }
838
839         if (cfg->fc_scope > RT_SCOPE_HOST)
840                 goto err_inval;
841
842         if (cfg->fc_scope == RT_SCOPE_HOST) {
843                 struct fib_nh *nh = fi->fib_nh;
844
845                 /* Local address is added. */
846                 if (nhs != 1 || nh->nh_gw)
847                         goto err_inval;
848                 nh->nh_scope = RT_SCOPE_NOWHERE;
849                 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
850                 err = -ENODEV;
851                 if (nh->nh_dev == NULL)
852                         goto failure;
853         } else {
854                 change_nexthops(fi) {
855                         err = fib_check_nh(cfg, fi, nexthop_nh);
856                         if (err != 0)
857                                 goto failure;
858                 } endfor_nexthops(fi)
859         }
860
861         if (fi->fib_prefsrc) {
862                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
863                     fi->fib_prefsrc != cfg->fc_dst)
864                         if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
865                                 goto err_inval;
866         }
867
868         change_nexthops(fi) {
869                 fib_info_update_nh_saddr(net, nexthop_nh);
870         } endfor_nexthops(fi)
871
872 link_it:
873         ofi = fib_find_info(fi);
874         if (ofi) {
875                 fi->fib_dead = 1;
876                 free_fib_info(fi);
877                 ofi->fib_treeref++;
878                 return ofi;
879         }
880
881         fi->fib_treeref++;
882         atomic_inc(&fi->fib_clntref);
883         spin_lock_bh(&fib_info_lock);
884         hlist_add_head(&fi->fib_hash,
885                        &fib_info_hash[fib_info_hashfn(fi)]);
886         if (fi->fib_prefsrc) {
887                 struct hlist_head *head;
888
889                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
890                 hlist_add_head(&fi->fib_lhash, head);
891         }
892         change_nexthops(fi) {
893                 struct hlist_head *head;
894                 unsigned int hash;
895
896                 if (!nexthop_nh->nh_dev)
897                         continue;
898                 hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
899                 head = &fib_info_devhash[hash];
900                 hlist_add_head(&nexthop_nh->nh_hash, head);
901         } endfor_nexthops(fi)
902         spin_unlock_bh(&fib_info_lock);
903         return fi;
904
905 err_inval:
906         err = -EINVAL;
907
908 failure:
909         if (fi) {
910                 fi->fib_dead = 1;
911                 free_fib_info(fi);
912         }
913
914         return ERR_PTR(err);
915 }
916
917 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
918                   u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
919                   struct fib_info *fi, unsigned int flags)
920 {
921         struct nlmsghdr *nlh;
922         struct rtmsg *rtm;
923
924         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
925         if (nlh == NULL)
926                 return -EMSGSIZE;
927
928         rtm = nlmsg_data(nlh);
929         rtm->rtm_family = AF_INET;
930         rtm->rtm_dst_len = dst_len;
931         rtm->rtm_src_len = 0;
932         rtm->rtm_tos = tos;
933         if (tb_id < 256)
934                 rtm->rtm_table = tb_id;
935         else
936                 rtm->rtm_table = RT_TABLE_COMPAT;
937         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
938         rtm->rtm_type = type;
939         rtm->rtm_flags = fi->fib_flags;
940         rtm->rtm_scope = fi->fib_scope;
941         rtm->rtm_protocol = fi->fib_protocol;
942
943         if (rtm->rtm_dst_len)
944                 NLA_PUT_BE32(skb, RTA_DST, dst);
945
946         if (fi->fib_priority)
947                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
948
949         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
950                 goto nla_put_failure;
951
952         if (fi->fib_prefsrc)
953                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
954
955         if (fi->fib_nhs == 1) {
956                 if (fi->fib_nh->nh_gw)
957                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
958
959                 if (fi->fib_nh->nh_oif)
960                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
961 #ifdef CONFIG_IP_ROUTE_CLASSID
962                 if (fi->fib_nh[0].nh_tclassid)
963                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
964 #endif
965         }
966 #ifdef CONFIG_IP_ROUTE_MULTIPATH
967         if (fi->fib_nhs > 1) {
968                 struct rtnexthop *rtnh;
969                 struct nlattr *mp;
970
971                 mp = nla_nest_start(skb, RTA_MULTIPATH);
972                 if (mp == NULL)
973                         goto nla_put_failure;
974
975                 for_nexthops(fi) {
976                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
977                         if (rtnh == NULL)
978                                 goto nla_put_failure;
979
980                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
981                         rtnh->rtnh_hops = nh->nh_weight - 1;
982                         rtnh->rtnh_ifindex = nh->nh_oif;
983
984                         if (nh->nh_gw)
985                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
986 #ifdef CONFIG_IP_ROUTE_CLASSID
987                         if (nh->nh_tclassid)
988                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
989 #endif
990                         /* length of rtnetlink header + attributes */
991                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
992                 } endfor_nexthops(fi);
993
994                 nla_nest_end(skb, mp);
995         }
996 #endif
997         return nlmsg_end(skb, nlh);
998
999 nla_put_failure:
1000         nlmsg_cancel(skb, nlh);
1001         return -EMSGSIZE;
1002 }
1003
1004 /*
1005  * Update FIB if:
1006  * - local address disappeared -> we must delete all the entries
1007  *   referring to it.
1008  * - device went down -> we must shutdown all nexthops going via it.
1009  */
1010 int fib_sync_down_addr(struct net *net, __be32 local)
1011 {
1012         int ret = 0;
1013         unsigned int hash = fib_laddr_hashfn(local);
1014         struct hlist_head *head = &fib_info_laddrhash[hash];
1015         struct hlist_node *node;
1016         struct fib_info *fi;
1017
1018         if (fib_info_laddrhash == NULL || local == 0)
1019                 return 0;
1020
1021         hlist_for_each_entry(fi, node, head, fib_lhash) {
1022                 if (!net_eq(fi->fib_net, net))
1023                         continue;
1024                 if (fi->fib_prefsrc == local) {
1025                         fi->fib_flags |= RTNH_F_DEAD;
1026                         ret++;
1027                 }
1028         }
1029         return ret;
1030 }
1031
1032 int fib_sync_down_dev(struct net_device *dev, int force)
1033 {
1034         int ret = 0;
1035         int scope = RT_SCOPE_NOWHERE;
1036         struct fib_info *prev_fi = NULL;
1037         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1038         struct hlist_head *head = &fib_info_devhash[hash];
1039         struct hlist_node *node;
1040         struct fib_nh *nh;
1041
1042         if (force)
1043                 scope = -1;
1044
1045         hlist_for_each_entry(nh, node, head, nh_hash) {
1046                 struct fib_info *fi = nh->nh_parent;
1047                 int dead;
1048
1049                 BUG_ON(!fi->fib_nhs);
1050                 if (nh->nh_dev != dev || fi == prev_fi)
1051                         continue;
1052                 prev_fi = fi;
1053                 dead = 0;
1054                 change_nexthops(fi) {
1055                         if (nexthop_nh->nh_flags & RTNH_F_DEAD)
1056                                 dead++;
1057                         else if (nexthop_nh->nh_dev == dev &&
1058                                  nexthop_nh->nh_scope != scope) {
1059                                 nexthop_nh->nh_flags |= RTNH_F_DEAD;
1060 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1061                                 spin_lock_bh(&fib_multipath_lock);
1062                                 fi->fib_power -= nexthop_nh->nh_power;
1063                                 nexthop_nh->nh_power = 0;
1064                                 spin_unlock_bh(&fib_multipath_lock);
1065 #endif
1066                                 dead++;
1067                         }
1068 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1069                         if (force > 1 && nexthop_nh->nh_dev == dev) {
1070                                 dead = fi->fib_nhs;
1071                                 break;
1072                         }
1073 #endif
1074                 } endfor_nexthops(fi)
1075                 if (dead == fi->fib_nhs) {
1076                         fi->fib_flags |= RTNH_F_DEAD;
1077                         ret++;
1078                 }
1079         }
1080
1081         return ret;
1082 }
1083
1084 /* Must be invoked inside of an RCU protected region.  */
1085 void fib_select_default(struct fib_result *res)
1086 {
1087         struct fib_info *fi = NULL, *last_resort = NULL;
1088         struct list_head *fa_head = res->fa_head;
1089         struct fib_table *tb = res->table;
1090         int order = -1, last_idx = -1;
1091         struct fib_alias *fa;
1092
1093         list_for_each_entry_rcu(fa, fa_head, fa_list) {
1094                 struct fib_info *next_fi = fa->fa_info;
1095
1096                 if (next_fi->fib_scope != res->scope ||
1097                     fa->fa_type != RTN_UNICAST)
1098                         continue;
1099
1100                 if (next_fi->fib_priority > res->fi->fib_priority)
1101                         break;
1102                 if (!next_fi->fib_nh[0].nh_gw ||
1103                     next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1104                         continue;
1105
1106                 fib_alias_accessed(fa);
1107
1108                 if (fi == NULL) {
1109                         if (next_fi != res->fi)
1110                                 break;
1111                 } else if (!fib_detect_death(fi, order, &last_resort,
1112                                              &last_idx, tb->tb_default)) {
1113                         fib_result_assign(res, fi);
1114                         tb->tb_default = order;
1115                         goto out;
1116                 }
1117                 fi = next_fi;
1118                 order++;
1119         }
1120
1121         if (order <= 0 || fi == NULL) {
1122                 tb->tb_default = -1;
1123                 goto out;
1124         }
1125
1126         if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1127                                 tb->tb_default)) {
1128                 fib_result_assign(res, fi);
1129                 tb->tb_default = order;
1130                 goto out;
1131         }
1132
1133         if (last_idx >= 0)
1134                 fib_result_assign(res, last_resort);
1135         tb->tb_default = last_idx;
1136 out:
1137         return;
1138 }
1139
1140 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1141
1142 /*
1143  * Dead device goes up. We wake up dead nexthops.
1144  * It takes sense only on multipath routes.
1145  */
1146 int fib_sync_up(struct net_device *dev)
1147 {
1148         struct fib_info *prev_fi;
1149         unsigned int hash;
1150         struct hlist_head *head;
1151         struct hlist_node *node;
1152         struct fib_nh *nh;
1153         int ret;
1154
1155         if (!(dev->flags & IFF_UP))
1156                 return 0;
1157
1158         prev_fi = NULL;
1159         hash = fib_devindex_hashfn(dev->ifindex);
1160         head = &fib_info_devhash[hash];
1161         ret = 0;
1162
1163         hlist_for_each_entry(nh, node, head, nh_hash) {
1164                 struct fib_info *fi = nh->nh_parent;
1165                 int alive;
1166
1167                 BUG_ON(!fi->fib_nhs);
1168                 if (nh->nh_dev != dev || fi == prev_fi)
1169                         continue;
1170
1171                 prev_fi = fi;
1172                 alive = 0;
1173                 change_nexthops(fi) {
1174                         if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1175                                 alive++;
1176                                 continue;
1177                         }
1178                         if (nexthop_nh->nh_dev == NULL ||
1179                             !(nexthop_nh->nh_dev->flags & IFF_UP))
1180                                 continue;
1181                         if (nexthop_nh->nh_dev != dev ||
1182                             !__in_dev_get_rtnl(dev))
1183                                 continue;
1184                         alive++;
1185                         spin_lock_bh(&fib_multipath_lock);
1186                         nexthop_nh->nh_power = 0;
1187                         nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
1188                         spin_unlock_bh(&fib_multipath_lock);
1189                 } endfor_nexthops(fi)
1190
1191                 if (alive > 0) {
1192                         fi->fib_flags &= ~RTNH_F_DEAD;
1193                         ret++;
1194                 }
1195         }
1196
1197         return ret;
1198 }
1199
1200 /*
1201  * The algorithm is suboptimal, but it provides really
1202  * fair weighted route distribution.
1203  */
1204 void fib_select_multipath(struct fib_result *res)
1205 {
1206         struct fib_info *fi = res->fi;
1207         int w;
1208
1209         spin_lock_bh(&fib_multipath_lock);
1210         if (fi->fib_power <= 0) {
1211                 int power = 0;
1212                 change_nexthops(fi) {
1213                         if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
1214                                 power += nexthop_nh->nh_weight;
1215                                 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1216                         }
1217                 } endfor_nexthops(fi);
1218                 fi->fib_power = power;
1219                 if (power <= 0) {
1220                         spin_unlock_bh(&fib_multipath_lock);
1221                         /* Race condition: route has just become dead. */
1222                         res->nh_sel = 0;
1223                         return;
1224                 }
1225         }
1226
1227
1228         /* w should be random number [0..fi->fib_power-1],
1229          * it is pretty bad approximation.
1230          */
1231
1232         w = jiffies % fi->fib_power;
1233
1234         change_nexthops(fi) {
1235                 if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
1236                     nexthop_nh->nh_power) {
1237                         w -= nexthop_nh->nh_power;
1238                         if (w <= 0) {
1239                                 nexthop_nh->nh_power--;
1240                                 fi->fib_power--;
1241                                 res->nh_sel = nhsel;
1242                                 spin_unlock_bh(&fib_multipath_lock);
1243                                 return;
1244                         }
1245                 }
1246         } endfor_nexthops(fi);
1247
1248         /* Race condition: route has just become dead. */
1249         res->nh_sel = 0;
1250         spin_unlock_bh(&fib_multipath_lock);
1251 }
1252 #endif