Merge branch 'locks' of git://linux-nfs.org/~bfields/linux
[pandora-kernel.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
/*
 * RT_FL_TOS - extract the routing-relevant TOS bits of a flow key.
 * @oldflp: pointer expression to a flow key providing fl4_tos.
 *
 * Masks fl4_tos with (IPTOS_RT_MASK | RTO_ONLINK) and yields a u32.
 * The argument is parenthesized so that any pointer-valued expression
 * (not only a plain identifier) expands correctly.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
/*
 * Route cache tunables and flush state (times are expressed in jiffies,
 * i.e. multiples of HZ).
 *
 * Used by code visible in this file:
 *   ip_rt_min_delay       - delay substituted by rt_cache_flush() when it
 *                           is called with a negative delay.
 *   ip_rt_max_delay       - rt_cache_flush() sets the flush deadline to
 *                           "now + ip_rt_max_delay".
 *   ip_rt_max_size        - entry-count limit tested by rt_garbage_collect();
 *                           no initializer here, so it starts at 0.
 *   ip_rt_gc_timeout      - base timeout for rt_check_expire() and upper
 *                           bound of the adaptive "expire" value in
 *                           rt_garbage_collect().
 *   ip_rt_gc_interval     - period at which rt_check_expire() reschedules
 *                           itself.
 *   ip_rt_gc_min_interval - minimum spacing between full GC runs.
 *   ip_rt_gc_elasticity   - chain-length / cache-size factor used by
 *                           rt_garbage_collect() and rt_intern_hash().
 *   ip_rt_secret_interval - period of rt_secret_rebuild().
 *   rt_deadline           - absolute deadline of a pending delayed flush;
 *                           0 when none is pending (reset in rt_run_flush()).
 *
 * The remaining variables (redirect, error, mtu, pmtu, advmss) are not
 * referenced in this part of the file.
 */
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_min_delay              = 2 * HZ;
120 static int ip_rt_max_delay              = 10 * HZ;
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
123 static int ip_rt_gc_interval            = 60 * HZ;
124 static int ip_rt_gc_min_interval        = HZ / 2;
125 static int ip_rt_redirect_number        = 9;
126 static int ip_rt_redirect_load          = HZ / 50;
127 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost             = HZ;
129 static int ip_rt_error_burst            = 5 * HZ;
130 static int ip_rt_gc_elasticity          = 8;
131 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
132 static int ip_rt_min_pmtu               = 512 + 20 + 20;
133 static int ip_rt_min_advmss             = 256;
134 static int ip_rt_secret_interval        = 10 * 60 * HZ;
135 static unsigned long rt_deadline;
136
/* Debug print helper: forwards its arguments to printk() at KERN_DEBUG. */
137 #define RTprint(a...)   printk(KERN_DEBUG a)
138
/* Timer armed by rt_cache_flush() for deferred cache flushes. */
139 static struct timer_list rt_flush_timer;
140 static void rt_check_expire(struct work_struct *work);
/* Delayed work item that runs rt_check_expire(); it re-queues itself. */
141 static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
/* Timer re-armed by rt_secret_rebuild() every ip_rt_secret_interval. */
142 static struct timer_list rt_secret_timer;
143
144 /*
145  *      Interface to generic destination cache.
146  */
147
148 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
149 static void              ipv4_dst_destroy(struct dst_entry *dst);
150 static void              ipv4_dst_ifdown(struct dst_entry *dst,
151                                          struct net_device *dev, int how);
152 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
153 static void              ipv4_link_failure(struct sk_buff *skb);
154 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
155 static int rt_garbage_collect(void);
156
157
/*
 * dst_ops instance for IPv4 routes: binds the callbacks declared above
 * (gc, check, destroy, ifdown, negative_advice, link_failure, update_pmtu)
 * and sets the per-entry size to sizeof(struct rtable).  Its "entries"
 * counter and "gc_thresh" field are read by the garbage collector below.
 */
158 static struct dst_ops ipv4_dst_ops = {
159         .family =               AF_INET,
160         .protocol =             __constant_htons(ETH_P_IP),
161         .gc =                   rt_garbage_collect,
162         .check =                ipv4_dst_check,
163         .destroy =              ipv4_dst_destroy,
164         .ifdown =               ipv4_dst_ifdown,
165         .negative_advice =      ipv4_negative_advice,
166         .link_failure =         ipv4_link_failure,
167         .update_pmtu =          ip_rt_update_pmtu,
168         .entry_size =           sizeof(struct rtable),
169 };
170
/* ECN_OR_COST(class) simply expands to TC_PRIO_<class>. */
171 #define ECN_OR_COST(class)      TC_PRIO_##class
172
/*
 * 16-entry table of TC_PRIO_* values (non-static, so visible to other
 * translation units).
 * NOTE(review): presumably indexed by a 4-bit value derived from the IP
 * TOS field - confirm against the users of this table.
 */
173 const __u8 ip_tos2prio[16] = {
174         TC_PRIO_BESTEFFORT,
175         ECN_OR_COST(FILLER),
176         TC_PRIO_BESTEFFORT,
177         ECN_OR_COST(BESTEFFORT),
178         TC_PRIO_BULK,
179         ECN_OR_COST(BULK),
180         TC_PRIO_BULK,
181         ECN_OR_COST(BULK),
182         TC_PRIO_INTERACTIVE,
183         ECN_OR_COST(INTERACTIVE),
184         TC_PRIO_INTERACTIVE,
185         ECN_OR_COST(INTERACTIVE),
186         TC_PRIO_INTERACTIVE_BULK,
187         ECN_OR_COST(INTERACTIVE_BULK),
188         TC_PRIO_INTERACTIVE_BULK,
189         ECN_OR_COST(INTERACTIVE_BULK)
190 };
191
192
193 /*
194  * Route cache.
195  */
196
197 /* The locking scheme is rather straight forward:
198  *
199  * 1) Read-Copy Update protects the buckets of the central route hash.
200  * 2) Only writers remove entries, and they hold the lock
201  *    as they look at rtable reference counts.
202  * 3) Only readers acquire references to rtable entries,
203  *    they do so with atomic increments and with the
204  *    lock held.
205  */
206
/* One route cache hash bucket: head of a chain linked via u.dst.rt_next. */
207 struct rt_hash_bucket {
208         struct rtable   *chain;
209 };
/*
 * Bucket locking helpers.
 *
 * rt_hash_lock_addr(slot) - spinlock covering bucket "slot"; several
 *                           buckets share one lock (slot is masked with
 *                           RT_HASH_LOCK_SZ - 1).
 * rt_hash_lock_init()     - allocates and initializes the lock table and
 *                           panics if the allocation fails.
 */
210 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
211         defined(CONFIG_PROVE_LOCKING)
212 /*
213  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
214  * The size of this table is a power of two and depends on the number of CPUS.
215  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
216  */
217 #ifdef CONFIG_LOCKDEP
218 # define RT_HASH_LOCK_SZ        256
219 #else
220 # if NR_CPUS >= 32
221 #  define RT_HASH_LOCK_SZ       4096
222 # elif NR_CPUS >= 16
223 #  define RT_HASH_LOCK_SZ       2048
224 # elif NR_CPUS >= 8
225 #  define RT_HASH_LOCK_SZ       1024
226 # elif NR_CPUS >= 4
227 #  define RT_HASH_LOCK_SZ       512
228 # else
229 #  define RT_HASH_LOCK_SZ       256
230 # endif
231 #endif
232
/* Lock table; allocated by rt_hash_lock_init(). */
233 static spinlock_t       *rt_hash_locks;
234 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
235 # define rt_hash_lock_init()    { \
236                 int i; \
237                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
238                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
239                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
240                         spin_lock_init(&rt_hash_locks[i]); \
241                 }
/*
 * Otherwise no lock table exists: the address macro yields NULL and the
 * init macro expands to nothing.
 */
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244 # define rt_hash_lock_init()
245 #endif
246
/*
 * Route cache hash table state.
 *   rt_hash_table - array of buckets, indexed 0..rt_hash_mask.
 *   rt_hash_mask  - index mask applied to hash values (highest bucket index).
 *   rt_hash_log   - shift count used when scaling by the table size
 *                   (presumably log2 of the bucket count - confirm at init).
 *   rt_hash_rnd   - seed mixed into the hash; re-randomized by rt_run_flush().
 */
247 static struct rt_hash_bucket    *rt_hash_table;
248 static unsigned                 rt_hash_mask;
249 static unsigned int             rt_hash_log;
250 static unsigned int             rt_hash_rnd;
251
/* Per-CPU route cache statistics; RT_CACHE_STAT_INC bumps one field. */
252 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
253 #define RT_CACHE_STAT_INC(field) \
254         (__raw_get_cpu_var(rt_cache_stat).field++)
255
256 static int rt_intern_hash(unsigned hash, struct rtable *rth,
257                                 struct rtable **res);
258
259 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
260 {
261         return (jhash_2words(daddr, saddr, rt_hash_rnd)
262                 & rt_hash_mask);
263 }
264
/*
 * rt_hash(daddr, saddr, idx) - bucket index for a flow.
 * The big-endian addresses are force-cast to u32, and (idx << 5) is XORed
 * into the source word before both are handed to rt_hash_code().
 */
265 #define rt_hash(daddr, saddr, idx) \
266         rt_hash_code((__force u32)(__be32)(daddr),\
267                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
268
269 #ifdef CONFIG_PROC_FS
/*
 * Per-open iterator state for the route cache seq_file: the hash bucket
 * currently being walked (counts down from rt_hash_mask to 0, and goes
 * negative once the walk is finished).
 */
270 struct rt_cache_iter_state {
271         int bucket;
272 };
273
274 static struct rtable *rt_cache_get_first(struct seq_file *seq)
275 {
276         struct rtable *r = NULL;
277         struct rt_cache_iter_state *st = seq->private;
278
279         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
280                 rcu_read_lock_bh();
281                 r = rt_hash_table[st->bucket].chain;
282                 if (r)
283                         break;
284                 rcu_read_unlock_bh();
285         }
286         return r;
287 }
288
289 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
290 {
291         struct rt_cache_iter_state *st = rcu_dereference(seq->private);
292
293         r = r->u.dst.rt_next;
294         while (!r) {
295                 rcu_read_unlock_bh();
296                 if (--st->bucket < 0)
297                         break;
298                 rcu_read_lock_bh();
299                 r = rt_hash_table[st->bucket].chain;
300         }
301         return r;
302 }
303
304 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
305 {
306         struct rtable *r = rt_cache_get_first(seq);
307
308         if (r)
309                 while (pos && (r = rt_cache_get_next(seq, r)))
310                         --pos;
311         return pos ? NULL : r;
312 }
313
314 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
315 {
316         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
317 }
318
319 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
320 {
321         struct rtable *r = NULL;
322
323         if (v == SEQ_START_TOKEN)
324                 r = rt_cache_get_first(seq);
325         else
326                 r = rt_cache_get_next(seq, v);
327         ++*pos;
328         return r;
329 }
330
331 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
332 {
333         if (v && v != SEQ_START_TOKEN)
334                 rcu_read_unlock_bh();
335 }
336
337 static int rt_cache_seq_show(struct seq_file *seq, void *v)
338 {
339         if (v == SEQ_START_TOKEN)
340                 seq_printf(seq, "%-127s\n",
341                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
342                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
343                            "HHUptod\tSpecDst");
344         else {
345                 struct rtable *r = v;
346                 char temp[256];
347
348                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
349                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
350                         r->u.dst.dev ? r->u.dst.dev->name : "*",
351                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
352                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
353                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
354                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
355                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
356                         dst_metric(&r->u.dst, RTAX_WINDOW),
357                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
358                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
359                         r->fl.fl4_tos,
360                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
361                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
362                                        dev_queue_xmit) : 0,
363                         r->rt_spec_dst);
364                 seq_printf(seq, "%-127s\n", temp);
365         }
366         return 0;
367 }
368
/* seq_file iterator callbacks for the route cache listing. */
369 static const struct seq_operations rt_cache_seq_ops = {
370         .start  = rt_cache_seq_start,
371         .next   = rt_cache_seq_next,
372         .stop   = rt_cache_seq_stop,
373         .show   = rt_cache_seq_show,
374 };
375
376 static int rt_cache_seq_open(struct inode *inode, struct file *file)
377 {
378         return seq_open_private(file, &rt_cache_seq_ops,
379                         sizeof(struct rt_cache_iter_state));
380 }
381
/*
 * File operations for the route cache listing; release uses
 * seq_release_private to match the seq_open_private() done at open.
 */
382 static const struct file_operations rt_cache_seq_fops = {
383         .owner   = THIS_MODULE,
384         .open    = rt_cache_seq_open,
385         .read    = seq_read,
386         .llseek  = seq_lseek,
387         .release = seq_release_private,
388 };
389
390
391 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
392 {
393         int cpu;
394
395         if (*pos == 0)
396                 return SEQ_START_TOKEN;
397
398         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
399                 if (!cpu_possible(cpu))
400                         continue;
401                 *pos = cpu+1;
402                 return &per_cpu(rt_cache_stat, cpu);
403         }
404         return NULL;
405 }
406
407 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
408 {
409         int cpu;
410
411         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
412                 if (!cpu_possible(cpu))
413                         continue;
414                 *pos = cpu+1;
415                 return &per_cpu(rt_cache_stat, cpu);
416         }
417         return NULL;
418
419 }
420
/*
 * seq_file ->stop for the per-CPU statistics: intentionally empty, since
 * the start/next callbacks acquire nothing that needs releasing.
 */
421 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
422 {
423
424 }
425
426 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
427 {
428         struct rt_cache_stat *st = v;
429
430         if (v == SEQ_START_TOKEN) {
431                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
432                 return 0;
433         }
434
435         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
436                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
437                    atomic_read(&ipv4_dst_ops.entries),
438                    st->in_hit,
439                    st->in_slow_tot,
440                    st->in_slow_mc,
441                    st->in_no_route,
442                    st->in_brd,
443                    st->in_martian_dst,
444                    st->in_martian_src,
445
446                    st->out_hit,
447                    st->out_slow_tot,
448                    st->out_slow_mc,
449
450                    st->gc_total,
451                    st->gc_ignored,
452                    st->gc_goal_miss,
453                    st->gc_dst_overflow,
454                    st->in_hlist_search,
455                    st->out_hlist_search
456                 );
457         return 0;
458 }
459
/* seq_file iterator callbacks for the per-CPU route cache statistics. */
460 static const struct seq_operations rt_cpu_seq_ops = {
461         .start  = rt_cpu_seq_start,
462         .next   = rt_cpu_seq_next,
463         .stop   = rt_cpu_seq_stop,
464         .show   = rt_cpu_seq_show,
465 };
466
467
468 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
469 {
470         return seq_open(file, &rt_cpu_seq_ops);
471 }
472
/* File operations for the per-CPU route cache statistics file. */
473 static const struct file_operations rt_cpu_seq_fops = {
474         .owner   = THIS_MODULE,
475         .open    = rt_cpu_seq_open,
476         .read    = seq_read,
477         .llseek  = seq_lseek,
478         .release = seq_release,
479 };
480
481 #endif /* CONFIG_PROC_FS */
482
483 static __inline__ void rt_free(struct rtable *rt)
484 {
485         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
486 }
487
/*
 * Drop the caller's reference with ip_rt_put() and then queue the entry
 * for deferred freeing exactly as rt_free() does.
 */
static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
493
494 static __inline__ int rt_fast_clean(struct rtable *rth)
495 {
496         /* Kill broadcast/multicast entries very aggresively, if they
497            collide in hash table with more useful entries */
498         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
499                 rth->fl.iif && rth->u.dst.rt_next;
500 }
501
502 static __inline__ int rt_valuable(struct rtable *rth)
503 {
504         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
505                 rth->u.dst.expires;
506 }
507
508 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
509 {
510         unsigned long age;
511         int ret = 0;
512
513         if (atomic_read(&rth->u.dst.__refcnt))
514                 goto out;
515
516         ret = 1;
517         if (rth->u.dst.expires &&
518             time_after_eq(jiffies, rth->u.dst.expires))
519                 goto out;
520
521         age = jiffies - rth->u.dst.lastuse;
522         ret = 0;
523         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
524             (age <= tmo2 && rt_valuable(rth)))
525                 goto out;
526         ret = 1;
527 out:    return ret;
528 }
529
530 /* Bits of score are:
531  * 31: very valuable
532  * 30: not quite useless
533  * 29..0: usage counter
534  */
535 static inline u32 rt_score(struct rtable *rt)
536 {
537         u32 score = jiffies - rt->u.dst.lastuse;
538
539         score = ~score & ~(3<<30);
540
541         if (rt_valuable(rt))
542                 score |= (1<<31);
543
544         if (!rt->fl.iif ||
545             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
546                 score |= (1<<30);
547
548         return score;
549 }
550
551 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
552 {
553         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
554                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
555                 (fl1->mark ^ fl2->mark) |
556                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
557                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
558                 (fl1->oif ^ fl2->oif) |
559                 (fl1->iif ^ fl2->iif)) == 0;
560 }
561
/*
 * rt_check_expire - periodic scan that ages entries out of the cache.
 * @work: work item (unused); the function is run via expires_work.
 *
 * Each run visits a number of buckets proportional to
 * ip_rt_gc_interval / ip_rt_gc_timeout of the table, continuing from
 * where the previous run stopped ("rover"), unlinks expired entries
 * under the bucket lock and hands them to rt_free().  It then re-queues
 * itself to run again after ip_rt_gc_interval.
 */
562 static void rt_check_expire(struct work_struct *work)
563 {
564         static unsigned int rover;
565         unsigned int i = rover, goal;
566         struct rtable *rth, **rthp;
567         u64 mult;
568
        /* Number of buckets to scan: (interval << rt_hash_log) / timeout. */
569         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
570         if (ip_rt_gc_timeout > 1)
571                 do_div(mult, ip_rt_gc_timeout);
572         goal = (unsigned int)mult;
        /* Never scan more than the whole table in one run. */
573         if (goal > rt_hash_mask)
574                 goal = rt_hash_mask + 1;
575         for (; goal > 0; goal--) {
576                 unsigned long tmo = ip_rt_gc_timeout;
577
578                 i = (i + 1) & rt_hash_mask;
579                 rthp = &rt_hash_table[i].chain;
580
                /* Unlocked peek: skip empty buckets without taking the lock. */
581                 if (*rthp == NULL)
582                         continue;
583                 spin_lock_bh(rt_hash_lock_addr(i));
584                 while ((rth = *rthp) != NULL) {
585                         if (rth->u.dst.expires) {
586                                 /* Entry is expired even if it is in use */
587                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
                                        /* Kept: halve the timeout for entries deeper in the chain. */
588                                         tmo >>= 1;
589                                         rthp = &rth->u.dst.rt_next;
590                                         continue;
591                                 }
592                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
593                                 tmo >>= 1;
594                                 rthp = &rth->u.dst.rt_next;
595                                 continue;
596                         }
597
598                         /* Cleanup aged off entries. */
599                         *rthp = rth->u.dst.rt_next;
600                         rt_free(rth);
601                 }
602                 spin_unlock_bh(rt_hash_lock_addr(i));
603         }
        /* Remember where to resume on the next run. */
604         rover = i;
605         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
606 }
607
608 /* This can run from both BH and non-BH contexts, the latter
609  * in the case of a forced flush event.
610  */
611 static void rt_run_flush(unsigned long dummy)
612 {
613         int i;
614         struct rtable *rth, *next;
615
616         rt_deadline = 0;
617
618         get_random_bytes(&rt_hash_rnd, 4);
619
620         for (i = rt_hash_mask; i >= 0; i--) {
621                 spin_lock_bh(rt_hash_lock_addr(i));
622                 rth = rt_hash_table[i].chain;
623                 if (rth)
624                         rt_hash_table[i].chain = NULL;
625                 spin_unlock_bh(rt_hash_lock_addr(i));
626
627                 for (; rth; rth = next) {
628                         next = rth->u.dst.rt_next;
629                         rt_free(rth);
630                 }
631         }
632 }
633
/* Serializes updates of rt_flush_timer and rt_deadline. */
634 static DEFINE_SPINLOCK(rt_flush_lock);
635
/*
 * rt_cache_flush - request a flush of the route cache.
 * @delay: jiffies to wait before flushing; a negative value selects
 *         ip_rt_min_delay, and a value that ends up <= 0 flushes
 *         immediately via rt_run_flush().
 *
 * A delayed request (re)arms rt_flush_timer.  The first delayed request
 * also records a deadline of "now + ip_rt_max_delay" in rt_deadline;
 * later requests may postpone the timer but never beyond that deadline.
 */
636 void rt_cache_flush(int delay)
637 {
638         unsigned long now = jiffies;
        /* Non-zero when not running in softirq context. */
639         int user_mode = !in_softirq();
640
641         if (delay < 0)
642                 delay = ip_rt_min_delay;
643
644         spin_lock_bh(&rt_flush_lock);
645
646         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
647                 long tmo = (long)(rt_deadline - now);
648
649                 /* If the flush timer is already running
650                    and the flush request is not immediate (delay > 0):
651
652                    if the deadline has not been reached, prolong the timer to "delay",
653                    otherwise fire it at deadline time.
654                  */
655
                /* Outside softirq, flush right away when the deadline is this close. */
656                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
657                         tmo = 0;
658
659                 if (delay > tmo)
660                         delay = tmo;
661         }
662
663         if (delay <= 0) {
664                 spin_unlock_bh(&rt_flush_lock);
665                 rt_run_flush(0);
666                 return;
667         }
668
        /* First delayed request: establish the hard deadline. */
669         if (rt_deadline == 0)
670                 rt_deadline = now + ip_rt_max_delay;
671
672         mod_timer(&rt_flush_timer, now+delay);
673         spin_unlock_bh(&rt_flush_lock);
674 }
675
676 static void rt_secret_rebuild(unsigned long dummy)
677 {
678         unsigned long now = jiffies;
679
680         rt_cache_flush(0);
681         mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
682 }
683
684 /*
685    Short description of GC goals.
686
687    We want to build an algorithm that keeps the routing cache
688    at some equilibrium point, where the number of aged-off entries
689    is kept approximately equal to the number of newly generated ones.
690
691    The current expiration strength is the variable "expire".
692    We try to adjust it dynamically, so that when networking
693    is idle "expire" is large enough to keep enough warm entries,
694    and when load increases it is reduced to limit the cache size.
695  */
696
697 static int rt_garbage_collect(void)
698 {
699         static unsigned long expire = RT_GC_TIMEOUT;
700         static unsigned long last_gc;
701         static int rover;
702         static int equilibrium;
703         struct rtable *rth, **rthp;
704         unsigned long now = jiffies;
705         int goal;
706
707         /*
708          * Garbage collection is pretty expensive,
709          * do not make it too frequently.
710          */
711
712         RT_CACHE_STAT_INC(gc_total);
713
714         if (now - last_gc < ip_rt_gc_min_interval &&
715             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
716                 RT_CACHE_STAT_INC(gc_ignored);
717                 goto out;
718         }
719
720         /* Calculate number of entries, which we want to expire now. */
721         goal = atomic_read(&ipv4_dst_ops.entries) -
722                 (ip_rt_gc_elasticity << rt_hash_log);
723         if (goal <= 0) {
724                 if (equilibrium < ipv4_dst_ops.gc_thresh)
725                         equilibrium = ipv4_dst_ops.gc_thresh;
726                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
727                 if (goal > 0) {
728                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
729                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
730                 }
731         } else {
732                 /* We are in dangerous area. Try to reduce cache really
733                  * aggressively.
734                  */
735                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
736                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
737         }
738
739         if (now - last_gc >= ip_rt_gc_min_interval)
740                 last_gc = now;
741
742         if (goal <= 0) {
743                 equilibrium += goal;
744                 goto work_done;
745         }
746
747         do {
748                 int i, k;
749
750                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
751                         unsigned long tmo = expire;
752
753                         k = (k + 1) & rt_hash_mask;
754                         rthp = &rt_hash_table[k].chain;
755                         spin_lock_bh(rt_hash_lock_addr(k));
756                         while ((rth = *rthp) != NULL) {
757                                 if (!rt_may_expire(rth, tmo, expire)) {
758                                         tmo >>= 1;
759                                         rthp = &rth->u.dst.rt_next;
760                                         continue;
761                                 }
762                                 *rthp = rth->u.dst.rt_next;
763                                 rt_free(rth);
764                                 goal--;
765                         }
766                         spin_unlock_bh(rt_hash_lock_addr(k));
767                         if (goal <= 0)
768                                 break;
769                 }
770                 rover = k;
771
772                 if (goal <= 0)
773                         goto work_done;
774
775                 /* Goal is not achieved. We stop process if:
776
777                    - if expire reduced to zero. Otherwise, expire is halfed.
778                    - if table is not full.
779                    - if we are called from interrupt.
780                    - jiffies check is just fallback/debug loop breaker.
781                      We will not spin here for long time in any case.
782                  */
783
784                 RT_CACHE_STAT_INC(gc_goal_miss);
785
786                 if (expire == 0)
787                         break;
788
789                 expire >>= 1;
790 #if RT_CACHE_DEBUG >= 2
791                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
792                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
793 #endif
794
795                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
796                         goto out;
797         } while (!in_softirq() && time_before_eq(jiffies, now));
798
799         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
800                 goto out;
801         if (net_ratelimit())
802                 printk(KERN_WARNING "dst cache overflow\n");
803         RT_CACHE_STAT_INC(gc_dst_overflow);
804         return 1;
805
806 work_done:
807         expire += ip_rt_gc_min_interval;
808         if (expire > ip_rt_gc_timeout ||
809             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
810                 expire = ip_rt_gc_timeout;
811 #if RT_CACHE_DEBUG >= 2
812         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
813                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
814 #endif
815 out:    return 0;
816 }
817
818 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
819 {
820         struct rtable   *rth, **rthp;
821         unsigned long   now;
822         struct rtable *cand, **candp;
823         u32             min_score;
824         int             chain_length;
825         int attempts = !in_softirq();
826
827 restart:
828         chain_length = 0;
829         min_score = ~(u32)0;
830         cand = NULL;
831         candp = NULL;
832         now = jiffies;
833
834         rthp = &rt_hash_table[hash].chain;
835
836         spin_lock_bh(rt_hash_lock_addr(hash));
837         while ((rth = *rthp) != NULL) {
838                 if (compare_keys(&rth->fl, &rt->fl)) {
839                         /* Put it first */
840                         *rthp = rth->u.dst.rt_next;
841                         /*
842                          * Since lookup is lockfree, the deletion
843                          * must be visible to another weakly ordered CPU before
844                          * the insertion at the start of the hash chain.
845                          */
846                         rcu_assign_pointer(rth->u.dst.rt_next,
847                                            rt_hash_table[hash].chain);
848                         /*
849                          * Since lookup is lockfree, the update writes
850                          * must be ordered for consistency on SMP.
851                          */
852                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
853
854                         rth->u.dst.__use++;
855                         dst_hold(&rth->u.dst);
856                         rth->u.dst.lastuse = now;
857                         spin_unlock_bh(rt_hash_lock_addr(hash));
858
859                         rt_drop(rt);
860                         *rp = rth;
861                         return 0;
862                 }
863
864                 if (!atomic_read(&rth->u.dst.__refcnt)) {
865                         u32 score = rt_score(rth);
866
867                         if (score <= min_score) {
868                                 cand = rth;
869                                 candp = rthp;
870                                 min_score = score;
871                         }
872                 }
873
874                 chain_length++;
875
876                 rthp = &rth->u.dst.rt_next;
877         }
878
879         if (cand) {
880                 /* ip_rt_gc_elasticity used to be average length of chain
881                  * length, when exceeded gc becomes really aggressive.
882                  *
883                  * The second limit is less certain. At the moment it allows
884                  * only 2 entries per bucket. We will see.
885                  */
886                 if (chain_length > ip_rt_gc_elasticity) {
887                         *candp = cand->u.dst.rt_next;
888                         rt_free(cand);
889                 }
890         }
891
892         /* Try to bind route to arp only if it is output
893            route or unicast forwarding path.
894          */
895         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
896                 int err = arp_bind_neighbour(&rt->u.dst);
897                 if (err) {
898                         spin_unlock_bh(rt_hash_lock_addr(hash));
899
900                         if (err != -ENOBUFS) {
901                                 rt_drop(rt);
902                                 return err;
903                         }
904
905                         /* Neighbour tables are full and nothing
906                            can be released. Try to shrink route cache,
907                            it is most likely it holds some neighbour records.
908                          */
909                         if (attempts-- > 0) {
910                                 int saved_elasticity = ip_rt_gc_elasticity;
911                                 int saved_int = ip_rt_gc_min_interval;
912                                 ip_rt_gc_elasticity     = 1;
913                                 ip_rt_gc_min_interval   = 0;
914                                 rt_garbage_collect();
915                                 ip_rt_gc_min_interval   = saved_int;
916                                 ip_rt_gc_elasticity     = saved_elasticity;
917                                 goto restart;
918                         }
919
920                         if (net_ratelimit())
921                                 printk(KERN_WARNING "Neighbour table overflow.\n");
922                         rt_drop(rt);
923                         return -ENOBUFS;
924                 }
925         }
926
927         rt->u.dst.rt_next = rt_hash_table[hash].chain;
928 #if RT_CACHE_DEBUG >= 2
929         if (rt->u.dst.rt_next) {
930                 struct rtable *trt;
931                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
932                        NIPQUAD(rt->rt_dst));
933                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
934                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
935                 printk("\n");
936         }
937 #endif
938         rt_hash_table[hash].chain = rt;
939         spin_unlock_bh(rt_hash_lock_addr(hash));
940         *rp = rt;
941         return 0;
942 }
943
944 void rt_bind_peer(struct rtable *rt, int create)
945 {
946         static DEFINE_SPINLOCK(rt_peer_lock);
947         struct inet_peer *peer;
948
949         peer = inet_getpeer(rt->rt_dst, create);
950
951         spin_lock_bh(&rt_peer_lock);
952         if (rt->peer == NULL) {
953                 rt->peer = peer;
954                 peer = NULL;
955         }
956         spin_unlock_bh(&rt_peer_lock);
957         if (peer)
958                 inet_putpeer(peer);
959 }
960
961 /*
962  * Peer allocation may fail only in serious out-of-memory conditions.  However
963  * we still can generate some output.
964  * Random ID selection looks a bit dangerous because we have no chances to
965  * select ID being unique in a reasonable period of time.
966  * But broken packet identifier may be better than no packet at all.
967  */
968 static void ip_select_fb_ident(struct iphdr *iph)
969 {
970         static DEFINE_SPINLOCK(ip_fb_id_lock);
971         static u32 ip_fallback_id;
972         u32 salt;
973
974         spin_lock_bh(&ip_fb_id_lock);
975         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
976         iph->id = htons(salt & 0xFFFF);
977         ip_fallback_id = salt;
978         spin_unlock_bh(&ip_fb_id_lock);
979 }
980
981 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
982 {
983         struct rtable *rt = (struct rtable *) dst;
984
985         if (rt) {
986                 if (rt->peer == NULL)
987                         rt_bind_peer(rt, 1);
988
989                 /* If peer is attached to destination, it is never detached,
990                    so that we need not to grab a lock to dereference it.
991                  */
992                 if (rt->peer) {
993                         iph->id = htons(inet_getid(rt->peer, more));
994                         return;
995                 }
996         } else
997                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
998                        __builtin_return_address(0));
999
1000         ip_select_fb_ident(iph);
1001 }
1002
1003 static void rt_del(unsigned hash, struct rtable *rt)
1004 {
1005         struct rtable **rthp;
1006
1007         spin_lock_bh(rt_hash_lock_addr(hash));
1008         ip_rt_put(rt);
1009         for (rthp = &rt_hash_table[hash].chain; *rthp;
1010              rthp = &(*rthp)->u.dst.rt_next)
1011                 if (*rthp == rt) {
1012                         *rthp = rt->u.dst.rt_next;
1013                         rt_free(rt);
1014                         break;
1015                 }
1016         spin_unlock_bh(rt_hash_lock_addr(hash));
1017 }
1018
/*
 * ip_rt_redirect - process a redirect that names new_gw in place of old_gw
 * for destination daddr, received on dev.
 *
 * new_gw is first validated against the receiving device's configuration;
 * a rejected redirect is optionally logged.  Then the cache buckets for
 * (daddr, {saddr, 0}, {dev->ifindex, 0}) are searched for output entries
 * (fl.iif == 0) that currently use old_gw on dev.  A matching entry is
 * cloned with rt_gateway = new_gw and RTCF_REDIRECTED set; if the new
 * gateway's neighbour entry is NUD_VALID the clone replaces the old entry
 * in the cache, otherwise the clone is dropped.
 */
1019 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1020                     __be32 saddr, struct net_device *dev)
1021 {
1022         int i, k;
1023         struct in_device *in_dev = in_dev_get(dev);
1024         struct rtable *rth, **rthp;
             /* Source and oif keys to try: the exact value and the wildcard 0. */
1025         __be32  skeys[2] = { saddr, 0 };
1026         int  ikeys[2] = { dev->ifindex, 0 };
1027         struct netevent_redirect netevent;
1028
1029         if (!in_dev)
1030                 return;
1031
1032         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1033             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1034                 goto reject_redirect;
1035
1036         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1037                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1038                         goto reject_redirect;
1039                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1040                         goto reject_redirect;
1041         } else {
1042                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1043                         goto reject_redirect;
1044         }
1045
1046         for (i = 0; i < 2; i++) {
1047                 for (k = 0; k < 2; k++) {
1048                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1049
1050                         rthp=&rt_hash_table[hash].chain;
1051
1052                         rcu_read_lock();
1053                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1054                                 struct rtable *rt;
1055
                                     /* Skip entries whose flow key differs. */
1056                                 if (rth->fl.fl4_dst != daddr ||
1057                                     rth->fl.fl4_src != skeys[i] ||
1058                                     rth->fl.oif != ikeys[k] ||
1059                                     rth->fl.iif != 0) {
1060                                         rthp = &rth->u.dst.rt_next;
1061                                         continue;
1062                                 }
1063
                                     /*
                                      * Key matches but the entry is not routed
                                      * via old_gw on dev: give up on this bucket.
                                      */
1064                                 if (rth->rt_dst != daddr ||
1065                                     rth->rt_src != saddr ||
1066                                     rth->u.dst.error ||
1067                                     rth->rt_gateway != old_gw ||
1068                                     rth->u.dst.dev != dev)
1069                                         break;
1070
                                     /* Pin the entry before leaving the RCU section. */
1071                                 dst_hold(&rth->u.dst);
1072                                 rcu_read_unlock();
1073
1074                                 rt = dst_alloc(&ipv4_dst_ops);
1075                                 if (rt == NULL) {
1076                                         ip_rt_put(rth);
1077                                         in_dev_put(in_dev);
1078                                         return;
1079                                 }
1080
1081                                 /* Copy all the information. */
1082                                 *rt = *rth;
                                     /*
                                      * Reset the per-entry state that must not be
                                      * shared with the original and take the clone's
                                      * own references on the device and in_device.
                                      */
1083                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1084                                 rt->u.dst.__use         = 1;
1085                                 atomic_set(&rt->u.dst.__refcnt, 1);
1086                                 rt->u.dst.child         = NULL;
1087                                 if (rt->u.dst.dev)
1088                                         dev_hold(rt->u.dst.dev);
1089                                 if (rt->idev)
1090                                         in_dev_hold(rt->idev);
1091                                 rt->u.dst.obsolete      = 0;
1092                                 rt->u.dst.lastuse       = jiffies;
1093                                 rt->u.dst.path          = &rt->u.dst;
1094                                 rt->u.dst.neighbour     = NULL;
1095                                 rt->u.dst.hh            = NULL;
1096                                 rt->u.dst.xfrm          = NULL;
1097
1098                                 rt->rt_flags            |= RTCF_REDIRECTED;
1099
1100                                 /* Gateway is different ... */
1101                                 rt->rt_gateway          = new_gw;
1102
1103                                 /* Redirect received -> path was valid */
1104                                 dst_confirm(&rth->u.dst);
1105
1106                                 if (rt->peer)
1107                                         atomic_inc(&rt->peer->refcnt);
1108
                                     /*
                                      * Use the new gateway only if its neighbour
                                      * entry is already valid; otherwise kick
                                      * resolution and discard the clone.
                                      */
1109                                 if (arp_bind_neighbour(&rt->u.dst) ||
1110                                     !(rt->u.dst.neighbour->nud_state &
1111                                             NUD_VALID)) {
1112                                         if (rt->u.dst.neighbour)
1113                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1114                                         ip_rt_put(rth);
1115                                         rt_drop(rt);
1116                                         goto do_next;
1117                                 }
1118
1119                                 netevent.old = &rth->u.dst;
1120                                 netevent.new = &rt->u.dst;
1121                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1122                                                         &netevent);
1123
                                     /*
                                      * rt_del() drops the reference taken above;
                                      * the reference returned by rt_intern_hash()
                                      * is released immediately.
                                      */
1124                                 rt_del(hash, rth);
1125                                 if (!rt_intern_hash(hash, rt, &rt))
1126                                         ip_rt_put(rt);
1127                                 goto do_next;
1128                         }
1129                         rcu_read_unlock();
                     /* Reached by goto only after rcu_read_unlock() was already done. */
1130                 do_next:
1131                         ;
1132                 }
1133         }
1134         in_dev_put(in_dev);
1135         return;
1136
1137 reject_redirect:
1138 #ifdef CONFIG_IP_ROUTE_VERBOSE
1139         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1140                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1141                         "%u.%u.%u.%u ignored.\n"
1142                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1143                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1144                        NIPQUAD(saddr), NIPQUAD(daddr));
1145 #endif
1146         in_dev_put(in_dev);
1147 }
1148
1149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1150 {
1151         struct rtable *rt = (struct rtable*)dst;
1152         struct dst_entry *ret = dst;
1153
1154         if (rt) {
1155                 if (dst->obsolete) {
1156                         ip_rt_put(rt);
1157                         ret = NULL;
1158                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1159                            rt->u.dst.expires) {
1160                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1161                                                 rt->fl.oif);
1162 #if RT_CACHE_DEBUG >= 1
1163                         printk(KERN_DEBUG "ip_rt_advice: redirect to "
1164                                           "%u.%u.%u.%u/%02x dropped\n",
1165                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1166 #endif
1167                         rt_del(hash, rt);
1168                         ret = NULL;
1169                 }
1170         }
1171         return ret;
1172 }
1173
1174 /*
1175  * Algorithm:
1176  *      1. The first ip_rt_redirect_number redirects are sent
1177  *         with exponential backoff, then we stop sending them at all,
1178  *         assuming that the host ignores our redirects.
1179  *      2. If we did not see packets requiring redirects
1180  *         during ip_rt_redirect_silence, we assume that the host
1181  *         forgot redirected route and start to send redirects again.
1182  *
1183  * This algorithm is much cheaper and more intelligent than dumb load limiting
1184  * in icmp.c.
1185  *
1186  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1187  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1188  */
1189
1190 void ip_rt_send_redirect(struct sk_buff *skb)
1191 {
1192         struct rtable *rt = (struct rtable*)skb->dst;
1193         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1194
1195         if (!in_dev)
1196                 return;
1197
1198         if (!IN_DEV_TX_REDIRECTS(in_dev))
1199                 goto out;
1200
1201         /* No redirected packets during ip_rt_redirect_silence;
1202          * reset the algorithm.
1203          */
1204         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1205                 rt->u.dst.rate_tokens = 0;
1206
1207         /* Too many ignored redirects; do not send anything
1208          * set u.dst.rate_last to the last seen redirected packet.
1209          */
1210         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1211                 rt->u.dst.rate_last = jiffies;
1212                 goto out;
1213         }
1214
1215         /* Check for load limit; set rate_last to the latest sent
1216          * redirect.
1217          */
1218         if (rt->u.dst.rate_tokens == 0 ||
1219             time_after(jiffies,
1220                        (rt->u.dst.rate_last +
1221                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1222                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1223                 rt->u.dst.rate_last = jiffies;
1224                 ++rt->u.dst.rate_tokens;
1225 #ifdef CONFIG_IP_ROUTE_VERBOSE
1226                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1227                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1228                     net_ratelimit())
1229                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1230                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1231                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1232                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1233 #endif
1234         }
1235 out:
1236         in_dev_put(in_dev);
1237 }
1238
1239 static int ip_error(struct sk_buff *skb)
1240 {
1241         struct rtable *rt = (struct rtable*)skb->dst;
1242         unsigned long now;
1243         int code;
1244
1245         switch (rt->u.dst.error) {
1246                 case EINVAL:
1247                 default:
1248                         goto out;
1249                 case EHOSTUNREACH:
1250                         code = ICMP_HOST_UNREACH;
1251                         break;
1252                 case ENETUNREACH:
1253                         code = ICMP_NET_UNREACH;
1254                         break;
1255                 case EACCES:
1256                         code = ICMP_PKT_FILTERED;
1257                         break;
1258         }
1259
1260         now = jiffies;
1261         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1262         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1263                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1264         rt->u.dst.rate_last = now;
1265         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1266                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1267                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1268         }
1269
1270 out:    kfree_skb(skb);
1271         return 0;
1272 }
1273
/*
 *      MTU plateau table, in descending order.  The last two values
 *      are not from the RFC but are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

/*
 * Return the largest plateau strictly below old_mtu, or 68 when
 * old_mtu is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
        const unsigned short *plateau = mtu_plateau;
        const unsigned short *const end =
                mtu_plateau + sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

        while (plateau != end) {
                if (*plateau < old_mtu)
                        return *plateau;
                plateau++;
        }
        return 68;
}
1291
1292 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1293 {
1294         int i;
1295         unsigned short old_mtu = ntohs(iph->tot_len);
1296         struct rtable *rth;
1297         __be32  skeys[2] = { iph->saddr, 0, };
1298         __be32  daddr = iph->daddr;
1299         unsigned short est_mtu = 0;
1300
1301         if (ipv4_config.no_pmtu_disc)
1302                 return 0;
1303
1304         for (i = 0; i < 2; i++) {
1305                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1306
1307                 rcu_read_lock();
1308                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1309                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1310                         if (rth->fl.fl4_dst == daddr &&
1311                             rth->fl.fl4_src == skeys[i] &&
1312                             rth->rt_dst  == daddr &&
1313                             rth->rt_src  == iph->saddr &&
1314                             rth->fl.iif == 0 &&
1315                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1316                                 unsigned short mtu = new_mtu;
1317
1318                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1319
1320                                         /* BSD 4.2 compatibility hack :-( */
1321                                         if (mtu == 0 &&
1322                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1323                                             old_mtu >= 68 + (iph->ihl << 2))
1324                                                 old_mtu -= iph->ihl << 2;
1325
1326                                         mtu = guess_mtu(old_mtu);
1327                                 }
1328                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1329                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1330                                                 dst_confirm(&rth->u.dst);
1331                                                 if (mtu < ip_rt_min_pmtu) {
1332                                                         mtu = ip_rt_min_pmtu;
1333                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1334                                                                 (1 << RTAX_MTU);
1335                                                 }
1336                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1337                                                 dst_set_expires(&rth->u.dst,
1338                                                         ip_rt_mtu_expires);
1339                                         }
1340                                         est_mtu = mtu;
1341                                 }
1342                         }
1343                 }
1344                 rcu_read_unlock();
1345         }
1346         return est_mtu ? : new_mtu;
1347 }
1348
1349 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1350 {
1351         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1352             !(dst_metric_locked(dst, RTAX_MTU))) {
1353                 if (mtu < ip_rt_min_pmtu) {
1354                         mtu = ip_rt_min_pmtu;
1355                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1356                 }
1357                 dst->metrics[RTAX_MTU-1] = mtu;
1358                 dst_set_expires(dst, ip_rt_mtu_expires);
1359                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1360         }
1361 }
1362
1363 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1364 {
1365         return NULL;
1366 }
1367
1368 static void ipv4_dst_destroy(struct dst_entry *dst)
1369 {
1370         struct rtable *rt = (struct rtable *) dst;
1371         struct inet_peer *peer = rt->peer;
1372         struct in_device *idev = rt->idev;
1373
1374         if (peer) {
1375                 rt->peer = NULL;
1376                 inet_putpeer(peer);
1377         }
1378
1379         if (idev) {
1380                 rt->idev = NULL;
1381                 in_dev_put(idev);
1382         }
1383 }
1384
1385 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1386                             int how)
1387 {
1388         struct rtable *rt = (struct rtable *) dst;
1389         struct in_device *idev = rt->idev;
1390         if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1391                 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1392                 if (loopback_idev) {
1393                         rt->idev = loopback_idev;
1394                         in_dev_put(idev);
1395                 }
1396         }
1397 }
1398
1399 static void ipv4_link_failure(struct sk_buff *skb)
1400 {
1401         struct rtable *rt;
1402
1403         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1404
1405         rt = (struct rtable *) skb->dst;
1406         if (rt)
1407                 dst_set_expires(&rt->u.dst, 0);
1408 }
1409
1410 static int ip_rt_bug(struct sk_buff *skb)
1411 {
1412         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1413                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1414                 skb->dev ? skb->dev->name : "?");
1415         kfree_skb(skb);
1416         return 0;
1417 }
1418
1419 /*
1420    We do not cache source address of outgoing interface,
1421    because it is used only by IP RR, TS and SRR options,
1422    so that it out of fast path.
1423
1424    BTW remember: "addr" is allowed to be not aligned
1425    in IP options!
1426  */
1427
1428 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1429 {
1430         __be32 src;
1431         struct fib_result res;
1432
1433         if (rt->fl.iif == 0)
1434                 src = rt->rt_src;
1435         else if (fib_lookup(&rt->fl, &res) == 0) {
1436                 src = FIB_RES_PREFSRC(res);
1437                 fib_res_put(&res);
1438         } else
1439                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1440                                         RT_SCOPE_UNIVERSE);
1441         memcpy(addr, &src, 4);
1442 }
1443
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * set_class_tag - fill in whichever 16-bit half of the route's tclassid
 * is still zero from the corresponding half of @tag.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        struct dst_entry *dst = &rt->u.dst;

        if ((dst->tclassid & 0xFFFF) == 0)
                dst->tclassid |= tag & 0xFFFF;
        if ((dst->tclassid & 0xFFFF0000) == 0)
                dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1453
/*
 * rt_set_nexthop - fill in gateway, metrics, class tag and type of rt
 * from the FIB lookup result res.
 *
 * With a fib_info present, the gateway is taken from the result when it
 * has one with link scope, and the metrics are copied from the fib_info;
 * if no MTU is configured there, the device MTU is used instead.
 * Without a fib_info only the MTU metric is set, from the device.
 * Afterwards defaults and upper bounds are applied to the HOPLIMIT, MTU
 * and ADVMSS metrics.  itag is merged into the route's tclassid when
 * CONFIG_NET_CLS_ROUTE is enabled.
 */
1454 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1455 {
1456         struct fib_info *fi = res->fi;
1457
1458         if (fi) {
1459                 if (FIB_RES_GW(*res) &&
1460                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1461                         rt->rt_gateway = FIB_RES_GW(*res);
1462                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1463                        sizeof(rt->u.dst.metrics));
1464                 if (fi->fib_mtu == 0) {
1465                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
                             /*
                              * MTU metric locked, gateway differs from the
                              * destination and device MTU above 576:
                              * fall back to 576.
                              */
1466                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1467                             rt->rt_gateway != rt->rt_dst &&
1468                             rt->u.dst.dev->mtu > 576)
1469                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1470                 }
1471 #ifdef CONFIG_NET_CLS_ROUTE
1472                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1473 #endif
1474         } else
1475                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1476
             /* Defaults for unset metrics, and upper bounds. */
1477         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1478                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1479         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1480                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1481         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1482                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1483                                        ip_rt_min_advmss);
1484         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1485                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1486
1487 #ifdef CONFIG_NET_CLS_ROUTE
1488 #ifdef CONFIG_IP_MULTIPLE_TABLES
1489         set_class_tag(rt, fib_rules_tclass(res));
1490 #endif
1491         set_class_tag(rt, itag);
1492 #endif
1493         rt->rt_type = res->type;
1494 }
1495
/*
 * ip_route_input_mc - build and cache an input route for a multicast
 * packet received on dev.
 *
 * After the source address has been validated, a cache entry is allocated
 * that points at the loopback device.  Its input handler is
 * ip_local_deliver when "our" is set and, with CONFIG_IP_MROUTE, ip_mr_input
 * for non-link-local groups on a device with multicast forwarding
 * enabled.  The entry is then inserted with rt_intern_hash(), which
 * stores the resulting route in skb->dst.
 *
 * Returns the result of rt_intern_hash(), -EINVAL when the device has no
 * in_device or the source check fails, or -ENOBUFS when allocation fails.
 */
1496 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1497                                 u8 tos, struct net_device *dev, int our)
1498 {
1499         unsigned hash;
1500         struct rtable *rth;
1501         __be32 spec_dst;
1502         struct in_device *in_dev = in_dev_get(dev);
1503         u32 itag = 0;
1504
1505         /* Primary sanity checks. */
1506
1507         if (in_dev == NULL)
1508                 return -EINVAL;
1509
1510         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1511             skb->protocol != htons(ETH_P_IP))
1512                 goto e_inval;
1513
             /* A zeronet source is accepted only for LOCAL_MCAST() destinations. */
1514         if (ZERONET(saddr)) {
1515                 if (!LOCAL_MCAST(daddr))
1516                         goto e_inval;
1517                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1518         } else if (fib_validate_source(saddr, 0, tos, 0,
1519                                         dev, &spec_dst, &itag) < 0)
1520                 goto e_inval;
1521
1522         rth = dst_alloc(&ipv4_dst_ops);
1523         if (!rth)
1524                 goto e_nobufs;
1525
1526         rth->u.dst.output= ip_rt_bug;
1527
1528         atomic_set(&rth->u.dst.__refcnt, 1);
1529         rth->u.dst.flags= DST_HOST;
1530         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1531                 rth->u.dst.flags |= DST_NOPOLICY;
1532         rth->fl.fl4_dst = daddr;
1533         rth->rt_dst     = daddr;
1534         rth->fl.fl4_tos = tos;
1535         rth->fl.mark    = skb->mark;
1536         rth->fl.fl4_src = saddr;
1537         rth->rt_src     = saddr;
1538 #ifdef CONFIG_NET_CLS_ROUTE
1539         rth->u.dst.tclassid = itag;
1540 #endif
1541         rth->rt_iif     =
1542         rth->fl.iif     = dev->ifindex;
             /* The entry's device is loopback; hold it and its in_device. */
1543         rth->u.dst.dev  = init_net.loopback_dev;
1544         dev_hold(rth->u.dst.dev);
1545         rth->idev       = in_dev_get(rth->u.dst.dev);
1546         rth->fl.oif     = 0;
1547         rth->rt_gateway = daddr;
1548         rth->rt_spec_dst= spec_dst;
1549         rth->rt_type    = RTN_MULTICAST;
1550         rth->rt_flags   = RTCF_MULTICAST;
1551         if (our) {
1552                 rth->u.dst.input= ip_local_deliver;
1553                 rth->rt_flags |= RTCF_LOCAL;
1554         }
1555
1556 #ifdef CONFIG_IP_MROUTE
1557         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1558                 rth->u.dst.input = ip_mr_input;
1559 #endif
1560         RT_CACHE_STAT_INC(in_slow_mc);
1561
1562         in_dev_put(in_dev);
1563         hash = rt_hash(daddr, saddr, dev->ifindex);
1564         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1565
1566 e_nobufs:
1567         in_dev_put(in_dev);
1568         return -ENOBUFS;
1569
1570 e_inval:
1571         in_dev_put(in_dev);
1572         return -EINVAL;
1573 }
1574
1575
1576 static void ip_handle_martian_source(struct net_device *dev,
1577                                      struct in_device *in_dev,
1578                                      struct sk_buff *skb,
1579                                      __be32 daddr,
1580                                      __be32 saddr)
1581 {
1582         RT_CACHE_STAT_INC(in_martian_src);
1583 #ifdef CONFIG_IP_ROUTE_VERBOSE
1584         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1585                 /*
1586                  *      RFC1812 recommendation, if source is martian,
1587                  *      the only hint is MAC header.
1588                  */
1589                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1590                         "%u.%u.%u.%u, on dev %s\n",
1591                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1592                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1593                         int i;
1594                         const unsigned char *p = skb_mac_header(skb);
1595                         printk(KERN_WARNING "ll header: ");
1596                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1597                                 printk("%02x", *p);
1598                                 if (i < (dev->hard_header_len - 1))
1599                                         printk(":");
1600                         }
1601                         printk("\n");
1602                 }
1603         }
1604 #endif
1605 }
1606
/*
 * Build (but do not hash) a route cache entry for an input packet that is to
 * be forwarded according to the FIB lookup result in *res.
 *
 * On success the new entry, created with a reference count of one, is stored
 * in *result and 0 is returned.  Returns -EINVAL when the output device has
 * no in_device, when fib_validate_source() rejects the source address, or
 * when a non-IP packet (e.g. ARP) would be routed back out of the input
 * device without RTCF_DNAT; returns -ENOBUFS when dst_alloc() fails.
 */
1607 static inline int __mkroute_input(struct sk_buff *skb,
1608                                   struct fib_result* res,
1609                                   struct in_device *in_dev,
1610                                   __be32 daddr, __be32 saddr, u32 tos,
1611                                   struct rtable **result)
1612 {
1613
1614         struct rtable *rth;
1615         int err;
1616         struct in_device *out_dev;
1617         unsigned flags = 0;
1618         __be32 spec_dst;
1619         u32 itag;
1620
1621         /* get a working reference to the output device */
1622         out_dev = in_dev_get(FIB_RES_DEV(*res));
1623         if (out_dev == NULL) {
1624                 if (net_ratelimit())
1625                         printk(KERN_CRIT "Bug in ip_route_input" \
1626                                "_slow(). Please, report\n");
1627                 return -EINVAL;
1628         }
1629
1630
             /* Validate the source; this also yields spec_dst and itag. */
1631         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1632                                   in_dev->dev, &spec_dst, &itag);
1633         if (err < 0) {
1634                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1635                                          saddr);
1636
1637                 err = -EINVAL;
1638                 goto cleanup;
1639         }
1640
             /* A positive result marks the route as RTCF_DIRECTSRC. */
1641         if (err)
1642                 flags |= RTCF_DIRECTSRC;
1643
             /*
              * The packet would leave through the device it arrived on:
              * flag the route RTCF_DOREDIRECT when the medium is shared or
              * inet_addr_onlink() accepts the source/gateway pair.
              */
1644         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1645             (IN_DEV_SHARED_MEDIA(out_dev) ||
1646              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1647                 flags |= RTCF_DOREDIRECT;
1648
1649         if (skb->protocol != htons(ETH_P_IP)) {
1650                 /* Not IP (i.e. ARP). Do not create route, if it is
1651                  * invalid for proxy arp. DNAT routes are always valid.
1652                  */
1653                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1654                         err = -EINVAL;
1655                         goto cleanup;
1656                 }
1657         }
1658
1659
1660         rth = dst_alloc(&ipv4_dst_ops);
1661         if (!rth) {
1662                 err = -ENOBUFS;
1663                 goto cleanup;
1664         }
1665
             /* The new entry starts with a single reference. */
1666         atomic_set(&rth->u.dst.__refcnt, 1);
1667         rth->u.dst.flags= DST_HOST;
1668         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1669                 rth->u.dst.flags |= DST_NOPOLICY;
1670         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1671                 rth->u.dst.flags |= DST_NOXFRM;
             /* Fill in the flow key (rth->fl) and the route's own fields. */
1672         rth->fl.fl4_dst = daddr;
1673         rth->rt_dst     = daddr;
1674         rth->fl.fl4_tos = tos;
1675         rth->fl.mark    = skb->mark;
1676         rth->fl.fl4_src = saddr;
1677         rth->rt_src     = saddr;
1678         rth->rt_gateway = daddr;
1679         rth->rt_iif     =
1680                 rth->fl.iif     = in_dev->dev->ifindex;
             /*
              * The cache entry takes its own references on the output
              * device and on that device's in_device.
              */
1681         rth->u.dst.dev  = (out_dev)->dev;
1682         dev_hold(rth->u.dst.dev);
1683         rth->idev       = in_dev_get(rth->u.dst.dev);
1684         rth->fl.oif     = 0;
1685         rth->rt_spec_dst= spec_dst;
1686
             /* Packets matching this entry are forwarded. */
1687         rth->u.dst.input = ip_forward;
1688         rth->u.dst.output = ip_output;
1689
1690         rt_set_nexthop(rth, res, itag);
1691
1692         rth->rt_flags = flags;
1693
1694         *result = rth;
1695         err = 0;
             /* Reached on success as well as on every failure after in_dev_get(). */
1696  cleanup:
1697         /* release the working reference to the output device */
1698         in_dev_put(out_dev);
1699         return err;
1700 }
1701
1702 static inline int ip_mkroute_input(struct sk_buff *skb,
1703                                    struct fib_result* res,
1704                                    const struct flowi *fl,
1705                                    struct in_device *in_dev,
1706                                    __be32 daddr, __be32 saddr, u32 tos)
1707 {
1708         struct rtable* rth = NULL;
1709         int err;
1710         unsigned hash;
1711
1712 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1713         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1714                 fib_select_multipath(fl, res);
1715 #endif
1716
1717         /* create a routing cache entry */
1718         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1719         if (err)
1720                 return err;
1721
1722         /* put it into the cache */
1723         hash = rt_hash(daddr, saddr, fl->iif);
1724         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1725 }
1726
1727 /*
1728  *      NOTE. We drop all packets that have a local source
1729  *      address, because every properly looped back packet
1730  *      must already have the correct destination attached by the output routine.
1731  *
1732  *      This approach solves two big problems:
1733  *      1. Non-simplex devices are handled properly.
1734  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1735  */
1736
/*
 * Slow path of input route resolution: classify the packet, consult the FIB
 * and create the matching route cache entry.
 *
 * Returns 0 on success or a negative errno: -EINVAL (no in_device on @dev,
 * martian source, non-IP broadcast, or failure reported by
 * ip_mkroute_input()), -EHOSTUNREACH (martian destination, or forwarding
 * disabled on the input device), -ENOBUFS (allocation failure), or whatever
 * rt_intern_hash()/ip_mkroute_input() return.
 *
 * Every exit through "done" drops the in_device reference taken at the top
 * and, when the FIB lookup succeeded (free_res), the FIB result as well.
 */
1737 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1738                                u8 tos, struct net_device *dev)
1739 {
1740         struct fib_result res;
1741         struct in_device *in_dev = in_dev_get(dev);
1742         struct flowi fl = { .nl_u = { .ip4_u =
1743                                       { .daddr = daddr,
1744                                         .saddr = saddr,
1745                                         .tos = tos,
1746                                         .scope = RT_SCOPE_UNIVERSE,
1747                                       } },
1748                             .mark = skb->mark,
1749                             .iif = dev->ifindex };
1750         unsigned        flags = 0;
1751         u32             itag = 0;
1752         struct rtable * rth;
1753         unsigned        hash;
1754         __be32          spec_dst;
1755         int             err = -EINVAL;
1756         int             free_res = 0;
1757
1758         /* IP on this device is disabled. */
1759
             /* No reference was taken, so skip "done"; err is still -EINVAL. */
1760         if (!in_dev)
1761                 goto out;
1762
1763         /* Check for the most weird martians, which can be not detected
1764            by fib_lookup.
1765          */
1766
1767         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1768                 goto martian_source;
1769
             /* Limited broadcast, or both addresses zero: broadcast input. */
1770         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1771                 goto brd_input;
1772
1773         /* Accept zero addresses only to limited broadcast;
1774          * I even do not know to fix it or not. Waiting for complains :-)
1775          */
1776         if (ZERONET(saddr))
1777                 goto martian_source;
1778
1779         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1780                 goto martian_destination;
1781
1782         /*
1783          *      Now we are ready to route packet.
1784          */
1785         if ((err = fib_lookup(&fl, &res)) != 0) {
1786                 if (!IN_DEV_FORWARD(in_dev))
1787                         goto e_hostunreach;
1788                 goto no_route;
1789         }
             /* From here on the FIB result must be released at "done". */
1790         free_res = 1;
1791
1792         RT_CACHE_STAT_INC(in_slow_tot);
1793
1794         if (res.type == RTN_BROADCAST)
1795                 goto brd_input;
1796
1797         if (res.type == RTN_LOCAL) {
1798                 int result;
                     /* Validate the source against the loopback device as output interface. */
1799                 result = fib_validate_source(saddr, daddr, tos,
1800                                              init_net.loopback_dev->ifindex,
1801                                              dev, &spec_dst, &itag);
1802                 if (result < 0)
1803                         goto martian_source;
1804                 if (result)
1805                         flags |= RTCF_DIRECTSRC;
                     /* Overrides whatever fib_validate_source() stored in spec_dst. */
1806                 spec_dst = daddr;
1807                 goto local_input;
1808         }
1809
             /* Remaining cases need forwarding, and only unicast is forwarded. */
1810         if (!IN_DEV_FORWARD(in_dev))
1811                 goto e_hostunreach;
1812         if (res.type != RTN_UNICAST)
1813                 goto martian_destination;
1814
1815         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1816         if (err == -ENOBUFS)
1817                 goto e_nobufs;
1818         if (err == -EINVAL)
1819                 goto e_inval;
1820
             /* Any other value of err (including 0) is returned unchanged. */
1821 done:
1822         in_dev_put(in_dev);
1823         if (free_res)
1824                 fib_res_put(&res);
1825 out:    return err;
1826
             /* Broadcast input: only IP packets get a broadcast route. */
1827 brd_input:
1828         if (skb->protocol != htons(ETH_P_IP))
1829                 goto e_inval;
1830
1831         if (ZERONET(saddr))
1832                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1833         else {
1834                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1835                                           &itag);
1836                 if (err < 0)
1837                         goto martian_source;
1838                 if (err)
1839                         flags |= RTCF_DIRECTSRC;
1840         }
1841         flags |= RTCF_BROADCAST;
1842         res.type = RTN_BROADCAST;
1843         RT_CACHE_STAT_INC(in_brd);
1844
             /*
              * Shared by the local, broadcast and unreachable (no_route) cases:
              * build a cache entry bound to the loopback device.
              */
1845 local_input:
1846         rth = dst_alloc(&ipv4_dst_ops);
1847         if (!rth)
1848                 goto e_nobufs;
1849
             /* This entry is for input only; output is routed to ip_rt_bug. */
1850         rth->u.dst.output= ip_rt_bug;
1851
1852         atomic_set(&rth->u.dst.__refcnt, 1);
1853         rth->u.dst.flags= DST_HOST;
1854         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1855                 rth->u.dst.flags |= DST_NOPOLICY;
1856         rth->fl.fl4_dst = daddr;
1857         rth->rt_dst     = daddr;
1858         rth->fl.fl4_tos = tos;
1859         rth->fl.mark    = skb->mark;
1860         rth->fl.fl4_src = saddr;
1861         rth->rt_src     = saddr;
1862 #ifdef CONFIG_NET_CLS_ROUTE
1863         rth->u.dst.tclassid = itag;
1864 #endif
1865         rth->rt_iif     =
1866         rth->fl.iif     = dev->ifindex;
             /* The entry holds its own references on loopback and its in_device. */
1867         rth->u.dst.dev  = init_net.loopback_dev;
1868         dev_hold(rth->u.dst.dev);
1869         rth->idev       = in_dev_get(rth->u.dst.dev);
1870         rth->rt_gateway = daddr;
1871         rth->rt_spec_dst= spec_dst;
1872         rth->u.dst.input= ip_local_deliver;
1873         rth->rt_flags   = flags|RTCF_LOCAL;
             /*
              * Reached via no_route: err still holds the fib_lookup() failure
              * code, which is stored (negated) as the entry's error and the
              * packet is handed to ip_error instead of being delivered.
              */
1874         if (res.type == RTN_UNREACHABLE) {
1875                 rth->u.dst.input= ip_error;
1876                 rth->u.dst.error= -err;
1877                 rth->rt_flags   &= ~RTCF_LOCAL;
1878         }
1879         rth->rt_type    = res.type;
1880         hash = rt_hash(daddr, saddr, fl.iif);
1881         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1882         goto done;
1883
             /* FIB lookup failed while forwarding is enabled: cache an unreachable entry. */
1884 no_route:
1885         RT_CACHE_STAT_INC(in_no_route);
1886         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1887         res.type = RTN_UNREACHABLE;
1888         goto local_input;
1889
1890         /*
1891          *      Do not cache martian addresses: they should be logged (RFC1812)
1892          */
1893 martian_destination:
1894         RT_CACHE_STAT_INC(in_martian_dst);
1895 #ifdef CONFIG_IP_ROUTE_VERBOSE
1896         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1897                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1898                         "%u.%u.%u.%u, dev %s\n",
1899                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1900 #endif
1901
             /* martian_destination falls through to here. */
1902 e_hostunreach:
1903         err = -EHOSTUNREACH;
1904         goto done;
1905
1906 e_inval:
1907         err = -EINVAL;
1908         goto done;
1909
1910 e_nobufs:
1911         err = -ENOBUFS;
1912         goto done;
1913
             /* Log the martian source, then fail with -EINVAL. */
1914 martian_source:
1915         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1916         goto e_inval;
1917 }
1918
1919 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1920                    u8 tos, struct net_device *dev)
1921 {
1922         struct rtable * rth;
1923         unsigned        hash;
1924         int iif = dev->ifindex;
1925
1926         tos &= IPTOS_RT_MASK;
1927         hash = rt_hash(daddr, saddr, iif);
1928
1929         rcu_read_lock();
1930         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1931              rth = rcu_dereference(rth->u.dst.rt_next)) {
1932                 if (rth->fl.fl4_dst == daddr &&
1933                     rth->fl.fl4_src == saddr &&
1934                     rth->fl.iif == iif &&
1935                     rth->fl.oif == 0 &&
1936                     rth->fl.mark == skb->mark &&
1937                     rth->fl.fl4_tos == tos) {
1938                         rth->u.dst.lastuse = jiffies;
1939                         dst_hold(&rth->u.dst);
1940                         rth->u.dst.__use++;
1941                         RT_CACHE_STAT_INC(in_hit);
1942                         rcu_read_unlock();
1943                         skb->dst = (struct dst_entry*)rth;
1944                         return 0;
1945                 }
1946                 RT_CACHE_STAT_INC(in_hlist_search);
1947         }
1948         rcu_read_unlock();
1949
1950         /* Multicast recognition logic is moved from route cache to here.
1951            The problem was that too many Ethernet cards have broken/missing
1952            hardware multicast filters :-( As result the host on multicasting
1953            network acquires a lot of useless route cache entries, sort of
1954            SDR messages from all the world. Now we try to get rid of them.
1955            Really, provided software IP multicast filter is organized
1956            reasonably (at least, hashed), it does not result in a slowdown
1957            comparing with route cache reject entries.
1958            Note, that multicast routers are not affected, because
1959            route cache entry is created eventually.
1960          */
1961         if (MULTICAST(daddr)) {
1962                 struct in_device *in_dev;
1963
1964                 rcu_read_lock();
1965                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
1966                         int our = ip_check_mc(in_dev, daddr, saddr,
1967                                 ip_hdr(skb)->protocol);
1968                         if (our
1969 #ifdef CONFIG_IP_MROUTE
1970                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1971 #endif
1972                             ) {
1973                                 rcu_read_unlock();
1974                                 return ip_route_input_mc(skb, daddr, saddr,
1975                                                          tos, dev, our);
1976                         }
1977                 }
1978                 rcu_read_unlock();
1979                 return -EINVAL;
1980         }
1981         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
1982 }
1983
/*
 * Build (but do not hash) a route cache entry for an output flow.
 *
 * @fl is the resolved flow, @oldflp the caller's original key; the cache
 * entry's lookup key (rth->fl) is filled from @oldflp while rt_dst, rt_src
 * and rt_gateway come from @fl.  res->type is overwritten here for limited
 * broadcast and multicast destinations, and res->fi may be released and
 * cleared for broadcast and (for short prefixes) multicast routes.
 *
 * On success the entry, created with a reference count of one, is stored in
 * *result and 0 is returned.  Returns -EINVAL for a loopback source on a
 * non-loopback device, for a BADCLASS/ZERONET destination, or when @dev_out
 * has no in_device; returns -ENOBUFS when dst_alloc() fails.
 */
1984 static inline int __mkroute_output(struct rtable **result,
1985                                    struct fib_result* res,
1986                                    const struct flowi *fl,
1987                                    const struct flowi *oldflp,
1988                                    struct net_device *dev_out,
1989                                    unsigned flags)
1990 {
1991         struct rtable *rth;
1992         struct in_device *in_dev;
1993         u32 tos = RT_FL_TOS(oldflp);
1994         int err = 0;
1995
1996         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
1997                 return -EINVAL;
1998
             /* Classify by destination; other destinations keep the caller's res->type. */
1999         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2000                 res->type = RTN_BROADCAST;
2001         else if (MULTICAST(fl->fl4_dst))
2002                 res->type = RTN_MULTICAST;
2003         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2004                 return -EINVAL;
2005
2006         if (dev_out->flags & IFF_LOOPBACK)
2007                 flags |= RTCF_LOCAL;
2008
2009         /* get work reference to inet device */
2010         in_dev = in_dev_get(dev_out);
2011         if (!in_dev)
2012                 return -EINVAL;
2013
2014         if (res->type == RTN_BROADCAST) {
2015                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
                     /* Broadcast routes do not keep the fib_info. */
2016                 if (res->fi) {
2017                         fib_info_put(res->fi);
2018                         res->fi = NULL;
2019                 }
2020         } else if (res->type == RTN_MULTICAST) {
2021                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
                     /* Keep RTCF_LOCAL only if ip_check_mc() accepts the group. */
2022                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2023                                  oldflp->proto))
2024                         flags &= ~RTCF_LOCAL;
2025                 /* If multicast route do not exist use
2026                    default one, but do not gateway in this case.
2027                    Yes, it is hack.
2028                  */
2029                 if (res->fi && res->prefixlen < 4) {
2030                         fib_info_put(res->fi);
2031                         res->fi = NULL;
2032                 }
2033         }
2034
2035
2036         rth = dst_alloc(&ipv4_dst_ops);
2037         if (!rth) {
2038                 err = -ENOBUFS;
2039                 goto cleanup;
2040         }
2041
             /* The new entry starts with a single reference. */
2042         atomic_set(&rth->u.dst.__refcnt, 1);
2043         rth->u.dst.flags= DST_HOST;
2044         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2045                 rth->u.dst.flags |= DST_NOXFRM;
2046         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2047                 rth->u.dst.flags |= DST_NOPOLICY;
2048
             /* The lookup key mirrors the caller's original flow ... */
2049         rth->fl.fl4_dst = oldflp->fl4_dst;
2050         rth->fl.fl4_tos = tos;
2051         rth->fl.fl4_src = oldflp->fl4_src;
2052         rth->fl.oif     = oldflp->oif;
2053         rth->fl.mark    = oldflp->mark;
             /* ... while the route itself records the resolved addresses. */
2054         rth->rt_dst     = fl->fl4_dst;
2055         rth->rt_src     = fl->fl4_src;
2056         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2057         /* get references to the devices that are to be hold by the routing
2058            cache entry */
2059         rth->u.dst.dev  = dev_out;
2060         dev_hold(dev_out);
2061         rth->idev       = in_dev_get(dev_out);
2062         rth->rt_gateway = fl->fl4_dst;
2063         rth->rt_spec_dst= fl->fl4_src;
2064
2065         rth->u.dst.output=ip_output;
2066
2067         RT_CACHE_STAT_INC(out_slow_tot);
2068
             /* Local routes are also delivered locally on input. */
2069         if (flags & RTCF_LOCAL) {
2070                 rth->u.dst.input = ip_local_deliver;
2071                 rth->rt_spec_dst = fl->fl4_dst;
2072         }
2073         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2074                 rth->rt_spec_dst = fl->fl4_src;
                     /* Local broadcast/multicast on a real device is sent via ip_mc_output. */
2075                 if (flags & RTCF_LOCAL &&
2076                     !(dev_out->flags & IFF_LOOPBACK)) {
2077                         rth->u.dst.output = ip_mc_output;
2078                         RT_CACHE_STAT_INC(out_slow_mc);
2079                 }
2080 #ifdef CONFIG_IP_MROUTE
2081                 if (res->type == RTN_MULTICAST) {
2082                         if (IN_DEV_MFORWARD(in_dev) &&
2083                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2084                                 rth->u.dst.input = ip_mr_input;
2085                                 rth->u.dst.output = ip_mc_output;
2086                         }
2087                 }
2088 #endif
2089         }
2090
2091         rt_set_nexthop(rth, res, 0);
2092
2093         rth->rt_flags = flags;
2094
2095         *result = rth;
             /* Reached on success (err == 0) and on allocation failure. */
2096  cleanup:
2097         /* release work reference to inet device */
2098         in_dev_put(in_dev);
2099
2100         return err;
2101 }
2102
2103 static inline int ip_mkroute_output(struct rtable **rp,
2104                                     struct fib_result* res,
2105                                     const struct flowi *fl,
2106                                     const struct flowi *oldflp,
2107                                     struct net_device *dev_out,
2108                                     unsigned flags)
2109 {
2110         struct rtable *rth = NULL;
2111         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2112         unsigned hash;
2113         if (err == 0) {
2114                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2115                 err = rt_intern_hash(hash, rth, rp);
2116         }
2117
2118         return err;
2119 }
2120
2121 /*
2122  * Major route resolver routine.
 *
 * Resolves an output route for the flow key *oldflp (which is never
 * modified) by refining a working copy, fl, choosing an output device and
 * handing both to ip_mkroute_output(), which is given @rp as the place to
 * store the route.
 *
 * Returns 0 on success or a negative errno: -EINVAL (bad or non-local
 * source address), -ENODEV (unknown output interface, or one without an
 * in_device), -ENETUNREACH (FIB lookup failed and no output interface was
 * requested), or whatever ip_mkroute_output() returns.
 *
 * dev_out, when non-NULL at make_route, carries a device reference that is
 * dropped before returning.
2123  */
2124
2125 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2126 {
2127         u32 tos = RT_FL_TOS(oldflp);
2128         struct flowi fl = { .nl_u = { .ip4_u =
2129                                       { .daddr = oldflp->fl4_dst,
2130                                         .saddr = oldflp->fl4_src,
2131                                         .tos = tos & IPTOS_RT_MASK,
2132                                         .scope = ((tos & RTO_ONLINK) ?
2133                                                   RT_SCOPE_LINK :
2134                                                   RT_SCOPE_UNIVERSE),
2135                                       } },
2136                             .mark = oldflp->mark,
2137                             .iif = init_net.loopback_dev->ifindex,
2138                             .oif = oldflp->oif };
2139         struct fib_result res;
2140         unsigned flags = 0;
2141         struct net_device *dev_out = NULL;
2142         int free_res = 0;
2143         int err;
2144
2145
             /*
              * Only fi (and r) are cleared; res.type is left unset here.  The
              * paths that jump to make_route without setting it do so only
              * for limited-broadcast or multicast destinations, for which
              * __mkroute_output() assigns res->type itself.
              * NOTE(review): this relies on LOCAL_MCAST() implying
              * MULTICAST() - confirm.
              */
2146         res.fi          = NULL;
2147 #ifdef CONFIG_IP_MULTIPLE_TABLES
2148         res.r           = NULL;
2149 #endif
2150
             /* Step 1: a caller-supplied source address must be sane and local. */
2151         if (oldflp->fl4_src) {
2152                 err = -EINVAL;
2153                 if (MULTICAST(oldflp->fl4_src) ||
2154                     BADCLASS(oldflp->fl4_src) ||
2155                     ZERONET(oldflp->fl4_src))
2156                         goto out;
2157
2158                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2159                 dev_out = ip_dev_find(oldflp->fl4_src);
2160                 if (dev_out == NULL)
2161                         goto out;
2162
2163                 /* I removed check for oif == dev_out->oif here.
2164                    It was wrong for two reasons:
2165                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2166                       assigned to multiple interfaces.
2167                    2. Moreover, we are allowed to send packets with saddr
2168                       of another iface. --ANK
2169                  */
2170
2171                 if (oldflp->oif == 0
2172                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2173                         /* Special hack: user can direct multicasts
2174                            and limited broadcast via necessary interface
2175                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2176                            This hack is not just for fun, it allows
2177                            vic,vat and friends to work.
2178                            They bind socket to loopback, set ttl to zero
2179                            and expect that it will work.
2180                            From the viewpoint of routing cache they are broken,
2181                            because we are not allowed to build multicast path
2182                            with loopback source addr (look, routing cache
2183                            cannot know, that ttl is zero, so that packet
2184                            will not leave this host and route is valid).
2185                            Luckily, this hack is good workaround.
2186                          */
2187
2188                         fl.oif = dev_out->ifindex;
2189                         goto make_route;
2190                 }
                     /*
                      * The device was only needed for the check above.  dev_out
                      * is known to be non-NULL here, so the test is always true.
                      */
2191                 if (dev_out)
2192                         dev_put(dev_out);
2193                 dev_out = NULL;
2194         }
2195
2196
             /* Step 2: a caller-supplied output interface pins the device. */
2197         if (oldflp->oif) {
2198                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2199                 err = -ENODEV;
2200                 if (dev_out == NULL)
2201                         goto out;
2202
2203                 /* RACE: Check return value of inet_select_addr instead. */
2204                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2205                         dev_put(dev_out);
2206                         goto out;       /* Wrong error code */
2207                 }
2208
                     /* Link-local multicast and limited broadcast skip the FIB lookup. */
2209                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2210                         if (!fl.fl4_src)
2211                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2212                                                               RT_SCOPE_LINK);
2213                         goto make_route;
2214                 }
2215                 if (!fl.fl4_src) {
2216                         if (MULTICAST(oldflp->fl4_dst))
2217                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2218                                                               fl.fl4_scope);
2219                         else if (!oldflp->fl4_dst)
2220                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2221                                                               RT_SCOPE_HOST);
2222                 }
2223         }
2224
             /*
              * Step 3: no destination given.  Route to the source address (or
              * to INADDR_LOOPBACK when there is none either) over loopback.
              */
2225         if (!fl.fl4_dst) {
2226                 fl.fl4_dst = fl.fl4_src;
2227                 if (!fl.fl4_dst)
2228                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2229                 if (dev_out)
2230                         dev_put(dev_out);
2231                 dev_out = init_net.loopback_dev;
2232                 dev_hold(dev_out);
2233                 fl.oif = init_net.loopback_dev->ifindex;
2234                 res.type = RTN_LOCAL;
2235                 flags |= RTCF_LOCAL;
2236                 goto make_route;
2237         }
2238
             /* Step 4: consult the FIB. */
2239         if (fib_lookup(&fl, &res)) {
2240                 res.fi = NULL;
2241                 if (oldflp->oif) {
2242                         /* Apparently, routing tables are wrong. Assume,
2243                            that the destination is on link.
2244
2245                            WHY? DW.
2246                            Because we are allowed to send to iface
2247                            even if it has NO routes and NO assigned
2248                            addresses. When oif is specified, routing
2249                            tables are looked up with only one purpose:
2250                            to catch if destination is gatewayed, rather than
2251                            direct. Moreover, if MSG_DONTROUTE is set,
2252                            we send packet, ignoring both routing tables
2253                            and ifaddr state. --ANK
2254
2255
2256                            We could make it even if oif is unknown,
2257                            likely IPv6, but we do not.
2258                          */
2259
2260                         if (fl.fl4_src == 0)
2261                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2262                                                               RT_SCOPE_LINK);
2263                         res.type = RTN_UNICAST;
2264                         goto make_route;
2265                 }
2266                 if (dev_out)
2267                         dev_put(dev_out);
2268                 err = -ENETUNREACH;
2269                 goto out;
2270         }
             /* Lookup succeeded: res must be released after make_route. */
2271         free_res = 1;
2272
             /* Local destination: switch to the loopback device and drop the fib_info. */
2273         if (res.type == RTN_LOCAL) {
2274                 if (!fl.fl4_src)
2275                         fl.fl4_src = fl.fl4_dst;
2276                 if (dev_out)
2277                         dev_put(dev_out);
2278                 dev_out = init_net.loopback_dev;
2279                 dev_hold(dev_out);
2280                 fl.oif = dev_out->ifindex;
2281                 if (res.fi)
2282                         fib_info_put(res.fi);
2283                 res.fi = NULL;
2284                 flags |= RTCF_LOCAL;
2285                 goto make_route;
2286         }
2287
2288 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2289         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2290                 fib_select_multipath(&fl, &res);
2291         else
2292 #endif
2293         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2294                 fib_select_default(&fl, &res);
2295
2296         if (!fl.fl4_src)
2297                 fl.fl4_src = FIB_RES_PREFSRC(res);
2298
             /* The FIB result decides the output device from here on. */
2299         if (dev_out)
2300                 dev_put(dev_out);
2301         dev_out = FIB_RES_DEV(res);
2302         dev_hold(dev_out);
2303         fl.oif = dev_out->ifindex;
2304
2305
2306 make_route:
2307         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2308
2309
2310         if (free_res)
2311                 fib_res_put(&res);
2312         if (dev_out)
2313                 dev_put(dev_out);
2314 out:    return err;
2315 }
2316
2317 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2318 {
2319         unsigned hash;
2320         struct rtable *rth;
2321
2322         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2323
2324         rcu_read_lock_bh();
2325         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2326                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2327                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2328                     rth->fl.fl4_src == flp->fl4_src &&
2329                     rth->fl.iif == 0 &&
2330                     rth->fl.oif == flp->oif &&
2331                     rth->fl.mark == flp->mark &&
2332                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2333                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2334                         rth->u.dst.lastuse = jiffies;
2335                         dst_hold(&rth->u.dst);
2336                         rth->u.dst.__use++;
2337                         RT_CACHE_STAT_INC(out_hit);
2338                         rcu_read_unlock_bh();
2339                         *rp = rth;
2340                         return 0;
2341                 }
2342                 RT_CACHE_STAT_INC(out_hlist_search);
2343         }
2344         rcu_read_unlock_bh();
2345
2346         return ip_route_output_slow(rp, flp);
2347 }
2348
2349 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2350
/*
 * update_pmtu hook for blackhole dst entries: intentionally does nothing,
 * so the @mtu passed in is ignored.
 */
2351 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2352 {
2353 }
2354
/*
 * dst_ops used for the blackhole entries allocated by ipv4_dst_blackhole().
 * Destruction and validity checking reuse the regular IPv4 handlers; only
 * update_pmtu is replaced, by a no-op.
 */
2355 static struct dst_ops ipv4_dst_blackhole_ops = {
2356         .family                 =       AF_INET,
2357         .protocol               =       __constant_htons(ETH_P_IP),
2358         .destroy                =       ipv4_dst_destroy,
2359         .check                  =       ipv4_dst_check,
2360         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2361         .entry_size             =       sizeof(struct rtable),
2362 };
2363
2364
/*
 * Packet handler for blackhole routes: frees @skb and reports success (0),
 * i.e. the packet is silently discarded.
 */
2365 static int ipv4_blackhole_output(struct sk_buff *skb)
2366 {
2367         kfree_skb(skb);
2368         return 0;
2369 }
2370
2371 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2372 {
2373         struct rtable *ort = *rp;
2374         struct rtable *rt = (struct rtable *)
2375                 dst_alloc(&ipv4_dst_blackhole_ops);
2376
2377         if (rt) {
2378                 struct dst_entry *new = &rt->u.dst;
2379
2380                 atomic_set(&new->__refcnt, 1);
2381                 new->__use = 1;
2382                 new->input = ipv4_blackhole_output;
2383                 new->output = ipv4_blackhole_output;
2384                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2385
2386                 new->dev = ort->u.dst.dev;
2387                 if (new->dev)
2388                         dev_hold(new->dev);
2389
2390                 rt->fl = ort->fl;
2391
2392                 rt->idev = ort->idev;
2393                 if (rt->idev)
2394                         in_dev_hold(rt->idev);
2395                 rt->rt_flags = ort->rt_flags;
2396                 rt->rt_type = ort->rt_type;
2397                 rt->rt_dst = ort->rt_dst;
2398                 rt->rt_src = ort->rt_src;
2399                 rt->rt_iif = ort->rt_iif;
2400                 rt->rt_gateway = ort->rt_gateway;
2401                 rt->rt_spec_dst = ort->rt_spec_dst;
2402                 rt->peer = ort->peer;
2403                 if (rt->peer)
2404                         atomic_inc(&rt->peer->refcnt);
2405
2406                 dst_free(new);
2407         }
2408
2409         dst_release(&(*rp)->u.dst);
2410         *rp = rt;
2411         return (rt ? 0 : -ENOMEM);
2412 }
2413
2414 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2415 {
2416         int err;
2417
2418         if ((err = __ip_route_output_key(rp, flp)) != 0)
2419                 return err;
2420
2421         if (flp->proto) {
2422                 if (!flp->fl4_src)
2423                         flp->fl4_src = (*rp)->rt_src;
2424                 if (!flp->fl4_dst)
2425                         flp->fl4_dst = (*rp)->rt_dst;
2426                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2427                 if (err == -EREMOTE)
2428                         err = ipv4_dst_blackhole(rp, flp, sk);
2429
2430                 return err;
2431         }
2432
2433         return 0;
2434 }
2435
2436 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2437
/*
 * Convenience wrapper around ip_route_output_flow() for callers that have
 * no socket and no lookup flags to pass.  Returns whatever
 * ip_route_output_flow() returns.
 */
int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
        struct sock *no_sock = NULL;

        return ip_route_output_flow(rp, flp, no_sock, 0);
}
2442
2443 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2444                         int nowait, unsigned int flags)
2445 {
2446         struct rtable *rt = (struct rtable*)skb->dst;
2447         struct rtmsg *r;
2448         struct nlmsghdr *nlh;
2449         long expires;
2450         u32 id = 0, ts = 0, tsage = 0, error;
2451
2452         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2453         if (nlh == NULL)
2454                 return -EMSGSIZE;
2455
2456         r = nlmsg_data(nlh);
2457         r->rtm_family    = AF_INET;
2458         r->rtm_dst_len  = 32;
2459         r->rtm_src_len  = 0;
2460         r->rtm_tos      = rt->fl.fl4_tos;
2461         r->rtm_table    = RT_TABLE_MAIN;
2462         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2463         r->rtm_type     = rt->rt_type;
2464         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2465         r->rtm_protocol = RTPROT_UNSPEC;
2466         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2467         if (rt->rt_flags & RTCF_NOTIFY)
2468                 r->rtm_flags |= RTM_F_NOTIFY;
2469
2470         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2471
2472         if (rt->fl.fl4_src) {
2473                 r->rtm_src_len = 32;
2474                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2475         }
2476         if (rt->u.dst.dev)
2477                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2478 #ifdef CONFIG_NET_CLS_ROUTE
2479         if (rt->u.dst.tclassid)
2480                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2481 #endif
2482         if (rt->fl.iif)
2483                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2484         else if (rt->rt_src != rt->fl.fl4_src)
2485                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2486
2487         if (rt->rt_dst != rt->rt_gateway)
2488                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2489
2490         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2491                 goto nla_put_failure;
2492
2493         error = rt->u.dst.error;
2494         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2495         if (rt->peer) {
2496                 id = rt->peer->ip_id_count;
2497                 if (rt->peer->tcp_ts_stamp) {
2498                         ts = rt->peer->tcp_ts;
2499                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2500                 }
2501         }
2502
2503         if (rt->fl.iif) {
2504 #ifdef CONFIG_IP_MROUTE
2505                 __be32 dst = rt->rt_dst;
2506
2507                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2508                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2509                         int err = ipmr_get_route(skb, r, nowait);
2510                         if (err <= 0) {
2511                                 if (!nowait) {
2512                                         if (err == 0)
2513                                                 return 0;
2514                                         goto nla_put_failure;
2515                                 } else {
2516                                         if (err == -EMSGSIZE)
2517                                                 goto nla_put_failure;
2518                                         error = err;
2519                                 }
2520                         }
2521                 } else
2522 #endif
2523                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2524         }
2525
2526         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2527                                expires, error) < 0)
2528                 goto nla_put_failure;
2529
2530         return nlmsg_end(skb, nlh);
2531
2532 nla_put_failure:
2533         nlmsg_cancel(skb, nlh);
2534         return -EMSGSIZE;
2535 }
2536
2537 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2538 {
2539         struct rtmsg *rtm;
2540         struct nlattr *tb[RTA_MAX+1];
2541         struct rtable *rt = NULL;
2542         __be32 dst = 0;
2543         __be32 src = 0;
2544         u32 iif;
2545         int err;
2546         struct sk_buff *skb;
2547
2548         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2549         if (err < 0)
2550                 goto errout;
2551
2552         rtm = nlmsg_data(nlh);
2553
2554         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2555         if (skb == NULL) {
2556                 err = -ENOBUFS;
2557                 goto errout;
2558         }
2559
2560         /* Reserve room for dummy headers, this skb can pass
2561            through good chunk of routing engine.
2562          */
2563         skb_reset_mac_header(skb);
2564         skb_reset_network_header(skb);
2565
2566         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2567         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2568         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2569
2570         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2571         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2572         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2573
2574         if (iif) {
2575                 struct net_device *dev;
2576
2577                 dev = __dev_get_by_index(&init_net, iif);
2578                 if (dev == NULL) {
2579                         err = -ENODEV;
2580                         goto errout_free;
2581                 }
2582
2583                 skb->protocol   = htons(ETH_P_IP);
2584                 skb->dev        = dev;
2585                 local_bh_disable();
2586                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2587                 local_bh_enable();
2588
2589                 rt = (struct rtable*) skb->dst;
2590                 if (err == 0 && rt->u.dst.error)
2591                         err = -rt->u.dst.error;
2592         } else {
2593                 struct flowi fl = {
2594                         .nl_u = {
2595                                 .ip4_u = {
2596                                         .daddr = dst,
2597                                         .saddr = src,
2598                                         .tos = rtm->rtm_tos,
2599                                 },
2600                         },
2601                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2602                 };
2603                 err = ip_route_output_key(&rt, &fl);
2604         }
2605
2606         if (err)
2607                 goto errout_free;
2608
2609         skb->dst = &rt->u.dst;
2610         if (rtm->rtm_flags & RTM_F_NOTIFY)
2611                 rt->rt_flags |= RTCF_NOTIFY;
2612
2613         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2614                                 RTM_NEWROUTE, 0, 0);
2615         if (err <= 0)
2616                 goto errout_free;
2617
2618         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2619 errout:
2620         return err;
2621
2622 errout_free:
2623         kfree_skb(skb);
2624         goto errout;
2625 }
2626
2627 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2628 {
2629         struct rtable *rt;
2630         int h, s_h;
2631         int idx, s_idx;
2632
2633         s_h = cb->args[0];
2634         s_idx = idx = cb->args[1];
2635         for (h = 0; h <= rt_hash_mask; h++) {
2636                 if (h < s_h) continue;
2637                 if (h > s_h)
2638                         s_idx = 0;
2639                 rcu_read_lock_bh();
2640                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2641                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2642                         if (idx < s_idx)
2643                                 continue;
2644                         skb->dst = dst_clone(&rt->u.dst);
2645                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2646                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2647                                          1, NLM_F_MULTI) <= 0) {
2648                                 dst_release(xchg(&skb->dst, NULL));
2649                                 rcu_read_unlock_bh();
2650                                 goto done;
2651                         }
2652                         dst_release(xchg(&skb->dst, NULL));
2653                 }
2654                 rcu_read_unlock_bh();
2655         }
2656
2657 done:
2658         cb->args[0] = h;
2659         cb->args[1] = idx;
2660         return skb->len;
2661 }
2662
/*
 * Multicast event hook: flushes the routing cache with a delay argument
 * of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        const int delay = 0;

        rt_cache_flush(delay);
}
2667
2668 #ifdef CONFIG_SYSCTL
2669 static int flush_delay;
2670
2671 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2672                                         struct file *filp, void __user *buffer,
2673                                         size_t *lenp, loff_t *ppos)
2674 {
2675         if (write) {
2676                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2677                 rt_cache_flush(flush_delay);
2678                 return 0;
2679         }
2680
2681         return -EINVAL;
2682 }
2683
2684 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2685                                                 int __user *name,
2686                                                 int nlen,
2687                                                 void __user *oldval,
2688                                                 size_t __user *oldlenp,
2689                                                 void __user *newval,
2690                                                 size_t newlen)
2691 {
2692         int delay;
2693         if (newlen != sizeof(int))
2694                 return -EINVAL;
2695         if (get_user(delay, (int __user *)newval))
2696                 return -EFAULT;
2697         rt_cache_flush(delay);
2698         return 0;
2699 }
2700
2701 ctl_table ipv4_route_table[] = {
2702         {
2703                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2704                 .procname       = "flush",
2705                 .data           = &flush_delay,
2706                 .maxlen         = sizeof(int),
2707                 .mode           = 0200,
2708                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2709                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2710         },
2711         {
2712                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2713                 .procname       = "min_delay",
2714                 .data           = &ip_rt_min_delay,
2715                 .maxlen         = sizeof(int),
2716                 .mode           = 0644,
2717                 .proc_handler   = &proc_dointvec_jiffies,
2718                 .strategy       = &sysctl_jiffies,
2719         },
2720         {
2721                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2722                 .procname       = "max_delay",
2723                 .data           = &ip_rt_max_delay,
2724                 .maxlen         = sizeof(int),
2725                 .mode           = 0644,
2726                 .proc_handler   = &proc_dointvec_jiffies,
2727                 .strategy       = &sysctl_jiffies,
2728         },
2729         {
2730                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2731                 .procname       = "gc_thresh",
2732                 .data           = &ipv4_dst_ops.gc_thresh,
2733                 .maxlen         = sizeof(int),
2734                 .mode           = 0644,
2735                 .proc_handler   = &proc_dointvec,
2736         },
2737         {
2738                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2739                 .procname       = "max_size",
2740                 .data           = &ip_rt_max_size,
2741                 .maxlen         = sizeof(int),
2742                 .mode           = 0644,
2743                 .proc_handler   = &proc_dointvec,
2744         },
2745         {
2746                 /*  Deprecated. Use gc_min_interval_ms */
2747
2748                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2749                 .procname       = "gc_min_interval",
2750                 .data           = &ip_rt_gc_min_interval,
2751                 .maxlen         = sizeof(int),
2752                 .mode           = 0644,
2753                 .proc_handler   = &proc_dointvec_jiffies,
2754                 .strategy       = &sysctl_jiffies,
2755         },
2756         {
2757                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2758                 .procname       = "gc_min_interval_ms",
2759                 .data           = &ip_rt_gc_min_interval,
2760                 .maxlen         = sizeof(int),
2761                 .mode           = 0644,
2762                 .proc_handler   = &proc_dointvec_ms_jiffies,
2763                 .strategy       = &sysctl_ms_jiffies,
2764         },
2765         {
2766                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2767                 .procname       = "gc_timeout",
2768                 .data           = &ip_rt_gc_timeout,
2769                 .maxlen         = sizeof(int),
2770                 .mode           = 0644,
2771                 .proc_handler   = &proc_dointvec_jiffies,
2772                 .strategy       = &sysctl_jiffies,
2773         },
2774         {
2775                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2776                 .procname       = "gc_interval",
2777                 .data           = &ip_rt_gc_interval,
2778                 .maxlen         = sizeof(int),
2779                 .mode           = 0644,
2780                 .proc_handler   = &proc_dointvec_jiffies,
2781                 .strategy       = &sysctl_jiffies,
2782         },
2783         {
2784                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2785                 .procname       = "redirect_load",
2786                 .data           = &ip_rt_redirect_load,
2787                 .maxlen         = sizeof(int),
2788                 .mode           = 0644,
2789                 .proc_handler   = &proc_dointvec,
2790         },
2791         {
2792                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2793                 .procname       = "redirect_number",
2794                 .data           = &ip_rt_redirect_number,
2795                 .maxlen         = sizeof(int),
2796                 .mode           = 0644,
2797                 .proc_handler   = &proc_dointvec,
2798         },
2799         {
2800                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2801                 .procname       = "redirect_silence",
2802                 .data           = &ip_rt_redirect_silence,
2803                 .maxlen         = sizeof(int),
2804                 .mode           = 0644,
2805                 .proc_handler   = &proc_dointvec,
2806         },
2807         {
2808                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2809                 .procname       = "error_cost",
2810                 .data           = &ip_rt_error_cost,
2811                 .maxlen         = sizeof(int),
2812                 .mode           = 0644,
2813                 .proc_handler   = &proc_dointvec,
2814         },
2815         {
2816                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2817                 .procname       = "error_burst",
2818                 .data           = &ip_rt_error_burst,
2819                 .maxlen         = sizeof(int),
2820                 .mode           = 0644,
2821                 .proc_handler   = &proc_dointvec,
2822         },
2823         {
2824                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2825                 .procname       = "gc_elasticity",
2826                 .data           = &ip_rt_gc_elasticity,
2827                 .maxlen         = sizeof(int),
2828                 .mode           = 0644,
2829                 .proc_handler   = &proc_dointvec,
2830         },
2831         {
2832                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2833                 .procname       = "mtu_expires",
2834                 .data           = &ip_rt_mtu_expires,
2835                 .maxlen         = sizeof(int),
2836                 .mode           = 0644,
2837                 .proc_handler   = &proc_dointvec_jiffies,
2838                 .strategy       = &sysctl_jiffies,
2839         },
2840         {
2841                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2842                 .procname       = "min_pmtu",
2843                 .data           = &ip_rt_min_pmtu,
2844                 .maxlen         = sizeof(int),
2845                 .mode           = 0644,
2846                 .proc_handler   = &proc_dointvec,
2847         },
2848         {
2849                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2850                 .procname       = "min_adv_mss",
2851                 .data           = &ip_rt_min_advmss,
2852                 .maxlen         = sizeof(int),
2853                 .mode           = 0644,
2854                 .proc_handler   = &proc_dointvec,
2855         },
2856         {
2857                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2858                 .procname       = "secret_interval",
2859                 .data           = &ip_rt_secret_interval,
2860                 .maxlen         = sizeof(int),
2861                 .mode           = 0644,
2862                 .proc_handler   = &proc_dointvec_jiffies,
2863                 .strategy       = &sysctl_jiffies,
2864         },
2865         { .ctl_name = 0 }
2866 };
2867 #endif
2868
2869 #ifdef CONFIG_NET_CLS_ROUTE
2870 struct ip_rt_acct *ip_rt_acct;
2871
2872 /* This code sucks.  But you should have seen it before! --RR */
2873
/*
 * IP route accounting ptr for this logical cpu number: each CPU owns a
 * run of 256 consecutive entries in ip_rt_acct.  The argument is
 * parenthesised so that expressions such as "cpu + 1" index correctly.
 */
#define IP_RT_ACCT_CPU(i) (ip_rt_acct + (i) * 256)
2876
2877 #ifdef CONFIG_PROC_FS
2878 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
2879                            int length, int *eof, void *data)
2880 {
2881         unsigned int i;
2882
2883         if ((offset & 3) || (length & 3))
2884                 return -EIO;
2885
2886         if (offset >= sizeof(struct ip_rt_acct) * 256) {
2887                 *eof = 1;
2888                 return 0;
2889         }
2890
2891         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
2892                 length = sizeof(struct ip_rt_acct) * 256 - offset;
2893                 *eof = 1;
2894         }
2895
2896         offset /= sizeof(u32);
2897
2898         if (length > 0) {
2899                 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
2900                 u32 *dst = (u32 *) buffer;
2901
2902                 /* Copy first cpu. */
2903                 *start = buffer;
2904                 memcpy(dst, src, length);
2905
2906                 /* Add the other cpus in, one int at a time */
2907                 for_each_possible_cpu(i) {
2908                         unsigned int j;
2909
2910                         src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
2911
2912                         for (j = 0; j < length/4; j++)
2913                                 dst[j] += src[j];
2914                 }
2915         }
2916         return length;
2917 }
2918 #endif /* CONFIG_PROC_FS */
2919 #endif /* CONFIG_NET_CLS_ROUTE */
2920
2921 static __initdata unsigned long rhash_entries;
2922 static int __init set_rhash_entries(char *str)
2923 {
2924         if (!str)
2925                 return 0;
2926         rhash_entries = simple_strtoul(str, &str, 0);
2927         return 1;
2928 }
2929 __setup("rhash_entries=", set_rhash_entries);
2930
2931 int __init ip_rt_init(void)
2932 {
2933         int rc = 0;
2934
2935         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2936                              (jiffies ^ (jiffies >> 7)));
2937
2938 #ifdef CONFIG_NET_CLS_ROUTE
2939         {
2940         int order;
2941         for (order = 0;
2942              (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
2943                 /* NOTHING */;
2944         ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
2945         if (!ip_rt_acct)
2946                 panic("IP: failed to allocate ip_rt_acct\n");
2947         memset(ip_rt_acct, 0, PAGE_SIZE << order);
2948         }
2949 #endif
2950
2951         ipv4_dst_ops.kmem_cachep =
2952                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2953                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2954
2955         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2956
2957         rt_hash_table = (struct rt_hash_bucket *)
2958                 alloc_large_system_hash("IP route cache",
2959                                         sizeof(struct rt_hash_bucket),
2960                                         rhash_entries,
2961                                         (num_physpages >= 128 * 1024) ?
2962                                         15 : 17,
2963                                         0,
2964                                         &rt_hash_log,
2965                                         &rt_hash_mask,
2966                                         0);
2967         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2968         rt_hash_lock_init();
2969
2970         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2971         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2972
2973         devinet_init();
2974         ip_fib_init();
2975
2976         init_timer(&rt_flush_timer);
2977         rt_flush_timer.function = rt_run_flush;
2978         init_timer(&rt_secret_timer);
2979         rt_secret_timer.function = rt_secret_rebuild;
2980
2981         /* All the timers, started at system startup tend
2982            to synchronize. Perturb it a bit.
2983          */
2984         schedule_delayed_work(&expires_work,
2985                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
2986
2987         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
2988                 ip_rt_secret_interval;
2989         add_timer(&rt_secret_timer);
2990
2991 #ifdef CONFIG_PROC_FS
2992         {
2993         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
2994         if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
2995             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
2996                                              init_net.proc_net_stat))) {
2997                 return -ENOMEM;
2998         }
2999         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3000         }
3001 #ifdef CONFIG_NET_CLS_ROUTE
3002         create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3003 #endif
3004 #endif
3005 #ifdef CONFIG_XFRM
3006         xfrm_init();
3007         xfrm4_init();
3008 #endif
3009         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3010
3011         return rc;
3012 }
3013
3014 EXPORT_SYMBOL(__ip_select_ident);
3015 EXPORT_SYMBOL(ip_route_input);
3016 EXPORT_SYMBOL(ip_route_output_key);