net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <net/dst.h>
  94 #include <net/net_namespace.h>
  95 #include <net/protocol.h>
  96 #include <net/ip.h>
  97 #include <net/route.h>
  98 #include <net/inetpeer.h>
  99 #include <net/sock.h>
 100 #include <net/ip_fib.h>
 101 #include <net/arp.h>
 102 #include <net/tcp.h>
 103 #include <net/icmp.h>
 104 #include <net/xfrm.h>
 105 #include <net/netevent.h>
 106 #include <net/rtnetlink.h>
 107 #ifdef CONFIG_SYSCTL
 108 #include <linux/sysctl.h>
 109 #endif
 110
 111 #define RT_FL_TOS(oldflp) \
 112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 113
 114 #define IP_MAX_MTU      0xFFF0
 115
 116 #define RT_GC_TIMEOUT (300*HZ)
 117
 118 static int ip_rt_max_size;
 119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
 121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 122 static int ip_rt_redirect_number __read_mostly  = 9;
 123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 125 static int ip_rt_error_cost __read_mostly       = HZ;
 126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 127 static int ip_rt_gc_elasticity __read_mostly    = 8;
 128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 130 static int ip_rt_min_advmss __read_mostly       = 256;
 131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
 132
 133 static void rt_worker_func(struct work_struct *work);
 134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 135
 136 /*
 137  *      Interface to generic destination cache.
 138  */
 139
 140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 141 static void              ipv4_dst_destroy(struct dst_entry *dst);
 142 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 143                                          struct net_device *dev, int how);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 147 static int rt_garbage_collect(struct dst_ops *ops);
 148
 149
 150 static struct dst_ops ipv4_dst_ops = {
 151         .family =               AF_INET,
 152         .protocol =             __constant_htons(ETH_P_IP),
 153         .gc =                   rt_garbage_collect,
 154         .check =                ipv4_dst_check,
 155         .destroy =              ipv4_dst_destroy,
 156         .ifdown =               ipv4_dst_ifdown,
 157         .negative_advice =      ipv4_negative_advice,
 158         .link_failure =         ipv4_link_failure,
 159         .update_pmtu =          ip_rt_update_pmtu,
 160         .local_out =            __ip_local_out,
 161         .entry_size =           sizeof(struct rtable),
 162         .entries =              ATOMIC_INIT(0),
 163 };
 164
 165 #define ECN_OR_COST(class)      TC_PRIO_##class
 166
 167 const __u8 ip_tos2prio[16] = {
 168         TC_PRIO_BESTEFFORT,
 169         ECN_OR_COST(FILLER),
 170         TC_PRIO_BESTEFFORT,
 171         ECN_OR_COST(BESTEFFORT),
 172         TC_PRIO_BULK,
 173         ECN_OR_COST(BULK),
 174         TC_PRIO_BULK,
 175         ECN_OR_COST(BULK),
 176         TC_PRIO_INTERACTIVE,
 177         ECN_OR_COST(INTERACTIVE),
 178         TC_PRIO_INTERACTIVE,
 179         ECN_OR_COST(INTERACTIVE),
 180         TC_PRIO_INTERACTIVE_BULK,
 181         ECN_OR_COST(INTERACTIVE_BULK),
 182         TC_PRIO_INTERACTIVE_BULK,
 183         ECN_OR_COST(INTERACTIVE_BULK)
 184 };
 185
 186
 187 /*
 188  * Route cache.
 189  */
 190
 191 /* The locking scheme is rather straight forward:
 192  *
 193  * 1) Read-Copy Update protects the buckets of the central route hash.
 194  * 2) Only writers remove entries, and they hold the lock
 195  *    as they look at rtable reference counts.
 196  * 3) Only readers acquire references to rtable entries,
 197  *    they do so with atomic increments and with the
 198  *    lock held.
 199  */
 200
 201 struct rt_hash_bucket {
 202         struct rtable   *chain;
 203 };
 204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 205         defined(CONFIG_PROVE_LOCKING)
 206 /*
 207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 208  * The size of this table is a power of two and depends on the number of CPUS.
 209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 210  */
 211 #ifdef CONFIG_LOCKDEP
 212 # define RT_HASH_LOCK_SZ        256
 213 #else
 214 # if NR_CPUS >= 32
 215 #  define RT_HASH_LOCK_SZ       4096
 216 # elif NR_CPUS >= 16
 217 #  define RT_HASH_LOCK_SZ       2048
 218 # elif NR_CPUS >= 8
 219 #  define RT_HASH_LOCK_SZ       1024
 220 # elif NR_CPUS >= 4
 221 #  define RT_HASH_LOCK_SZ       512
 222 # else
 223 #  define RT_HASH_LOCK_SZ       256
 224 # endif
 225 #endif
 226
 227 static spinlock_t       *rt_hash_locks;
 228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 229
 230 static __init void rt_hash_lock_init(void)
 231 {
 232         int i;
 233
 234         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 235                         GFP_KERNEL);
 236         if (!rt_hash_locks)
 237                 panic("IP: failed to allocate rt_hash_locks\n");
 238
 239         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 240                 spin_lock_init(&rt_hash_locks[i]);
 241 }
 242 #else
 243 # define rt_hash_lock_addr(slot) NULL
 244
 245 static inline void rt_hash_lock_init(void)
 246 {
 247 }
 248 #endif
 249
 250 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 251 static unsigned                 rt_hash_mask __read_mostly;
 252 static unsigned int             rt_hash_log  __read_mostly;
 253 static atomic_t                 rt_genid __read_mostly;
 254
 255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 256 #define RT_CACHE_STAT_INC(field) \
 257         (__raw_get_cpu_var(rt_cache_stat).field++)
 258
 259 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 260                 int genid)
 261 {
 262         return jhash_3words((__force u32)(__be32)(daddr),
 263                             (__force u32)(__be32)(saddr),
 264                             idx, genid)
 265                 & rt_hash_mask;
 266 }
 267
 268 #ifdef CONFIG_PROC_FS
 269 struct rt_cache_iter_state {
 270         struct seq_net_private p;
 271         int bucket;
 272         int genid;
 273 };
 274
 275 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 276 {
 277         struct rt_cache_iter_state *st = seq->private;
 278         struct rtable *r = NULL;
 279
 280         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 281                 rcu_read_lock_bh();
 282                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
 283                 while (r) {
 284                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
 285                             r->rt_genid == st->genid)
 286                                 return r;
 287                         r = rcu_dereference(r->u.dst.rt_next);
 288                 }
 289                 rcu_read_unlock_bh();
 290         }
 291         return r;
 292 }
 293
 294 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 295                                           struct rtable *r)
 296 {
 297         struct rt_cache_iter_state *st = seq->private;
 298         r = r->u.dst.rt_next;
 299         while (!r) {
 300                 rcu_read_unlock_bh();
 301                 if (--st->bucket < 0)
 302                         break;
 303                 rcu_read_lock_bh();
 304                 r = rt_hash_table[st->bucket].chain;
 305         }
 306         return rcu_dereference(r);
 307 }
 308
 309 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 310                                         struct rtable *r)
 311 {
 312         struct rt_cache_iter_state *st = seq->private;
 313         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 314                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
 315                         continue;
 316                 if (r->rt_genid == st->genid)
 317                         break;
 318         }
 319         return r;
 320 }
 321
 322 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 323 {
 324         struct rtable *r = rt_cache_get_first(seq);
 325
 326         if (r)
 327                 while (pos && (r = rt_cache_get_next(seq, r)))
 328                         --pos;
 329         return pos ? NULL : r;
 330 }
 331
 332 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 333 {
 334         struct rt_cache_iter_state *st = seq->private;
 335         if (*pos)
 336                 return rt_cache_get_idx(seq, *pos - 1);
 337         st->genid = atomic_read(&rt_genid);
 338         return SEQ_START_TOKEN;
 339 }
 340
 341 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 342 {
 343         struct rtable *r;
 344
 345         if (v == SEQ_START_TOKEN)
 346                 r = rt_cache_get_first(seq);
 347         else
 348                 r = rt_cache_get_next(seq, v);
 349         ++*pos;
 350         return r;
 351 }
 352
 353 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 354 {
 355         if (v && v != SEQ_START_TOKEN)
 356                 rcu_read_unlock_bh();
 357 }
 358
 359 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 360 {
 361         if (v == SEQ_START_TOKEN)
 362                 seq_printf(seq, "%-127s\n",
 363                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 364                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 365                            "HHUptod\tSpecDst");
 366         else {
 367                 struct rtable *r = v;
 368                 int len;
 369
 370                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 371                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 372                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 373                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 374                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 375                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 376                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 377                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 378                         dst_metric(&r->u.dst, RTAX_WINDOW),
 379                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 380                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 381                         r->fl.fl4_tos,
 382                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 383                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 384                                        dev_queue_xmit) : 0,
 385                         r->rt_spec_dst, &len);
 386
 387                 seq_printf(seq, "%*s\n", 127 - len, "");
 388         }
 389         return 0;
 390 }
 391
 392 static const struct seq_operations rt_cache_seq_ops = {
 393         .start  = rt_cache_seq_start,
 394         .next   = rt_cache_seq_next,
 395         .stop   = rt_cache_seq_stop,
 396         .show   = rt_cache_seq_show,
 397 };
 398
 399 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 400 {
 401         return seq_open_net(inode, file, &rt_cache_seq_ops,
 402                         sizeof(struct rt_cache_iter_state));
 403 }
 404
 405 static const struct file_operations rt_cache_seq_fops = {
 406         .owner   = THIS_MODULE,
 407         .open    = rt_cache_seq_open,
 408         .read    = seq_read,
 409         .llseek  = seq_lseek,
 410         .release = seq_release_net,
 411 };
 412
 413
 414 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 415 {
 416         int cpu;
 417
 418         if (*pos == 0)
 419                 return SEQ_START_TOKEN;
 420
 421         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 422                 if (!cpu_possible(cpu))
 423                         continue;
 424                 *pos = cpu+1;
 425                 return &per_cpu(rt_cache_stat, cpu);
 426         }
 427         return NULL;
 428 }
 429
 430 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 431 {
 432         int cpu;
 433
 434         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 435                 if (!cpu_possible(cpu))
 436                         continue;
 437                 *pos = cpu+1;
 438                 return &per_cpu(rt_cache_stat, cpu);
 439         }
 440         return NULL;
 441
 442 }
 443
 444 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 445 {
 446
 447 }
 448
 449 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 450 {
 451         struct rt_cache_stat *st = v;
 452
 453         if (v == SEQ_START_TOKEN) {
 454                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 455                 return 0;
 456         }
 457
 458         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 459                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 460                    atomic_read(&ipv4_dst_ops.entries),
 461                    st->in_hit,
 462                    st->in_slow_tot,
 463                    st->in_slow_mc,
 464                    st->in_no_route,
 465                    st->in_brd,
 466                    st->in_martian_dst,
 467                    st->in_martian_src,
 468
 469                    st->out_hit,
 470                    st->out_slow_tot,
 471                    st->out_slow_mc,
 472
 473                    st->gc_total,
 474                    st->gc_ignored,
 475                    st->gc_goal_miss,
 476                    st->gc_dst_overflow,
 477                    st->in_hlist_search,
 478                    st->out_hlist_search
 479                 );
 480         return 0;
 481 }
 482
 483 static const struct seq_operations rt_cpu_seq_ops = {
 484         .start  = rt_cpu_seq_start,
 485         .next   = rt_cpu_seq_next,
 486         .stop   = rt_cpu_seq_stop,
 487         .show   = rt_cpu_seq_show,
 488 };
 489
 490
 491 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 492 {
 493         return seq_open(file, &rt_cpu_seq_ops);
 494 }
 495
 496 static const struct file_operations rt_cpu_seq_fops = {
 497         .owner   = THIS_MODULE,
 498         .open    = rt_cpu_seq_open,
 499         .read    = seq_read,
 500         .llseek  = seq_lseek,
 501         .release = seq_release,
 502 };
 503
 504 #ifdef CONFIG_NET_CLS_ROUTE
 505 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 506                            int length, int *eof, void *data)
 507 {
 508         unsigned int i;
 509
 510         if ((offset & 3) || (length & 3))
 511                 return -EIO;
 512
 513         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 514                 *eof = 1;
 515                 return 0;
 516         }
 517
 518         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 519                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 520                 *eof = 1;
 521         }
 522
 523         offset /= sizeof(u32);
 524
 525         if (length > 0) {
 526                 u32 *dst = (u32 *) buffer;
 527
 528                 *start = buffer;
 529                 memset(dst, 0, length);
 530
 531                 for_each_possible_cpu(i) {
 532                         unsigned int j;
 533                         u32 *src;
 534
 535                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 536                         for (j = 0; j < length/4; j++)
 537                                 dst[j] += src[j];
 538                 }
 539         }
 540         return length;
 541 }
 542 #endif
 543
 544 static int __net_init ip_rt_do_proc_init(struct net *net)
 545 {
 546         struct proc_dir_entry *pde;
 547
 548         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 549                         &rt_cache_seq_fops);
 550         if (!pde)
 551                 goto err1;
 552
 553         pde = proc_create("rt_cache", S_IRUGO,
 554                           net->proc_net_stat, &rt_cpu_seq_fops);
 555         if (!pde)
 556                 goto err2;
 557
 558 #ifdef CONFIG_NET_CLS_ROUTE
 559         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 560                         ip_rt_acct_read, NULL);
 561         if (!pde)
 562                 goto err3;
 563 #endif
 564         return 0;
 565
 566 #ifdef CONFIG_NET_CLS_ROUTE
 567 err3:
 568         remove_proc_entry("rt_cache", net->proc_net_stat);
 569 #endif
 570 err2:
 571         remove_proc_entry("rt_cache", net->proc_net);
 572 err1:
 573         return -ENOMEM;
 574 }
 575
 576 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 577 {
 578         remove_proc_entry("rt_cache", net->proc_net_stat);
 579         remove_proc_entry("rt_cache", net->proc_net);
 580         remove_proc_entry("rt_acct", net->proc_net);
 581 }
 582
 583 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 584         .init = ip_rt_do_proc_init,
 585         .exit = ip_rt_do_proc_exit,
 586 };
 587
 588 static int __init ip_rt_proc_init(void)
 589 {
 590         return register_pernet_subsys(&ip_rt_proc_ops);
 591 }
 592
 593 #else
 594 static inline int ip_rt_proc_init(void)
 595 {
 596         return 0;
 597 }
 598 #endif /* CONFIG_PROC_FS */
 599
 600 static inline void rt_free(struct rtable *rt)
 601 {
 602         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 603 }
 604
 605 static inline void rt_drop(struct rtable *rt)
 606 {
 607         ip_rt_put(rt);
 608         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 609 }
 610
 611 static inline int rt_fast_clean(struct rtable *rth)
 612 {
 613         /* Kill broadcast/multicast entries very aggresively, if they
 614            collide in hash table with more useful entries */
 615         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 616                 rth->fl.iif && rth->u.dst.rt_next;
 617 }
 618
 619 static inline int rt_valuable(struct rtable *rth)
 620 {
 621         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 622                 rth->u.dst.expires;
 623 }
 624
 625 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 626 {
 627         unsigned long age;
 628         int ret = 0;
 629
 630         if (atomic_read(&rth->u.dst.__refcnt))
 631                 goto out;
 632
 633         ret = 1;
 634         if (rth->u.dst.expires &&
 635             time_after_eq(jiffies, rth->u.dst.expires))
 636                 goto out;
 637
 638         age = jiffies - rth->u.dst.lastuse;
 639         ret = 0;
 640         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 641             (age <= tmo2 && rt_valuable(rth)))
 642                 goto out;
 643         ret = 1;
 644 out:    return ret;
 645 }
 646
 647 /* Bits of score are:
 648  * 31: very valuable
 649  * 30: not quite useless
 650  * 29..0: usage counter
 651  */
 652 static inline u32 rt_score(struct rtable *rt)
 653 {
 654         u32 score = jiffies - rt->u.dst.lastuse;
 655
 656         score = ~score & ~(3<<30);
 657
 658         if (rt_valuable(rt))
 659                 score |= (1<<31);
 660
 661         if (!rt->fl.iif ||
 662             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 663                 score |= (1<<30);
 664
 665         return score;
 666 }
 667
 668 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 669 {
 670         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 671                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 672                 (fl1->mark ^ fl2->mark) |
 673                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 674                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 675                 (fl1->oif ^ fl2->oif) |
 676                 (fl1->iif ^ fl2->iif)) == 0;
 677 }
 678
 679 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 680 {
 681         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
 682 }
 683
 684 /*
 685  * Perform a full scan of hash table and free all entries.
 686  * Can be called by a softirq or a process.
 687  * In the later case, we want to be reschedule if necessary
 688  */
 689 static void rt_do_flush(int process_context)
 690 {
 691         unsigned int i;
 692         struct rtable *rth, *next;
 693
 694         for (i = 0; i <= rt_hash_mask; i++) {
 695                 if (process_context && need_resched())
 696                         cond_resched();
 697                 rth = rt_hash_table[i].chain;
 698                 if (!rth)
 699                         continue;
 700
 701                 spin_lock_bh(rt_hash_lock_addr(i));
 702                 rth = rt_hash_table[i].chain;
 703                 rt_hash_table[i].chain = NULL;
 704                 spin_unlock_bh(rt_hash_lock_addr(i));
 705
 706                 for (; rth; rth = next) {
 707                         next = rth->u.dst.rt_next;
 708                         rt_free(rth);
 709                 }
 710         }
 711 }
 712
 713 static void rt_check_expire(void)
 714 {
 715         static unsigned int rover;
 716         unsigned int i = rover, goal;
 717         struct rtable *rth, **rthp;
 718         u64 mult;
 719
 720         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 721         if (ip_rt_gc_timeout > 1)
 722                 do_div(mult, ip_rt_gc_timeout);
 723         goal = (unsigned int)mult;
 724         if (goal > rt_hash_mask)
 725                 goal = rt_hash_mask + 1;
 726         for (; goal > 0; goal--) {
 727                 unsigned long tmo = ip_rt_gc_timeout;
 728
 729                 i = (i + 1) & rt_hash_mask;
 730                 rthp = &rt_hash_table[i].chain;
 731
 732                 if (need_resched())
 733                         cond_resched();
 734
 735                 if (*rthp == NULL)
 736                         continue;
 737                 spin_lock_bh(rt_hash_lock_addr(i));
 738                 while ((rth = *rthp) != NULL) {
 739                         if (rth->rt_genid != atomic_read(&rt_genid)) {
 740                                 *rthp = rth->u.dst.rt_next;
 741                                 rt_free(rth);
 742                                 continue;
 743                         }
 744                         if (rth->u.dst.expires) {
 745                                 /* Entry is expired even if it is in use */
 746                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 747                                         tmo >>= 1;
 748                                         rthp = &rth->u.dst.rt_next;
 749                                         continue;
 750                                 }
 751                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 752                                 tmo >>= 1;
 753                                 rthp = &rth->u.dst.rt_next;
 754                                 continue;
 755                         }
 756
 757                         /* Cleanup aged off entries. */
 758                         *rthp = rth->u.dst.rt_next;
 759                         rt_free(rth);
 760                 }
 761                 spin_unlock_bh(rt_hash_lock_addr(i));
 762         }
 763         rover = i;
 764 }
 765
 766 /*
 767  * rt_worker_func() is run in process context.
 768  * we call rt_check_expire() to scan part of the hash table
 769  */
 770 static void rt_worker_func(struct work_struct *work)
 771 {
 772         rt_check_expire();
 773         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 774 }
 775
 776 /*
 777  * Pertubation of rt_genid by a small quantity [1..256]
 778  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 779  * many times (2^24) without giving recent rt_genid.
 780  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 781  */
 782 static void rt_cache_invalidate(struct net *net)
 783 {
 784         unsigned char shuffle;
 785
 786         get_random_bytes(&shuffle, sizeof(shuffle));
 787         atomic_add(shuffle + 1U, &rt_genid);
 788 }
 789
 790 /*
 791  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 792  * delay >= 0 : invalidate & flush cache (can be long)
 793  */
 794 void rt_cache_flush(struct net *net, int delay)
 795 {
 796         rt_cache_invalidate(net);
 797         if (delay >= 0)
 798                 rt_do_flush(!in_softirq());
 799 }
 800
 801 /*
 802  * We change rt_genid and let gc do the cleanup
 803  */
 804 static void rt_secret_rebuild(unsigned long __net)
 805 {
 806         struct net *net = (struct net *)__net;
 807         rt_cache_invalidate(net);
 808         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 809 }
 810
 811 /*
 812    Short description of GC goals.
 813
 814    We want to build algorithm, which will keep routing cache
 815    at some equilibrium point, when number of aged off entries
 816    is kept approximately equal to newly generated ones.
 817
 818    Current expiration strength is variable "expire".
 819    We try to adjust it dynamically, so that if networking
 820    is idle expires is large enough to keep enough of warm entries,
 821    and when load increases it reduces to limit cache size.
 822  */
 823
 824 static int rt_garbage_collect(struct dst_ops *ops)
 825 {
 826         static unsigned long expire = RT_GC_TIMEOUT;
 827         static unsigned long last_gc;
 828         static int rover;
 829         static int equilibrium;
 830         struct rtable *rth, **rthp;
 831         unsigned long now = jiffies;
 832         int goal;
 833
 834         /*
 835          * Garbage collection is pretty expensive,
 836          * do not make it too frequently.
 837          */
 838
 839         RT_CACHE_STAT_INC(gc_total);
 840
 841         if (now - last_gc < ip_rt_gc_min_interval &&
 842             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 843                 RT_CACHE_STAT_INC(gc_ignored);
 844                 goto out;
 845         }
 846
 847         /* Calculate number of entries, which we want to expire now. */
 848         goal = atomic_read(&ipv4_dst_ops.entries) -
 849                 (ip_rt_gc_elasticity << rt_hash_log);
 850         if (goal <= 0) {
 851                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 852                         equilibrium = ipv4_dst_ops.gc_thresh;
 853                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 854                 if (goal > 0) {
 855                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 856                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 857                 }
 858         } else {
 859                 /* We are in dangerous area. Try to reduce cache really
 860                  * aggressively.
 861                  */
 862                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 863                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 864         }
 865
 866         if (now - last_gc >= ip_rt_gc_min_interval)
 867                 last_gc = now;
 868
 869         if (goal <= 0) {
 870                 equilibrium += goal;
 871                 goto work_done;
 872         }
 873
 874         do {
 875                 int i, k;
 876
 877                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 878                         unsigned long tmo = expire;
 879
 880                         k = (k + 1) & rt_hash_mask;
 881                         rthp = &rt_hash_table[k].chain;
 882                         spin_lock_bh(rt_hash_lock_addr(k));
 883                         while ((rth = *rthp) != NULL) {
 884                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
 885                                         !rt_may_expire(rth, tmo, expire)) {
 886                                         tmo >>= 1;
 887                                         rthp = &rth->u.dst.rt_next;
 888                                         continue;
 889                                 }
 890                                 *rthp = rth->u.dst.rt_next;
 891                                 rt_free(rth);
 892                                 goal--;
 893                         }
 894                         spin_unlock_bh(rt_hash_lock_addr(k));
 895                         if (goal <= 0)
 896                                 break;
 897                 }
 898                 rover = k;
 899
 900                 if (goal <= 0)
 901                         goto work_done;
 902
 903                 /* Goal is not achieved. We stop process if:
 904
 905                    - if expire reduced to zero. Otherwise, expire is halfed.
 906                    - if table is not full.
 907                    - if we are called from interrupt.
 908                    - jiffies check is just fallback/debug loop breaker.
 909                      We will not spin here for long time in any case.
 910                  */
 911
 912                 RT_CACHE_STAT_INC(gc_goal_miss);
 913
 914                 if (expire == 0)
 915                         break;
 916
 917                 expire >>= 1;
 918 #if RT_CACHE_DEBUG >= 2
 919                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 920                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 921 #endif
 922
 923                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 924                         goto out;
 925         } while (!in_softirq() && time_before_eq(jiffies, now));
 926
 927         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 928                 goto out;
 929         if (net_ratelimit())
 930                 printk(KERN_WARNING "dst cache overflow\n");
 931         RT_CACHE_STAT_INC(gc_dst_overflow);
 932         return 1;
 933
 934 work_done:
 935         expire += ip_rt_gc_min_interval;
 936         if (expire > ip_rt_gc_timeout ||
 937             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 938                 expire = ip_rt_gc_timeout;
 939 #if RT_CACHE_DEBUG >= 2
 940         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 941                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 942 #endif
 943 out:    return 0;
 944 }
 945
 946 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 947 {
 948         struct rtable   *rth, **rthp;
 949         unsigned long   now;
 950         struct rtable *cand, **candp;
 951         u32             min_score;
 952         int             chain_length;
 953         int attempts = !in_softirq();
 954
 955 restart:
 956         chain_length = 0;
 957         min_score = ~(u32)0;
 958         cand = NULL;
 959         candp = NULL;
 960         now = jiffies;
 961
 962         rthp = &rt_hash_table[hash].chain;
 963
 964         spin_lock_bh(rt_hash_lock_addr(hash));
 965         while ((rth = *rthp) != NULL) {
 966                 if (rth->rt_genid != atomic_read(&rt_genid)) {
 967                         *rthp = rth->u.dst.rt_next;
 968                         rt_free(rth);
 969                         continue;
 970                 }
 971                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 972                         /* Put it first */
 973                         *rthp = rth->u.dst.rt_next;
 974                         /*
 975                          * Since lookup is lockfree, the deletion
 976                          * must be visible to another weakly ordered CPU before
 977                          * the insertion at the start of the hash chain.
 978                          */
 979                         rcu_assign_pointer(rth->u.dst.rt_next,
 980                                            rt_hash_table[hash].chain);
 981                         /*
 982                          * Since lookup is lockfree, the update writes
 983                          * must be ordered for consistency on SMP.
 984                          */
 985                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 986
 987                         dst_use(&rth->u.dst, now);
 988                         spin_unlock_bh(rt_hash_lock_addr(hash));
 989
 990                         rt_drop(rt);
 991                         *rp = rth;
 992                         return 0;
 993                 }
 994
 995                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 996                         u32 score = rt_score(rth);
 997
 998                         if (score <= min_score) {
 999                                 cand = rth;
1000                                 candp = rthp;
1001                                 min_score = score;
1002                         }
1003                 }
1004
1005                 chain_length++;
1006
1007                 rthp = &rth->u.dst.rt_next;
1008         }
1009
1010         if (cand) {
1011                 /* ip_rt_gc_elasticity used to be average length of chain
1012                  * length, when exceeded gc becomes really aggressive.
1013                  *
1014                  * The second limit is less certain. At the moment it allows
1015                  * only 2 entries per bucket. We will see.
1016                  */
1017                 if (chain_length > ip_rt_gc_elasticity) {
1018                         *candp = cand->u.dst.rt_next;
1019                         rt_free(cand);
1020                 }
1021         }
1022
1023         /* Try to bind route to arp only if it is output
1024            route or unicast forwarding path.
1025          */
1026         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1027                 int err = arp_bind_neighbour(&rt->u.dst);
1028                 if (err) {
1029                         spin_unlock_bh(rt_hash_lock_addr(hash));
1030
1031                         if (err != -ENOBUFS) {
1032                                 rt_drop(rt);
1033                                 return err;
1034                         }
1035
1036                         /* Neighbour tables are full and nothing
1037                            can be released. Try to shrink route cache,
1038                            it is most likely it holds some neighbour records.
1039                          */
1040                         if (attempts-- > 0) {
1041                                 int saved_elasticity = ip_rt_gc_elasticity;
1042                                 int saved_int = ip_rt_gc_min_interval;
1043                                 ip_rt_gc_elasticity     = 1;
1044                                 ip_rt_gc_min_interval   = 0;
1045                                 rt_garbage_collect(&ipv4_dst_ops);
1046                                 ip_rt_gc_min_interval   = saved_int;
1047                                 ip_rt_gc_elasticity     = saved_elasticity;
1048                                 goto restart;
1049                         }
1050
1051                         if (net_ratelimit())
1052                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1053                         rt_drop(rt);
1054                         return -ENOBUFS;
1055                 }
1056         }
1057
1058         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1059 #if RT_CACHE_DEBUG >= 2
1060         if (rt->u.dst.rt_next) {
1061                 struct rtable *trt;
1062                 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1063                        NIPQUAD(rt->rt_dst));
1064                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1065                         printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1066                 printk("\n");
1067         }
1068 #endif
1069         rt_hash_table[hash].chain = rt;
1070         spin_unlock_bh(rt_hash_lock_addr(hash));
1071         *rp = rt;
1072         return 0;
1073 }
1074
1075 void rt_bind_peer(struct rtable *rt, int create)
1076 {
1077         static DEFINE_SPINLOCK(rt_peer_lock);
1078         struct inet_peer *peer;
1079
1080         peer = inet_getpeer(rt->rt_dst, create);
1081
1082         spin_lock_bh(&rt_peer_lock);
1083         if (rt->peer == NULL) {
1084                 rt->peer = peer;
1085                 peer = NULL;
1086         }
1087         spin_unlock_bh(&rt_peer_lock);
1088         if (peer)
1089                 inet_putpeer(peer);
1090 }
1091
1092 /*
1093  * Peer allocation may fail only in serious out-of-memory conditions.  However
1094  * we still can generate some output.
1095  * Random ID selection looks a bit dangerous because we have no chances to
1096  * select ID being unique in a reasonable period of time.
1097  * But broken packet identifier may be better than no packet at all.
1098  */
1099 static void ip_select_fb_ident(struct iphdr *iph)
1100 {
1101         static DEFINE_SPINLOCK(ip_fb_id_lock);
1102         static u32 ip_fallback_id;
1103         u32 salt;
1104
1105         spin_lock_bh(&ip_fb_id_lock);
1106         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1107         iph->id = htons(salt & 0xFFFF);
1108         ip_fallback_id = salt;
1109         spin_unlock_bh(&ip_fb_id_lock);
1110 }
1111
1112 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1113 {
1114         struct rtable *rt = (struct rtable *) dst;
1115
1116         if (rt) {
1117                 if (rt->peer == NULL)
1118                         rt_bind_peer(rt, 1);
1119
1120                 /* If peer is attached to destination, it is never detached,
1121                    so that we need not to grab a lock to dereference it.
1122                  */
1123                 if (rt->peer) {
1124                         iph->id = htons(inet_getid(rt->peer, more));
1125                         return;
1126                 }
1127         } else
1128                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1129                        __builtin_return_address(0));
1130
1131         ip_select_fb_ident(iph);
1132 }
1133
1134 static void rt_del(unsigned hash, struct rtable *rt)
1135 {
1136         struct rtable **rthp, *aux;
1137
1138         rthp = &rt_hash_table[hash].chain;
1139         spin_lock_bh(rt_hash_lock_addr(hash));
1140         ip_rt_put(rt);
1141         while ((aux = *rthp) != NULL) {
1142                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1143                         *rthp = aux->u.dst.rt_next;
1144                         rt_free(aux);
1145                         continue;
1146                 }
1147                 rthp = &aux->u.dst.rt_next;
1148         }
1149         spin_unlock_bh(rt_hash_lock_addr(hash));
1150 }
1151
1152 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1153                     __be32 saddr, struct net_device *dev)
1154 {
1155         int i, k;
1156         struct in_device *in_dev = in_dev_get(dev);
1157         struct rtable *rth, **rthp;
1158         __be32  skeys[2] = { saddr, 0 };
1159         int  ikeys[2] = { dev->ifindex, 0 };
1160         struct netevent_redirect netevent;
1161         struct net *net;
1162
1163         if (!in_dev)
1164                 return;
1165
1166         net = dev_net(dev);
1167         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1168             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1169             || ipv4_is_zeronet(new_gw))
1170                 goto reject_redirect;
1171
1172         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1173                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1174                         goto reject_redirect;
1175                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1176                         goto reject_redirect;
1177         } else {
1178                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1179                         goto reject_redirect;
1180         }
1181
1182         for (i = 0; i < 2; i++) {
1183                 for (k = 0; k < 2; k++) {
1184                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1185                                                 atomic_read(&rt_genid));
1186
1187                         rthp=&rt_hash_table[hash].chain;
1188
1189                         rcu_read_lock();
1190                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1191                                 struct rtable *rt;
1192
1193                                 if (rth->fl.fl4_dst != daddr ||
1194                                     rth->fl.fl4_src != skeys[i] ||
1195                                     rth->fl.oif != ikeys[k] ||
1196                                     rth->fl.iif != 0 ||
1197                                     rth->rt_genid != atomic_read(&rt_genid) ||
1198                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1199                                         rthp = &rth->u.dst.rt_next;
1200                                         continue;
1201                                 }
1202
1203                                 if (rth->rt_dst != daddr ||
1204                                     rth->rt_src != saddr ||
1205                                     rth->u.dst.error ||
1206                                     rth->rt_gateway != old_gw ||
1207                                     rth->u.dst.dev != dev)
1208                                         break;
1209
1210                                 dst_hold(&rth->u.dst);
1211                                 rcu_read_unlock();
1212
1213                                 rt = dst_alloc(&ipv4_dst_ops);
1214                                 if (rt == NULL) {
1215                                         ip_rt_put(rth);
1216                                         in_dev_put(in_dev);
1217                                         return;
1218                                 }
1219
1220                                 /* Copy all the information. */
1221                                 *rt = *rth;
1222                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1223                                 rt->u.dst.__use         = 1;
1224                                 atomic_set(&rt->u.dst.__refcnt, 1);
1225                                 rt->u.dst.child         = NULL;
1226                                 if (rt->u.dst.dev)
1227                                         dev_hold(rt->u.dst.dev);
1228                                 if (rt->idev)
1229                                         in_dev_hold(rt->idev);
1230                                 rt->u.dst.obsolete      = 0;
1231                                 rt->u.dst.lastuse       = jiffies;
1232                                 rt->u.dst.path          = &rt->u.dst;
1233                                 rt->u.dst.neighbour     = NULL;
1234                                 rt->u.dst.hh            = NULL;
1235                                 rt->u.dst.xfrm          = NULL;
1236                                 rt->rt_genid            = atomic_read(&rt_genid);
1237                                 rt->rt_flags            |= RTCF_REDIRECTED;
1238
1239                                 /* Gateway is different ... */
1240                                 rt->rt_gateway          = new_gw;
1241
1242                                 /* Redirect received -> path was valid */
1243                                 dst_confirm(&rth->u.dst);
1244
1245                                 if (rt->peer)
1246                                         atomic_inc(&rt->peer->refcnt);
1247
1248                                 if (arp_bind_neighbour(&rt->u.dst) ||
1249                                     !(rt->u.dst.neighbour->nud_state &
1250                                             NUD_VALID)) {
1251                                         if (rt->u.dst.neighbour)
1252                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1253                                         ip_rt_put(rth);
1254                                         rt_drop(rt);
1255                                         goto do_next;
1256                                 }
1257
1258                                 netevent.old = &rth->u.dst;
1259                                 netevent.new = &rt->u.dst;
1260                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1261                                                         &netevent);
1262
1263                                 rt_del(hash, rth);
1264                                 if (!rt_intern_hash(hash, rt, &rt))
1265                                         ip_rt_put(rt);
1266                                 goto do_next;
1267                         }
1268                         rcu_read_unlock();
1269                 do_next:
1270                         ;
1271                 }
1272         }
1273         in_dev_put(in_dev);
1274         return;
1275
1276 reject_redirect:
1277 #ifdef CONFIG_IP_ROUTE_VERBOSE
1278         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1279                 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1280                         NIPQUAD_FMT " ignored.\n"
1281                         "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1282                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1283                        NIPQUAD(saddr), NIPQUAD(daddr));
1284 #endif
1285         in_dev_put(in_dev);
1286 }
1287
1288 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1289 {
1290         struct rtable *rt = (struct rtable *)dst;
1291         struct dst_entry *ret = dst;
1292
1293         if (rt) {
1294                 if (dst->obsolete) {
1295                         ip_rt_put(rt);
1296                         ret = NULL;
1297                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1298                            rt->u.dst.expires) {
1299                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1300                                                 rt->fl.oif,
1301                                                 atomic_read(&rt_genid));
1302 #if RT_CACHE_DEBUG >= 1
1303                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1304                                           NIPQUAD_FMT "/%02x dropped\n",
1305                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1306 #endif
1307                         rt_del(hash, rt);
1308                         ret = NULL;
1309                 }
1310         }
1311         return ret;
1312 }
1313
1314 /*
1315  * Algorithm:
1316  *      1. The first ip_rt_redirect_number redirects are sent
1317  *         with exponential backoff, then we stop sending them at all,
1318  *         assuming that the host ignores our redirects.
1319  *      2. If we did not see packets requiring redirects
1320  *         during ip_rt_redirect_silence, we assume that the host
1321  *         forgot redirected route and start to send redirects again.
1322  *
1323  * This algorithm is much cheaper and more intelligent than dumb load limiting
1324  * in icmp.c.
1325  *
1326  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1327  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1328  */
1329
1330 void ip_rt_send_redirect(struct sk_buff *skb)
1331 {
1332         struct rtable *rt = skb->rtable;
1333         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1334
1335         if (!in_dev)
1336                 return;
1337
1338         if (!IN_DEV_TX_REDIRECTS(in_dev))
1339                 goto out;
1340
1341         /* No redirected packets during ip_rt_redirect_silence;
1342          * reset the algorithm.
1343          */
1344         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1345                 rt->u.dst.rate_tokens = 0;
1346
1347         /* Too many ignored redirects; do not send anything
1348          * set u.dst.rate_last to the last seen redirected packet.
1349          */
1350         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1351                 rt->u.dst.rate_last = jiffies;
1352                 goto out;
1353         }
1354
1355         /* Check for load limit; set rate_last to the latest sent
1356          * redirect.
1357          */
1358         if (rt->u.dst.rate_tokens == 0 ||
1359             time_after(jiffies,
1360                        (rt->u.dst.rate_last +
1361                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1362                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1363                 rt->u.dst.rate_last = jiffies;
1364                 ++rt->u.dst.rate_tokens;
1365 #ifdef CONFIG_IP_ROUTE_VERBOSE
1366                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1367                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1368                     net_ratelimit())
1369                         printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1370                                 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1371                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1372                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1373 #endif
1374         }
1375 out:
1376         in_dev_put(in_dev);
1377 }
1378
1379 static int ip_error(struct sk_buff *skb)
1380 {
1381         struct rtable *rt = skb->rtable;
1382         unsigned long now;
1383         int code;
1384
1385         switch (rt->u.dst.error) {
1386                 case EINVAL:
1387                 default:
1388                         goto out;
1389                 case EHOSTUNREACH:
1390                         code = ICMP_HOST_UNREACH;
1391                         break;
1392                 case ENETUNREACH:
1393                         code = ICMP_NET_UNREACH;
1394                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1395                         break;
1396                 case EACCES:
1397                         code = ICMP_PKT_FILTERED;
1398                         break;
1399         }
1400
1401         now = jiffies;
1402         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1403         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1404                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1405         rt->u.dst.rate_last = now;
1406         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1407                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1408                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1409         }
1410
1411 out:    kfree_skb(skb);
1412         return 0;
1413 }
1414
1415 /*
1416  *      The last two values are not from the RFC but
1417  *      are needed for AMPRnet AX.25 paths.
1418  */
1419
1420 static const unsigned short mtu_plateau[] =
1421 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1422
1423 static inline unsigned short guess_mtu(unsigned short old_mtu)
1424 {
1425         int i;
1426
1427         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1428                 if (old_mtu > mtu_plateau[i])
1429                         return mtu_plateau[i];
1430         return 68;
1431 }
1432
1433 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1434                                  unsigned short new_mtu,
1435                                  struct net_device *dev)
1436 {
1437         int i, k;
1438         unsigned short old_mtu = ntohs(iph->tot_len);
1439         struct rtable *rth;
1440         int  ikeys[2] = { dev->ifindex, 0 };
1441         __be32  skeys[2] = { iph->saddr, 0, };
1442         __be32  daddr = iph->daddr;
1443         unsigned short est_mtu = 0;
1444
1445         if (ipv4_config.no_pmtu_disc)
1446                 return 0;
1447
1448         for (k = 0; k < 2; k++) {
1449                 for (i = 0; i < 2; i++) {
1450                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1451                                                 atomic_read(&rt_genid));
1452
1453                         rcu_read_lock();
1454                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1455                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1456                                 unsigned short mtu = new_mtu;
1457
1458                                 if (rth->fl.fl4_dst != daddr ||
1459                                     rth->fl.fl4_src != skeys[i] ||
1460                                     rth->rt_dst != daddr ||
1461                                     rth->rt_src != iph->saddr ||
1462                                     rth->fl.oif != ikeys[k] ||
1463                                     rth->fl.iif != 0 ||
1464                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1465                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1466                                     rth->rt_genid != atomic_read(&rt_genid))
1467                                         continue;
1468
1469                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1470
1471                                         /* BSD 4.2 compatibility hack :-( */
1472                                         if (mtu == 0 &&
1473                                             old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
1474                                             old_mtu >= 68 + (iph->ihl << 2))
1475                                                 old_mtu -= iph->ihl << 2;
1476
1477                                         mtu = guess_mtu(old_mtu);
1478                                 }
1479                                 if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
1480                                         if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
1481                                                 dst_confirm(&rth->u.dst);
1482                                                 if (mtu < ip_rt_min_pmtu) {
1483                                                         mtu = ip_rt_min_pmtu;
1484                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1485                                                                 (1 << RTAX_MTU);
1486                                                 }
1487                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1488                                                 dst_set_expires(&rth->u.dst,
1489                                                         ip_rt_mtu_expires);
1490                                         }
1491                                         est_mtu = mtu;
1492                                 }
1493                         }
1494                         rcu_read_unlock();
1495                 }
1496         }
1497         return est_mtu ? : new_mtu;
1498 }
1499
1500 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1501 {
1502         if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
1503             !(dst_metric_locked(dst, RTAX_MTU))) {
1504                 if (mtu < ip_rt_min_pmtu) {
1505                         mtu = ip_rt_min_pmtu;
1506                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1507                 }
1508                 dst->metrics[RTAX_MTU-1] = mtu;
1509                 dst_set_expires(dst, ip_rt_mtu_expires);
1510                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1511         }
1512 }
1513
1514 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1515 {
1516         return NULL;
1517 }
1518
1519 static void ipv4_dst_destroy(struct dst_entry *dst)
1520 {
1521         struct rtable *rt = (struct rtable *) dst;
1522         struct inet_peer *peer = rt->peer;
1523         struct in_device *idev = rt->idev;
1524
1525         if (peer) {
1526                 rt->peer = NULL;
1527                 inet_putpeer(peer);
1528         }
1529
1530         if (idev) {
1531                 rt->idev = NULL;
1532                 in_dev_put(idev);
1533         }
1534 }
1535
1536 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1537                             int how)
1538 {
1539         struct rtable *rt = (struct rtable *) dst;
1540         struct in_device *idev = rt->idev;
1541         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1542                 struct in_device *loopback_idev =
1543                         in_dev_get(dev_net(dev)->loopback_dev);
1544                 if (loopback_idev) {
1545                         rt->idev = loopback_idev;
1546                         in_dev_put(idev);
1547                 }
1548         }
1549 }
1550
1551 static void ipv4_link_failure(struct sk_buff *skb)
1552 {
1553         struct rtable *rt;
1554
1555         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1556
1557         rt = skb->rtable;
1558         if (rt)
1559                 dst_set_expires(&rt->u.dst, 0);
1560 }
1561
1562 static int ip_rt_bug(struct sk_buff *skb)
1563 {
1564         printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1565                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1566                 skb->dev ? skb->dev->name : "?");
1567         kfree_skb(skb);
1568         return 0;
1569 }
1570
1571 /*
1572    We do not cache source address of outgoing interface,
1573    because it is used only by IP RR, TS and SRR options,
1574    so that it out of fast path.
1575
1576    BTW remember: "addr" is allowed to be not aligned
1577    in IP options!
1578  */
1579
1580 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1581 {
1582         __be32 src;
1583         struct fib_result res;
1584
1585         if (rt->fl.iif == 0)
1586                 src = rt->rt_src;
1587         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1588                 src = FIB_RES_PREFSRC(res);
1589                 fib_res_put(&res);
1590         } else
1591                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1592                                         RT_SCOPE_UNIVERSE);
1593         memcpy(addr, &src, 4);
1594 }
1595
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597 static void set_class_tag(struct rtable *rt, u32 tag)
1598 {
1599         if (!(rt->u.dst.tclassid & 0xFFFF))
1600                 rt->u.dst.tclassid |= tag & 0xFFFF;
1601         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1602                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1603 }
1604 #endif
1605
1606 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1607 {
1608         struct fib_info *fi = res->fi;
1609
1610         if (fi) {
1611                 if (FIB_RES_GW(*res) &&
1612                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1613                         rt->rt_gateway = FIB_RES_GW(*res);
1614                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1615                        sizeof(rt->u.dst.metrics));
1616                 if (fi->fib_mtu == 0) {
1617                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1618                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1619                             rt->rt_gateway != rt->rt_dst &&
1620                             rt->u.dst.dev->mtu > 576)
1621                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1622                 }
1623 #ifdef CONFIG_NET_CLS_ROUTE
1624                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1625 #endif
1626         } else
1627                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1628
1629         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1630                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1631         if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
1632                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1633         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1634                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1635                                        ip_rt_min_advmss);
1636         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1637                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1638
1639 #ifdef CONFIG_NET_CLS_ROUTE
1640 #ifdef CONFIG_IP_MULTIPLE_TABLES
1641         set_class_tag(rt, fib_rules_tclass(res));
1642 #endif
1643         set_class_tag(rt, itag);
1644 #endif
1645         rt->rt_type = res->type;
1646 }
1647
1648 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1649                                 u8 tos, struct net_device *dev, int our)
1650 {
1651         unsigned hash;
1652         struct rtable *rth;
1653         __be32 spec_dst;
1654         struct in_device *in_dev = in_dev_get(dev);
1655         u32 itag = 0;
1656
1657         /* Primary sanity checks. */
1658
1659         if (in_dev == NULL)
1660                 return -EINVAL;
1661
1662         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1663             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1664                 goto e_inval;
1665
1666         if (ipv4_is_zeronet(saddr)) {
1667                 if (!ipv4_is_local_multicast(daddr))
1668                         goto e_inval;
1669                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1670         } else if (fib_validate_source(saddr, 0, tos, 0,
1671                                         dev, &spec_dst, &itag) < 0)
1672                 goto e_inval;
1673
1674         rth = dst_alloc(&ipv4_dst_ops);
1675         if (!rth)
1676                 goto e_nobufs;
1677
1678         rth->u.dst.output= ip_rt_bug;
1679
1680         atomic_set(&rth->u.dst.__refcnt, 1);
1681         rth->u.dst.flags= DST_HOST;
1682         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1683                 rth->u.dst.flags |= DST_NOPOLICY;
1684         rth->fl.fl4_dst = daddr;
1685         rth->rt_dst     = daddr;
1686         rth->fl.fl4_tos = tos;
1687         rth->fl.mark    = skb->mark;
1688         rth->fl.fl4_src = saddr;
1689         rth->rt_src     = saddr;
1690 #ifdef CONFIG_NET_CLS_ROUTE
1691         rth->u.dst.tclassid = itag;
1692 #endif
1693         rth->rt_iif     =
1694         rth->fl.iif     = dev->ifindex;
1695         rth->u.dst.dev  = init_net.loopback_dev;
1696         dev_hold(rth->u.dst.dev);
1697         rth->idev       = in_dev_get(rth->u.dst.dev);
1698         rth->fl.oif     = 0;
1699         rth->rt_gateway = daddr;
1700         rth->rt_spec_dst= spec_dst;
1701         rth->rt_genid   = atomic_read(&rt_genid);
1702         rth->rt_flags   = RTCF_MULTICAST;
1703         rth->rt_type    = RTN_MULTICAST;
1704         if (our) {
1705                 rth->u.dst.input= ip_local_deliver;
1706                 rth->rt_flags |= RTCF_LOCAL;
1707         }
1708
1709 #ifdef CONFIG_IP_MROUTE
1710         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1711                 rth->u.dst.input = ip_mr_input;
1712 #endif
1713         RT_CACHE_STAT_INC(in_slow_mc);
1714
1715         in_dev_put(in_dev);
1716         hash = rt_hash(daddr, saddr, dev->ifindex, atomic_read(&rt_genid));
1717         return rt_intern_hash(hash, rth, &skb->rtable);
1718
1719 e_nobufs:
1720         in_dev_put(in_dev);
1721         return -ENOBUFS;
1722
1723 e_inval:
1724         in_dev_put(in_dev);
1725         return -EINVAL;
1726 }
1727
1728
1729 static void ip_handle_martian_source(struct net_device *dev,
1730                                      struct in_device *in_dev,
1731                                      struct sk_buff *skb,
1732                                      __be32 daddr,
1733                                      __be32 saddr)
1734 {
1735         RT_CACHE_STAT_INC(in_martian_src);
1736 #ifdef CONFIG_IP_ROUTE_VERBOSE
1737         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1738                 /*
1739                  *      RFC1812 recommendation, if source is martian,
1740                  *      the only hint is MAC header.
1741                  */
1742                 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1743                         NIPQUAD_FMT", on dev %s\n",
1744                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1745                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1746                         int i;
1747                         const unsigned char *p = skb_mac_header(skb);
1748                         printk(KERN_WARNING "ll header: ");
1749                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1750                                 printk("%02x", *p);
1751                                 if (i < (dev->hard_header_len - 1))
1752                                         printk(":");
1753                         }
1754                         printk("\n");
1755                 }
1756         }
1757 #endif
1758 }
1759
1760 static int __mkroute_input(struct sk_buff *skb,
1761                            struct fib_result *res,
1762                            struct in_device *in_dev,
1763                            __be32 daddr, __be32 saddr, u32 tos,
1764                            struct rtable **result)
1765 {
1766
1767         struct rtable *rth;
1768         int err;
1769         struct in_device *out_dev;
1770         unsigned flags = 0;
1771         __be32 spec_dst;
1772         u32 itag;
1773
1774         /* get a working reference to the output device */
1775         out_dev = in_dev_get(FIB_RES_DEV(*res));
1776         if (out_dev == NULL) {
1777                 if (net_ratelimit())
1778                         printk(KERN_CRIT "Bug in ip_route_input" \
1779                                "_slow(). Please, report\n");
1780                 return -EINVAL;
1781         }
1782
1783
1784         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1785                                   in_dev->dev, &spec_dst, &itag);
1786         if (err < 0) {
1787                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1788                                          saddr);
1789
1790                 err = -EINVAL;
1791                 goto cleanup;
1792         }
1793
1794         if (err)
1795                 flags |= RTCF_DIRECTSRC;
1796
1797         if (out_dev == in_dev && err &&
1798             (IN_DEV_SHARED_MEDIA(out_dev) ||
1799              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1800                 flags |= RTCF_DOREDIRECT;
1801
1802         if (skb->protocol != htons(ETH_P_IP)) {
1803                 /* Not IP (i.e. ARP). Do not create route, if it is
1804                  * invalid for proxy arp. DNAT routes are always valid.
1805                  */
1806                 if (out_dev == in_dev) {
1807                         err = -EINVAL;
1808                         goto cleanup;
1809                 }
1810         }
1811
1812
1813         rth = dst_alloc(&ipv4_dst_ops);
1814         if (!rth) {
1815                 err = -ENOBUFS;
1816                 goto cleanup;
1817         }
1818
1819         atomic_set(&rth->u.dst.__refcnt, 1);
1820         rth->u.dst.flags= DST_HOST;
1821         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1822                 rth->u.dst.flags |= DST_NOPOLICY;
1823         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1824                 rth->u.dst.flags |= DST_NOXFRM;
1825         rth->fl.fl4_dst = daddr;
1826         rth->rt_dst     = daddr;
1827         rth->fl.fl4_tos = tos;
1828         rth->fl.mark    = skb->mark;
1829         rth->fl.fl4_src = saddr;
1830         rth->rt_src     = saddr;
1831         rth->rt_gateway = daddr;
1832         rth->rt_iif     =
1833                 rth->fl.iif     = in_dev->dev->ifindex;
1834         rth->u.dst.dev  = (out_dev)->dev;
1835         dev_hold(rth->u.dst.dev);
1836         rth->idev       = in_dev_get(rth->u.dst.dev);
1837         rth->fl.oif     = 0;
1838         rth->rt_spec_dst= spec_dst;
1839
1840         rth->u.dst.input = ip_forward;
1841         rth->u.dst.output = ip_output;
1842         rth->rt_genid = atomic_read(&rt_genid);
1843
1844         rt_set_nexthop(rth, res, itag);
1845
1846         rth->rt_flags = flags;
1847
1848         *result = rth;
1849         err = 0;
1850  cleanup:
1851         /* release the working reference to the output device */
1852         in_dev_put(out_dev);
1853         return err;
1854 }
1855
1856 static int ip_mkroute_input(struct sk_buff *skb,
1857                             struct fib_result *res,
1858                             const struct flowi *fl,
1859                             struct in_device *in_dev,
1860                             __be32 daddr, __be32 saddr, u32 tos)
1861 {
1862         struct rtable* rth = NULL;
1863         int err;
1864         unsigned hash;
1865
1866 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1867         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1868                 fib_select_multipath(fl, res);
1869 #endif
1870
1871         /* create a routing cache entry */
1872         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1873         if (err)
1874                 return err;
1875
1876         /* put it into the cache */
1877         hash = rt_hash(daddr, saddr, fl->iif, atomic_read(&rt_genid));
1878         return rt_intern_hash(hash, rth, &skb->rtable);
1879 }
1880
1881 /*
1882  *      NOTE. We drop all the packets that has local source
1883  *      addresses, because every properly looped back packet
1884  *      must have correct destination already attached by output routine.
1885  *
1886  *      Such approach solves two big problems:
1887  *      1. Not simplex devices are handled properly.
1888  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1889  */
1890
1891 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1892                                u8 tos, struct net_device *dev)
1893 {
1894         struct fib_result res;
1895         struct in_device *in_dev = in_dev_get(dev);
1896         struct flowi fl = { .nl_u = { .ip4_u =
1897                                       { .daddr = daddr,
1898                                         .saddr = saddr,
1899                                         .tos = tos,
1900                                         .scope = RT_SCOPE_UNIVERSE,
1901                                       } },
1902                             .mark = skb->mark,
1903                             .iif = dev->ifindex };
1904         unsigned        flags = 0;
1905         u32             itag = 0;
1906         struct rtable * rth;
1907         unsigned        hash;
1908         __be32          spec_dst;
1909         int             err = -EINVAL;
1910         int             free_res = 0;
1911         struct net    * net = dev_net(dev);
1912
1913         /* IP on this device is disabled. */
1914
1915         if (!in_dev)
1916                 goto out;
1917
1918         /* Check for the most weird martians, which can be not detected
1919            by fib_lookup.
1920          */
1921
1922         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1923             ipv4_is_loopback(saddr))
1924                 goto martian_source;
1925
1926         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1927                 goto brd_input;
1928
1929         /* Accept zero addresses only to limited broadcast;
1930          * I even do not know to fix it or not. Waiting for complains :-)
1931          */
1932         if (ipv4_is_zeronet(saddr))
1933                 goto martian_source;
1934
1935         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1936             ipv4_is_loopback(daddr))
1937                 goto martian_destination;
1938
1939         /*
1940          *      Now we are ready to route packet.
1941          */
1942         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1943                 if (!IN_DEV_FORWARD(in_dev))
1944                         goto e_hostunreach;
1945                 goto no_route;
1946         }
1947         free_res = 1;
1948
1949         RT_CACHE_STAT_INC(in_slow_tot);
1950
1951         if (res.type == RTN_BROADCAST)
1952                 goto brd_input;
1953
1954         if (res.type == RTN_LOCAL) {
1955                 int result;
1956                 result = fib_validate_source(saddr, daddr, tos,
1957                                              net->loopback_dev->ifindex,
1958                                              dev, &spec_dst, &itag);
1959                 if (result < 0)
1960                         goto martian_source;
1961                 if (result)
1962                         flags |= RTCF_DIRECTSRC;
1963                 spec_dst = daddr;
1964                 goto local_input;
1965         }
1966
1967         if (!IN_DEV_FORWARD(in_dev))
1968                 goto e_hostunreach;
1969         if (res.type != RTN_UNICAST)
1970                 goto martian_destination;
1971
1972         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1973 done:
1974         in_dev_put(in_dev);
1975         if (free_res)
1976                 fib_res_put(&res);
1977 out:    return err;
1978
1979 brd_input:
1980         if (skb->protocol != htons(ETH_P_IP))
1981                 goto e_inval;
1982
1983         if (ipv4_is_zeronet(saddr))
1984                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1985         else {
1986                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1987                                           &itag);
1988                 if (err < 0)
1989                         goto martian_source;
1990                 if (err)
1991                         flags |= RTCF_DIRECTSRC;
1992         }
1993         flags |= RTCF_BROADCAST;
1994         res.type = RTN_BROADCAST;
1995         RT_CACHE_STAT_INC(in_brd);
1996
1997 local_input:
1998         rth = dst_alloc(&ipv4_dst_ops);
1999         if (!rth)
2000                 goto e_nobufs;
2001
2002         rth->u.dst.output= ip_rt_bug;
2003         rth->rt_genid = atomic_read(&rt_genid);
2004
2005         atomic_set(&rth->u.dst.__refcnt, 1);
2006         rth->u.dst.flags= DST_HOST;
2007         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2008                 rth->u.dst.flags |= DST_NOPOLICY;
2009         rth->fl.fl4_dst = daddr;
2010         rth->rt_dst     = daddr;
2011         rth->fl.fl4_tos = tos;
2012         rth->fl.mark    = skb->mark;
2013         rth->fl.fl4_src = saddr;
2014         rth->rt_src     = saddr;
2015 #ifdef CONFIG_NET_CLS_ROUTE
2016         rth->u.dst.tclassid = itag;
2017 #endif
2018         rth->rt_iif     =
2019         rth->fl.iif     = dev->ifindex;
2020         rth->u.dst.dev  = net->loopback_dev;
2021         dev_hold(rth->u.dst.dev);
2022         rth->idev       = in_dev_get(rth->u.dst.dev);
2023         rth->rt_gateway = daddr;
2024         rth->rt_spec_dst= spec_dst;
2025         rth->u.dst.input= ip_local_deliver;
2026         rth->rt_flags   = flags|RTCF_LOCAL;
2027         if (res.type == RTN_UNREACHABLE) {
2028                 rth->u.dst.input= ip_error;
2029                 rth->u.dst.error= -err;
2030                 rth->rt_flags   &= ~RTCF_LOCAL;
2031         }
2032         rth->rt_type    = res.type;
2033         hash = rt_hash(daddr, saddr, fl.iif, atomic_read(&rt_genid));
2034         err = rt_intern_hash(hash, rth, &skb->rtable);
2035         goto done;
2036
2037 no_route:
2038         RT_CACHE_STAT_INC(in_no_route);
2039         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2040         res.type = RTN_UNREACHABLE;
2041         if (err == -ESRCH)
2042                 err = -ENETUNREACH;
2043         goto local_input;
2044
2045         /*
2046          *      Do not cache martian addresses: they should be logged (RFC1812)
2047          */
2048 martian_destination:
2049         RT_CACHE_STAT_INC(in_martian_dst);
2050 #ifdef CONFIG_IP_ROUTE_VERBOSE
2051         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2052                 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2053                         NIPQUAD_FMT ", dev %s\n",
2054                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2055 #endif
2056
2057 e_hostunreach:
2058         err = -EHOSTUNREACH;
2059         goto done;
2060
2061 e_inval:
2062         err = -EINVAL;
2063         goto done;
2064
2065 e_nobufs:
2066         err = -ENOBUFS;
2067         goto done;
2068
2069 martian_source:
2070         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2071         goto e_inval;
2072 }
2073
2074 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2075                    u8 tos, struct net_device *dev)
2076 {
2077         struct rtable * rth;
2078         unsigned        hash;
2079         int iif = dev->ifindex;
2080         struct net *net;
2081
2082         net = dev_net(dev);
2083         tos &= IPTOS_RT_MASK;
2084         hash = rt_hash(daddr, saddr, iif, atomic_read(&rt_genid));
2085
2086         rcu_read_lock();
2087         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2088              rth = rcu_dereference(rth->u.dst.rt_next)) {
2089                 if (((rth->fl.fl4_dst ^ daddr) |
2090                      (rth->fl.fl4_src ^ saddr) |
2091                      (rth->fl.iif ^ iif) |
2092                      rth->fl.oif |
2093                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2094                     rth->fl.mark == skb->mark &&
2095                     net_eq(dev_net(rth->u.dst.dev), net) &&
2096                     rth->rt_genid == atomic_read(&rt_genid)) {
2097                         dst_use(&rth->u.dst, jiffies);
2098                         RT_CACHE_STAT_INC(in_hit);
2099                         rcu_read_unlock();
2100                         skb->rtable = rth;
2101                         return 0;
2102                 }
2103                 RT_CACHE_STAT_INC(in_hlist_search);
2104         }
2105         rcu_read_unlock();
2106
2107         /* Multicast recognition logic is moved from route cache to here.
2108            The problem was that too many Ethernet cards have broken/missing
2109            hardware multicast filters :-( As result the host on multicasting
2110            network acquires a lot of useless route cache entries, sort of
2111            SDR messages from all the world. Now we try to get rid of them.
2112            Really, provided software IP multicast filter is organized
2113            reasonably (at least, hashed), it does not result in a slowdown
2114            comparing with route cache reject entries.
2115            Note, that multicast routers are not affected, because
2116            route cache entry is created eventually.
2117          */
2118         if (ipv4_is_multicast(daddr)) {
2119                 struct in_device *in_dev;
2120
2121                 rcu_read_lock();
2122                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2123                         int our = ip_check_mc(in_dev, daddr, saddr,
2124                                 ip_hdr(skb)->protocol);
2125                         if (our
2126 #ifdef CONFIG_IP_MROUTE
2127                             || (!ipv4_is_local_multicast(daddr) &&
2128                                 IN_DEV_MFORWARD(in_dev))
2129 #endif
2130                             ) {
2131                                 rcu_read_unlock();
2132                                 return ip_route_input_mc(skb, daddr, saddr,
2133                                                          tos, dev, our);
2134                         }
2135                 }
2136                 rcu_read_unlock();
2137                 return -EINVAL;
2138         }
2139         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2140 }
2141
2142 static int __mkroute_output(struct rtable **result,
2143                             struct fib_result *res,
2144                             const struct flowi *fl,
2145                             const struct flowi *oldflp,
2146                             struct net_device *dev_out,
2147                             unsigned flags)
2148 {
2149         struct rtable *rth;
2150         struct in_device *in_dev;
2151         u32 tos = RT_FL_TOS(oldflp);
2152         int err = 0;
2153
2154         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2155                 return -EINVAL;
2156
2157         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2158                 res->type = RTN_BROADCAST;
2159         else if (ipv4_is_multicast(fl->fl4_dst))
2160                 res->type = RTN_MULTICAST;
2161         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2162                 return -EINVAL;
2163
2164         if (dev_out->flags & IFF_LOOPBACK)
2165                 flags |= RTCF_LOCAL;
2166
2167         /* get work reference to inet device */
2168         in_dev = in_dev_get(dev_out);
2169         if (!in_dev)
2170                 return -EINVAL;
2171
2172         if (res->type == RTN_BROADCAST) {
2173                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2174                 if (res->fi) {
2175                         fib_info_put(res->fi);
2176                         res->fi = NULL;
2177                 }
2178         } else if (res->type == RTN_MULTICAST) {
2179                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2180                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2181                                  oldflp->proto))
2182                         flags &= ~RTCF_LOCAL;
2183                 /* If multicast route do not exist use
2184                    default one, but do not gateway in this case.
2185                    Yes, it is hack.
2186                  */
2187                 if (res->fi && res->prefixlen < 4) {
2188                         fib_info_put(res->fi);
2189                         res->fi = NULL;
2190                 }
2191         }
2192
2193
2194         rth = dst_alloc(&ipv4_dst_ops);
2195         if (!rth) {
2196                 err = -ENOBUFS;
2197                 goto cleanup;
2198         }
2199
2200         atomic_set(&rth->u.dst.__refcnt, 1);
2201         rth->u.dst.flags= DST_HOST;
2202         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2203                 rth->u.dst.flags |= DST_NOXFRM;
2204         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2205                 rth->u.dst.flags |= DST_NOPOLICY;
2206
2207         rth->fl.fl4_dst = oldflp->fl4_dst;
2208         rth->fl.fl4_tos = tos;
2209         rth->fl.fl4_src = oldflp->fl4_src;
2210         rth->fl.oif     = oldflp->oif;
2211         rth->fl.mark    = oldflp->mark;
2212         rth->rt_dst     = fl->fl4_dst;
2213         rth->rt_src     = fl->fl4_src;
2214         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2215         /* get references to the devices that are to be hold by the routing
2216            cache entry */
2217         rth->u.dst.dev  = dev_out;
2218         dev_hold(dev_out);
2219         rth->idev       = in_dev_get(dev_out);
2220         rth->rt_gateway = fl->fl4_dst;
2221         rth->rt_spec_dst= fl->fl4_src;
2222
2223         rth->u.dst.output=ip_output;
2224         rth->rt_genid = atomic_read(&rt_genid);
2225
2226         RT_CACHE_STAT_INC(out_slow_tot);
2227
2228         if (flags & RTCF_LOCAL) {
2229                 rth->u.dst.input = ip_local_deliver;
2230                 rth->rt_spec_dst = fl->fl4_dst;
2231         }
2232         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2233                 rth->rt_spec_dst = fl->fl4_src;
2234                 if (flags & RTCF_LOCAL &&
2235                     !(dev_out->flags & IFF_LOOPBACK)) {
2236                         rth->u.dst.output = ip_mc_output;
2237                         RT_CACHE_STAT_INC(out_slow_mc);
2238                 }
2239 #ifdef CONFIG_IP_MROUTE
2240                 if (res->type == RTN_MULTICAST) {
2241                         if (IN_DEV_MFORWARD(in_dev) &&
2242                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2243                                 rth->u.dst.input = ip_mr_input;
2244                                 rth->u.dst.output = ip_mc_output;
2245                         }
2246                 }
2247 #endif
2248         }
2249
2250         rt_set_nexthop(rth, res, 0);
2251
2252         rth->rt_flags = flags;
2253
2254         *result = rth;
2255  cleanup:
2256         /* release work reference to inet device */
2257         in_dev_put(in_dev);
2258
2259         return err;
2260 }
2261
2262 static int ip_mkroute_output(struct rtable **rp,
2263                              struct fib_result *res,
2264                              const struct flowi *fl,
2265                              const struct flowi *oldflp,
2266                              struct net_device *dev_out,
2267                              unsigned flags)
2268 {
2269         struct rtable *rth = NULL;
2270         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2271         unsigned hash;
2272         if (err == 0) {
2273                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2274                                atomic_read(&rt_genid));
2275                 err = rt_intern_hash(hash, rth, rp);
2276         }
2277
2278         return err;
2279 }
2280
2281 /*
2282  * Major route resolver routine.
2283  */
2284
2285 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2286                                 const struct flowi *oldflp)
2287 {
2288         u32 tos = RT_FL_TOS(oldflp);
2289         struct flowi fl = { .nl_u = { .ip4_u =
2290                                       { .daddr = oldflp->fl4_dst,
2291                                         .saddr = oldflp->fl4_src,
2292                                         .tos = tos & IPTOS_RT_MASK,
2293                                         .scope = ((tos & RTO_ONLINK) ?
2294                                                   RT_SCOPE_LINK :
2295                                                   RT_SCOPE_UNIVERSE),
2296                                       } },
2297                             .mark = oldflp->mark,
2298                             .iif = net->loopback_dev->ifindex,
2299                             .oif = oldflp->oif };
2300         struct fib_result res;
2301         unsigned flags = 0;
2302         struct net_device *dev_out = NULL;
2303         int free_res = 0;
2304         int err;
2305
2306
2307         res.fi          = NULL;
2308 #ifdef CONFIG_IP_MULTIPLE_TABLES
2309         res.r           = NULL;
2310 #endif
2311
2312         if (oldflp->fl4_src) {
2313                 err = -EINVAL;
2314                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2315                     ipv4_is_lbcast(oldflp->fl4_src) ||
2316                     ipv4_is_zeronet(oldflp->fl4_src))
2317                         goto out;
2318
2319                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2320                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2321                 if (dev_out == NULL)
2322                         goto out;
2323
2324                 /* I removed check for oif == dev_out->oif here.
2325                    It was wrong for two reasons:
2326                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2327                       is assigned to multiple interfaces.
2328                    2. Moreover, we are allowed to send packets with saddr
2329                       of another iface. --ANK
2330                  */
2331
2332                 if (oldflp->oif == 0
2333                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2334                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2335                         /* Special hack: user can direct multicasts
2336                            and limited broadcast via necessary interface
2337                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2338                            This hack is not just for fun, it allows
2339                            vic,vat and friends to work.
2340                            They bind socket to loopback, set ttl to zero
2341                            and expect that it will work.
2342                            From the viewpoint of routing cache they are broken,
2343                            because we are not allowed to build multicast path
2344                            with loopback source addr (look, routing cache
2345                            cannot know, that ttl is zero, so that packet
2346                            will not leave this host and route is valid).
2347                            Luckily, this hack is good workaround.
2348                          */
2349
2350                         fl.oif = dev_out->ifindex;
2351                         goto make_route;
2352                 }
2353                 if (dev_out)
2354                         dev_put(dev_out);
2355                 dev_out = NULL;
2356         }
2357
2358
2359         if (oldflp->oif) {
2360                 dev_out = dev_get_by_index(net, oldflp->oif);
2361                 err = -ENODEV;
2362                 if (dev_out == NULL)
2363                         goto out;
2364
2365                 /* RACE: Check return value of inet_select_addr instead. */
2366                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2367                         dev_put(dev_out);
2368                         goto out;       /* Wrong error code */
2369                 }
2370
2371                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2372                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2373                         if (!fl.fl4_src)
2374                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2375                                                               RT_SCOPE_LINK);
2376                         goto make_route;
2377                 }
2378                 if (!fl.fl4_src) {
2379                         if (ipv4_is_multicast(oldflp->fl4_dst))
2380                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2381                                                               fl.fl4_scope);
2382                         else if (!oldflp->fl4_dst)
2383                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2384                                                               RT_SCOPE_HOST);
2385                 }
2386         }
2387
2388         if (!fl.fl4_dst) {
2389                 fl.fl4_dst = fl.fl4_src;
2390                 if (!fl.fl4_dst)
2391                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2392                 if (dev_out)
2393                         dev_put(dev_out);
2394                 dev_out = net->loopback_dev;
2395                 dev_hold(dev_out);
2396                 fl.oif = net->loopback_dev->ifindex;
2397                 res.type = RTN_LOCAL;
2398                 flags |= RTCF_LOCAL;
2399                 goto make_route;
2400         }
2401
2402         if (fib_lookup(net, &fl, &res)) {
2403                 res.fi = NULL;
2404                 if (oldflp->oif) {
2405                         /* Apparently, routing tables are wrong. Assume,
2406                            that the destination is on link.
2407
2408                            WHY? DW.
2409                            Because we are allowed to send to iface
2410                            even if it has NO routes and NO assigned
2411                            addresses. When oif is specified, routing
2412                            tables are looked up with only one purpose:
2413                            to catch if destination is gatewayed, rather than
2414                            direct. Moreover, if MSG_DONTROUTE is set,
2415                            we send packet, ignoring both routing tables
2416                            and ifaddr state. --ANK
2417
2418
2419                            We could make it even if oif is unknown,
2420                            likely IPv6, but we do not.
2421                          */
2422
2423                         if (fl.fl4_src == 0)
2424                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2425                                                               RT_SCOPE_LINK);
2426                         res.type = RTN_UNICAST;
2427                         goto make_route;
2428                 }
2429                 if (dev_out)
2430                         dev_put(dev_out);
2431                 err = -ENETUNREACH;
2432                 goto out;
2433         }
2434         free_res = 1;
2435
2436         if (res.type == RTN_LOCAL) {
2437                 if (!fl.fl4_src)
2438                         fl.fl4_src = fl.fl4_dst;
2439                 if (dev_out)
2440                         dev_put(dev_out);
2441                 dev_out = net->loopback_dev;
2442                 dev_hold(dev_out);
2443                 fl.oif = dev_out->ifindex;
2444                 if (res.fi)
2445                         fib_info_put(res.fi);
2446                 res.fi = NULL;
2447                 flags |= RTCF_LOCAL;
2448                 goto make_route;
2449         }
2450
2451 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2452         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2453                 fib_select_multipath(&fl, &res);
2454         else
2455 #endif
2456         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2457                 fib_select_default(net, &fl, &res);
2458
2459         if (!fl.fl4_src)
2460                 fl.fl4_src = FIB_RES_PREFSRC(res);
2461
2462         if (dev_out)
2463                 dev_put(dev_out);
2464         dev_out = FIB_RES_DEV(res);
2465         dev_hold(dev_out);
2466         fl.oif = dev_out->ifindex;
2467
2468
2469 make_route:
2470         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2471
2472
2473         if (free_res)
2474                 fib_res_put(&res);
2475         if (dev_out)
2476                 dev_put(dev_out);
2477 out:    return err;
2478 }
2479
2480 int __ip_route_output_key(struct net *net, struct rtable **rp,
2481                           const struct flowi *flp)
2482 {
2483         unsigned hash;
2484         struct rtable *rth;
2485
2486         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif,
2487                        atomic_read(&rt_genid));
2488
2489         rcu_read_lock_bh();
2490         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2491                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2492                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2493                     rth->fl.fl4_src == flp->fl4_src &&
2494                     rth->fl.iif == 0 &&
2495                     rth->fl.oif == flp->oif &&
2496                     rth->fl.mark == flp->mark &&
2497                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2498                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2499                     net_eq(dev_net(rth->u.dst.dev), net) &&
2500                     rth->rt_genid == atomic_read(&rt_genid)) {
2501                         dst_use(&rth->u.dst, jiffies);
2502                         RT_CACHE_STAT_INC(out_hit);
2503                         rcu_read_unlock_bh();
2504                         *rp = rth;
2505                         return 0;
2506                 }
2507                 RT_CACHE_STAT_INC(out_hlist_search);
2508         }
2509         rcu_read_unlock_bh();
2510
2511         return ip_route_output_slow(net, rp, flp);
2512 }
2513
2514 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2515
2516 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2517 {
2518 }
2519
2520 static struct dst_ops ipv4_dst_blackhole_ops = {
2521         .family                 =       AF_INET,
2522         .protocol               =       __constant_htons(ETH_P_IP),
2523         .destroy                =       ipv4_dst_destroy,
2524         .check                  =       ipv4_dst_check,
2525         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2526         .entry_size             =       sizeof(struct rtable),
2527         .entries                =       ATOMIC_INIT(0),
2528 };
2529
2530
2531 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2532 {
2533         struct rtable *ort = *rp;
2534         struct rtable *rt = (struct rtable *)
2535                 dst_alloc(&ipv4_dst_blackhole_ops);
2536
2537         if (rt) {
2538                 struct dst_entry *new = &rt->u.dst;
2539
2540                 atomic_set(&new->__refcnt, 1);
2541                 new->__use = 1;
2542                 new->input = dst_discard;
2543                 new->output = dst_discard;
2544                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2545
2546                 new->dev = ort->u.dst.dev;
2547                 if (new->dev)
2548                         dev_hold(new->dev);
2549
2550                 rt->fl = ort->fl;
2551
2552                 rt->idev = ort->idev;
2553                 if (rt->idev)
2554                         in_dev_hold(rt->idev);
2555                 rt->rt_genid = atomic_read(&rt_genid);
2556                 rt->rt_flags = ort->rt_flags;
2557                 rt->rt_type = ort->rt_type;
2558                 rt->rt_dst = ort->rt_dst;
2559                 rt->rt_src = ort->rt_src;
2560                 rt->rt_iif = ort->rt_iif;
2561                 rt->rt_gateway = ort->rt_gateway;
2562                 rt->rt_spec_dst = ort->rt_spec_dst;
2563                 rt->peer = ort->peer;
2564                 if (rt->peer)
2565                         atomic_inc(&rt->peer->refcnt);
2566
2567                 dst_free(new);
2568         }
2569
2570         dst_release(&(*rp)->u.dst);
2571         *rp = rt;
2572         return (rt ? 0 : -ENOMEM);
2573 }
2574
2575 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2576                          struct sock *sk, int flags)
2577 {
2578         int err;
2579
2580         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2581                 return err;
2582
2583         if (flp->proto) {
2584                 if (!flp->fl4_src)
2585                         flp->fl4_src = (*rp)->rt_src;
2586                 if (!flp->fl4_dst)
2587                         flp->fl4_dst = (*rp)->rt_dst;
2588                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2589                                     flags ? XFRM_LOOKUP_WAIT : 0);
2590                 if (err == -EREMOTE)
2591                         err = ipv4_dst_blackhole(rp, flp);
2592
2593                 return err;
2594         }
2595
2596         return 0;
2597 }
2598
2599 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2600
2601 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2602 {
2603         return ip_route_output_flow(net, rp, flp, NULL, 0);
2604 }
2605
2606 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2607                         int nowait, unsigned int flags)
2608 {
2609         struct rtable *rt = skb->rtable;
2610         struct rtmsg *r;
2611         struct nlmsghdr *nlh;
2612         long expires;
2613         u32 id = 0, ts = 0, tsage = 0, error;
2614
2615         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2616         if (nlh == NULL)
2617                 return -EMSGSIZE;
2618
2619         r = nlmsg_data(nlh);
2620         r->rtm_family    = AF_INET;
2621         r->rtm_dst_len  = 32;
2622         r->rtm_src_len  = 0;
2623         r->rtm_tos      = rt->fl.fl4_tos;
2624         r->rtm_table    = RT_TABLE_MAIN;
2625         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2626         r->rtm_type     = rt->rt_type;
2627         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2628         r->rtm_protocol = RTPROT_UNSPEC;
2629         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2630         if (rt->rt_flags & RTCF_NOTIFY)
2631                 r->rtm_flags |= RTM_F_NOTIFY;
2632
2633         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2634
2635         if (rt->fl.fl4_src) {
2636                 r->rtm_src_len = 32;
2637                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2638         }
2639         if (rt->u.dst.dev)
2640                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2641 #ifdef CONFIG_NET_CLS_ROUTE
2642         if (rt->u.dst.tclassid)
2643                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2644 #endif
2645         if (rt->fl.iif)
2646                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2647         else if (rt->rt_src != rt->fl.fl4_src)
2648                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2649
2650         if (rt->rt_dst != rt->rt_gateway)
2651                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2652
2653         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2654                 goto nla_put_failure;
2655
2656         error = rt->u.dst.error;
2657         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2658         if (rt->peer) {
2659                 id = rt->peer->ip_id_count;
2660                 if (rt->peer->tcp_ts_stamp) {
2661                         ts = rt->peer->tcp_ts;
2662                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2663                 }
2664         }
2665
2666         if (rt->fl.iif) {
2667 #ifdef CONFIG_IP_MROUTE
2668                 __be32 dst = rt->rt_dst;
2669
2670                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2671                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2672                         int err = ipmr_get_route(skb, r, nowait);
2673                         if (err <= 0) {
2674                                 if (!nowait) {
2675                                         if (err == 0)
2676                                                 return 0;
2677                                         goto nla_put_failure;
2678                                 } else {
2679                                         if (err == -EMSGSIZE)
2680                                                 goto nla_put_failure;
2681                                         error = err;
2682                                 }
2683                         }
2684                 } else
2685 #endif
2686                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2687         }
2688
2689         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2690                                expires, error) < 0)
2691                 goto nla_put_failure;
2692
2693         return nlmsg_end(skb, nlh);
2694
2695 nla_put_failure:
2696         nlmsg_cancel(skb, nlh);
2697         return -EMSGSIZE;
2698 }
2699
2700 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2701 {
2702         struct net *net = sock_net(in_skb->sk);
2703         struct rtmsg *rtm;
2704         struct nlattr *tb[RTA_MAX+1];
2705         struct rtable *rt = NULL;
2706         __be32 dst = 0;
2707         __be32 src = 0;
2708         u32 iif;
2709         int err;
2710         struct sk_buff *skb;
2711
2712         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2713         if (err < 0)
2714                 goto errout;
2715
2716         rtm = nlmsg_data(nlh);
2717
2718         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2719         if (skb == NULL) {
2720                 err = -ENOBUFS;
2721                 goto errout;
2722         }
2723
2724         /* Reserve room for dummy headers, this skb can pass
2725            through good chunk of routing engine.
2726          */
2727         skb_reset_mac_header(skb);
2728         skb_reset_network_header(skb);
2729
2730         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2731         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2732         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2733
2734         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2735         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2736         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2737
2738         if (iif) {
2739                 struct net_device *dev;
2740
2741                 dev = __dev_get_by_index(net, iif);
2742                 if (dev == NULL) {
2743                         err = -ENODEV;
2744                         goto errout_free;
2745                 }
2746
2747                 skb->protocol   = htons(ETH_P_IP);
2748                 skb->dev        = dev;
2749                 local_bh_disable();
2750                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2751                 local_bh_enable();
2752
2753                 rt = skb->rtable;
2754                 if (err == 0 && rt->u.dst.error)
2755                         err = -rt->u.dst.error;
2756         } else {
2757                 struct flowi fl = {
2758                         .nl_u = {
2759                                 .ip4_u = {
2760                                         .daddr = dst,
2761                                         .saddr = src,
2762                                         .tos = rtm->rtm_tos,
2763                                 },
2764                         },
2765                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2766                 };
2767                 err = ip_route_output_key(net, &rt, &fl);
2768         }
2769
2770         if (err)
2771                 goto errout_free;
2772
2773         skb->rtable = rt;
2774         if (rtm->rtm_flags & RTM_F_NOTIFY)
2775                 rt->rt_flags |= RTCF_NOTIFY;
2776
2777         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2778                            RTM_NEWROUTE, 0, 0);
2779         if (err <= 0)
2780                 goto errout_free;
2781
2782         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2783 errout:
2784         return err;
2785
2786 errout_free:
2787         kfree_skb(skb);
2788         goto errout;
2789 }
2790
2791 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2792 {
2793         struct rtable *rt;
2794         int h, s_h;
2795         int idx, s_idx;
2796         struct net *net;
2797
2798         net = sock_net(skb->sk);
2799
2800         s_h = cb->args[0];
2801         if (s_h < 0)
2802                 s_h = 0;
2803         s_idx = idx = cb->args[1];
2804         for (h = s_h; h <= rt_hash_mask; h++) {
2805                 rcu_read_lock_bh();
2806                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2807                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2808                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
2809                                 continue;
2810                         if (rt->rt_genid != atomic_read(&rt_genid))
2811                                 continue;
2812                         skb->dst = dst_clone(&rt->u.dst);
2813                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2814                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2815                                          1, NLM_F_MULTI) <= 0) {
2816                                 dst_release(xchg(&skb->dst, NULL));
2817                                 rcu_read_unlock_bh();
2818                                 goto done;
2819                         }
2820                         dst_release(xchg(&skb->dst, NULL));
2821                 }
2822                 rcu_read_unlock_bh();
2823                 s_idx = 0;
2824         }
2825
2826 done:
2827         cb->args[0] = h;
2828         cb->args[1] = idx;
2829         return skb->len;
2830 }
2831
2832 void ip_rt_multicast_event(struct in_device *in_dev)
2833 {
2834         rt_cache_flush(dev_net(in_dev->dev), 0);
2835 }
2836
2837 #ifdef CONFIG_SYSCTL
2838 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2839                                         struct file *filp, void __user *buffer,
2840                                         size_t *lenp, loff_t *ppos)
2841 {
2842         if (write) {
2843                 int flush_delay;
2844                 struct net *net;
2845                 static DEFINE_MUTEX(flush_mutex);
2846
2847                 mutex_lock(&flush_mutex);
2848                 ctl->data = &flush_delay;
2849                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2850                 ctl->data = NULL;
2851                 mutex_unlock(&flush_mutex);
2852
2853                 net = (struct net *)ctl->extra1;
2854                 rt_cache_flush(net, flush_delay);
2855                 return 0;
2856         }
2857
2858         return -EINVAL;
2859 }
2860
2861 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2862                                                 int __user *name,
2863                                                 int nlen,
2864                                                 void __user *oldval,
2865                                                 size_t __user *oldlenp,
2866                                                 void __user *newval,
2867                                                 size_t newlen)
2868 {
2869         int delay;
2870         struct net *net;
2871         if (newlen != sizeof(int))
2872                 return -EINVAL;
2873         if (get_user(delay, (int __user *)newval))
2874                 return -EFAULT;
2875         net = (struct net *)table->extra1;
2876         rt_cache_flush(net, delay);
2877         return 0;
2878 }
2879
2880 ctl_table ipv4_route_table[] = {
2881         {
2882                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2883                 .procname       = "gc_thresh",
2884                 .data           = &ipv4_dst_ops.gc_thresh,
2885                 .maxlen         = sizeof(int),
2886                 .mode           = 0644,
2887                 .proc_handler   = &proc_dointvec,
2888         },
2889         {
2890                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2891                 .procname       = "max_size",
2892                 .data           = &ip_rt_max_size,
2893                 .maxlen         = sizeof(int),
2894                 .mode           = 0644,
2895                 .proc_handler   = &proc_dointvec,
2896         },
2897         {
2898                 /*  Deprecated. Use gc_min_interval_ms */
2899
2900                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2901                 .procname       = "gc_min_interval",
2902                 .data           = &ip_rt_gc_min_interval,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = &proc_dointvec_jiffies,
2906                 .strategy       = &sysctl_jiffies,
2907         },
2908         {
2909                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2910                 .procname       = "gc_min_interval_ms",
2911                 .data           = &ip_rt_gc_min_interval,
2912                 .maxlen         = sizeof(int),
2913                 .mode           = 0644,
2914                 .proc_handler   = &proc_dointvec_ms_jiffies,
2915                 .strategy       = &sysctl_ms_jiffies,
2916         },
2917         {
2918                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2919                 .procname       = "gc_timeout",
2920                 .data           = &ip_rt_gc_timeout,
2921                 .maxlen         = sizeof(int),
2922                 .mode           = 0644,
2923                 .proc_handler   = &proc_dointvec_jiffies,
2924                 .strategy       = &sysctl_jiffies,
2925         },
2926         {
2927                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2928                 .procname       = "gc_interval",
2929                 .data           = &ip_rt_gc_interval,
2930                 .maxlen         = sizeof(int),
2931                 .mode           = 0644,
2932                 .proc_handler   = &proc_dointvec_jiffies,
2933                 .strategy       = &sysctl_jiffies,
2934         },
2935         {
2936                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2937                 .procname       = "redirect_load",
2938                 .data           = &ip_rt_redirect_load,
2939                 .maxlen         = sizeof(int),
2940                 .mode           = 0644,
2941                 .proc_handler   = &proc_dointvec,
2942         },
2943         {
2944                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2945                 .procname       = "redirect_number",
2946                 .data           = &ip_rt_redirect_number,
2947                 .maxlen         = sizeof(int),
2948                 .mode           = 0644,
2949                 .proc_handler   = &proc_dointvec,
2950         },
2951         {
2952                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2953                 .procname       = "redirect_silence",
2954                 .data           = &ip_rt_redirect_silence,
2955                 .maxlen         = sizeof(int),
2956                 .mode           = 0644,
2957                 .proc_handler   = &proc_dointvec,
2958         },
2959         {
2960                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2961                 .procname       = "error_cost",
2962                 .data           = &ip_rt_error_cost,
2963                 .maxlen         = sizeof(int),
2964                 .mode           = 0644,
2965                 .proc_handler   = &proc_dointvec,
2966         },
2967         {
2968                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2969                 .procname       = "error_burst",
2970                 .data           = &ip_rt_error_burst,
2971                 .maxlen         = sizeof(int),
2972                 .mode           = 0644,
2973                 .proc_handler   = &proc_dointvec,
2974         },
2975         {
2976                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2977                 .procname       = "gc_elasticity",
2978                 .data           = &ip_rt_gc_elasticity,
2979                 .maxlen         = sizeof(int),
2980                 .mode           = 0644,
2981                 .proc_handler   = &proc_dointvec,
2982         },
2983         {
2984                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2985                 .procname       = "mtu_expires",
2986                 .data           = &ip_rt_mtu_expires,
2987                 .maxlen         = sizeof(int),
2988                 .mode           = 0644,
2989                 .proc_handler   = &proc_dointvec_jiffies,
2990                 .strategy       = &sysctl_jiffies,
2991         },
2992         {
2993                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2994                 .procname       = "min_pmtu",
2995                 .data           = &ip_rt_min_pmtu,
2996                 .maxlen         = sizeof(int),
2997                 .mode           = 0644,
2998                 .proc_handler   = &proc_dointvec,
2999         },
3000         {
3001                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
3002                 .procname       = "min_adv_mss",
3003                 .data           = &ip_rt_min_advmss,
3004                 .maxlen         = sizeof(int),
3005                 .mode           = 0644,
3006                 .proc_handler   = &proc_dointvec,
3007         },
3008         {
3009                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3010                 .procname       = "secret_interval",
3011                 .data           = &ip_rt_secret_interval,
3012                 .maxlen         = sizeof(int),
3013                 .mode           = 0644,
3014                 .proc_handler   = &proc_dointvec_jiffies,
3015                 .strategy       = &sysctl_jiffies,
3016         },
3017         { .ctl_name = 0 }
3018 };
3019
3020 static __net_initdata struct ctl_path ipv4_route_path[] = {
3021         { .procname = "net", .ctl_name = CTL_NET, },
3022         { .procname = "ipv4", .ctl_name = NET_IPV4, },
3023         { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
3024         { },
3025 };
3026
3027
3028 static struct ctl_table ipv4_route_flush_table[] = {
3029         {
3030                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
3031                 .procname       = "flush",
3032                 .maxlen         = sizeof(int),
3033                 .mode           = 0200,
3034                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
3035                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
3036         },
3037         { .ctl_name = 0 },
3038 };
3039
3040 static __net_init int sysctl_route_net_init(struct net *net)
3041 {
3042         struct ctl_table *tbl;
3043
3044         tbl = ipv4_route_flush_table;
3045         if (net != &init_net) {
3046                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3047                 if (tbl == NULL)
3048                         goto err_dup;
3049         }
3050         tbl[0].extra1 = net;
3051
3052         net->ipv4.route_hdr =
3053                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3054         if (net->ipv4.route_hdr == NULL)
3055                 goto err_reg;
3056         return 0;
3057
3058 err_reg:
3059         if (tbl != ipv4_route_flush_table)
3060                 kfree(tbl);
3061 err_dup:
3062         return -ENOMEM;
3063 }
3064
3065 static __net_exit void sysctl_route_net_exit(struct net *net)
3066 {
3067         struct ctl_table *tbl;
3068
3069         tbl = net->ipv4.route_hdr->ctl_table_arg;
3070         unregister_net_sysctl_table(net->ipv4.route_hdr);
3071         BUG_ON(tbl == ipv4_route_flush_table);
3072         kfree(tbl);
3073 }
3074
3075 static __net_initdata struct pernet_operations sysctl_route_ops = {
3076         .init = sysctl_route_net_init,
3077         .exit = sysctl_route_net_exit,
3078 };
3079 #endif
3080
3081
3082 static __net_init int rt_secret_timer_init(struct net *net)
3083 {
3084         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3085         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3086         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3087
3088         net->ipv4.rt_secret_timer.expires =
3089                 jiffies + net_random() % ip_rt_secret_interval +
3090                 ip_rt_secret_interval;
3091         add_timer(&net->ipv4.rt_secret_timer);
3092         return 0;
3093 }
3094
3095 static __net_exit void rt_secret_timer_exit(struct net *net)
3096 {
3097         del_timer_sync(&net->ipv4.rt_secret_timer);
3098 }
3099
3100 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3101         .init = rt_secret_timer_init,
3102         .exit = rt_secret_timer_exit,
3103 };
3104
3105
3106 #ifdef CONFIG_NET_CLS_ROUTE
3107 struct ip_rt_acct *ip_rt_acct __read_mostly;
3108 #endif /* CONFIG_NET_CLS_ROUTE */
3109
3110 static __initdata unsigned long rhash_entries;
3111 static int __init set_rhash_entries(char *str)
3112 {
3113         if (!str)
3114                 return 0;
3115         rhash_entries = simple_strtoul(str, &str, 0);
3116         return 1;
3117 }
3118 __setup("rhash_entries=", set_rhash_entries);
3119
3120 int __init ip_rt_init(void)
3121 {
3122         int rc = 0;
3123
3124         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3125                              (jiffies ^ (jiffies >> 7))));
3126
3127 #ifdef CONFIG_NET_CLS_ROUTE
3128         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3129         if (!ip_rt_acct)
3130                 panic("IP: failed to allocate ip_rt_acct\n");
3131 #endif
3132
3133         ipv4_dst_ops.kmem_cachep =
3134                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3135                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3136
3137         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3138
3139         rt_hash_table = (struct rt_hash_bucket *)
3140                 alloc_large_system_hash("IP route cache",
3141                                         sizeof(struct rt_hash_bucket),
3142                                         rhash_entries,
3143                                         (num_physpages >= 128 * 1024) ?
3144                                         15 : 17,
3145                                         0,
3146                                         &rt_hash_log,
3147                                         &rt_hash_mask,
3148                                         0);
3149         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3150         rt_hash_lock_init();
3151
3152         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3153         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3154
3155         devinet_init();
3156         ip_fib_init();
3157
3158         /* All the timers, started at system startup tend
3159            to synchronize. Perturb it a bit.
3160          */
3161         schedule_delayed_work(&expires_work,
3162                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3163
3164         if (register_pernet_subsys(&rt_secret_timer_ops))
3165                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3166
3167         if (ip_rt_proc_init())
3168                 printk(KERN_ERR "Unable to create route proc files\n");
3169 #ifdef CONFIG_XFRM
3170         xfrm_init();
3171         xfrm4_init();
3172 #endif
3173         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3174
3175 #ifdef CONFIG_SYSCTL
3176         register_pernet_subsys(&sysctl_route_ops);
3177 #endif
3178         return rc;
3179 }
3180
3181 EXPORT_SYMBOL(__ip_select_ident);
3182 EXPORT_SYMBOL(ip_route_input);
3183 EXPORT_SYMBOL(ip_route_output_key);