inetpeer: Invalidate the inetpeer tree along with the routing cache
[pandora-kernel.git] / net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/atmclip.h>
113 #include <net/secure_seq.h>
114
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define IP_MAX_MTU      0xFFF0
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
125 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
126 static int ip_rt_redirect_number __read_mostly  = 9;
127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost __read_mostly       = HZ;
130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
131 static int ip_rt_gc_elasticity __read_mostly    = 8;
132 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
134 static int ip_rt_min_advmss __read_mostly       = 256;
135 static int rt_chain_length_max __read_mostly    = 20;
136 static int redirect_genid;
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155                             int how)
156 {
157 }
158
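/*
 * ipv4_cow_metrics() implements copy-on-write for route metrics.  A freshly
 * created rtable usually points at a shared, read-only metrics array; on the
 * first write the values are copied into the bound inet_peer's metrics array
 * and dst->_metrics is switched over with cmpxchg(), so only one writer wins
 * a race and the losers fall back to whichever pointer won (or to NULL if
 * that pointer is still read-only).  On success the fib_info reference is
 * dropped, since the metrics no longer come from the FIB entry.
 */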
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161         struct rtable *rt = (struct rtable *) dst;
162         struct inet_peer *peer;
163         u32 *p = NULL;
164
165         if (!rt->peer)
166                 rt_bind_peer(rt, rt->rt_dst, 1);
167
168         peer = rt->peer;
169         if (peer) {
170                 u32 *old_p = __DST_METRICS_PTR(old);
171                 unsigned long prev, new;
172
173                 p = peer->metrics;
174                 if (inet_metrics_new(peer))
175                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177                 new = (unsigned long) p;
178                 prev = cmpxchg(&dst->_metrics, old, new);
179
180                 if (prev != old) {
181                         p = __DST_METRICS_PTR(prev);
182                         if (prev & DST_METRICS_READ_ONLY)
183                                 p = NULL;
184                 } else {
185                         if (rt->fi) {
186                                 fib_info_put(rt->fi);
187                                 rt->fi = NULL;
188                         }
189                 }
190         }
191         return p;
192 }
193
194 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
195
196 static struct dst_ops ipv4_dst_ops = {
197         .family =               AF_INET,
198         .protocol =             cpu_to_be16(ETH_P_IP),
199         .gc =                   rt_garbage_collect,
200         .check =                ipv4_dst_check,
201         .default_advmss =       ipv4_default_advmss,
202         .mtu =                  ipv4_mtu,
203         .cow_metrics =          ipv4_cow_metrics,
204         .destroy =              ipv4_dst_destroy,
205         .ifdown =               ipv4_dst_ifdown,
206         .negative_advice =      ipv4_negative_advice,
207         .link_failure =         ipv4_link_failure,
208         .update_pmtu =          ip_rt_update_pmtu,
209         .local_out =            __ip_local_out,
210         .neigh_lookup =         ipv4_neigh_lookup,
211 };
212
213 #define ECN_OR_COST(class)      TC_PRIO_##class
214
215 const __u8 ip_tos2prio[16] = {
216         TC_PRIO_BESTEFFORT,
217         ECN_OR_COST(BESTEFFORT),
218         TC_PRIO_BESTEFFORT,
219         ECN_OR_COST(BESTEFFORT),
220         TC_PRIO_BULK,
221         ECN_OR_COST(BULK),
222         TC_PRIO_BULK,
223         ECN_OR_COST(BULK),
224         TC_PRIO_INTERACTIVE,
225         ECN_OR_COST(INTERACTIVE),
226         TC_PRIO_INTERACTIVE,
227         ECN_OR_COST(INTERACTIVE),
228         TC_PRIO_INTERACTIVE_BULK,
229         ECN_OR_COST(INTERACTIVE_BULK),
230         TC_PRIO_INTERACTIVE_BULK,
231         ECN_OR_COST(INTERACTIVE_BULK)
232 };
233
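/*
 * ip_tos2prio[] maps the four IPv4 TOS bits to a packet scheduler priority
 * band.  ECN_OR_COST(class) simply expands to TC_PRIO_##class, so the entry
 * selected when the lowest masked TOS bit is set (historically "minimize
 * monetary cost", overlapping the ECN field in modern usage) lands in the
 * same band as its base class.  A minimal sketch of how a caller might index
 * the table, assuming the usual IPTOS_TOS() masking from <linux/ip.h>:
 *
 *	prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 */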
234
235 /*
236  * Route cache.
237  */
238
239 /* The locking scheme is rather straightforward:
240  *
241  * 1) Read-Copy Update protects the buckets of the central route hash.
242  * 2) Only writers remove entries, and they hold the lock
243  *    as they look at rtable reference counts.
244  * 3) Only readers acquire references to rtable entries,
245  *    they do so with atomic increments and with the
246  *    lock held.
247  */
248
249 struct rt_hash_bucket {
250         struct rtable __rcu     *chain;
251 };
252
253 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254         defined(CONFIG_PROVE_LOCKING)
255 /*
256  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
257  * The size of this table is a power of two and depends on the number of CPUs.
258  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
259  */
260 #ifdef CONFIG_LOCKDEP
261 # define RT_HASH_LOCK_SZ        256
262 #else
263 # if NR_CPUS >= 32
264 #  define RT_HASH_LOCK_SZ       4096
265 # elif NR_CPUS >= 16
266 #  define RT_HASH_LOCK_SZ       2048
267 # elif NR_CPUS >= 8
268 #  define RT_HASH_LOCK_SZ       1024
269 # elif NR_CPUS >= 4
270 #  define RT_HASH_LOCK_SZ       512
271 # else
272 #  define RT_HASH_LOCK_SZ       256
273 # endif
274 #endif
275
276 static spinlock_t       *rt_hash_locks;
277 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
278
279 static __init void rt_hash_lock_init(void)
280 {
281         int i;
282
283         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284                         GFP_KERNEL);
285         if (!rt_hash_locks)
286                 panic("IP: failed to allocate rt_hash_locks\n");
287
288         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289                 spin_lock_init(&rt_hash_locks[i]);
290 }
291 #else
292 # define rt_hash_lock_addr(slot) NULL
293
294 static inline void rt_hash_lock_init(void)
295 {
296 }
297 #endif
298
299 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
300 static unsigned                 rt_hash_mask __read_mostly;
301 static unsigned int             rt_hash_log  __read_mostly;
302
303 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
304 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
305
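/*
 * rt_hash() folds the flow's destination address, source address and
 * interface index, together with the per-namespace generation id, through
 * jhash_3words() and masks the result down to a bucket index.  Because the
 * genid takes part in the hash, bumping it in rt_cache_invalidate()
 * effectively orphans every cached entry: new lookups hash with the new
 * genid, and stale entries are caught by rt_is_expired() and reaped lazily.
 */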
306 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307                                    int genid)
308 {
309         return jhash_3words((__force u32)daddr, (__force u32)saddr,
310                             idx, genid)
311                 & rt_hash_mask;
312 }
313
314 static inline int rt_genid(struct net *net)
315 {
316         return atomic_read(&net->ipv4.rt_genid);
317 }
318
319 #ifdef CONFIG_PROC_FS
320 struct rt_cache_iter_state {
321         struct seq_net_private p;
322         int bucket;
323         int genid;
324 };
325
326 static struct rtable *rt_cache_get_first(struct seq_file *seq)
327 {
328         struct rt_cache_iter_state *st = seq->private;
329         struct rtable *r = NULL;
330
331         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
333                         continue;
334                 rcu_read_lock_bh();
335                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336                 while (r) {
337                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338                             r->rt_genid == st->genid)
339                                 return r;
340                         r = rcu_dereference_bh(r->dst.rt_next);
341                 }
342                 rcu_read_unlock_bh();
343         }
344         return r;
345 }
346
347 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348                                           struct rtable *r)
349 {
350         struct rt_cache_iter_state *st = seq->private;
351
352         r = rcu_dereference_bh(r->dst.rt_next);
353         while (!r) {
354                 rcu_read_unlock_bh();
355                 do {
356                         if (--st->bucket < 0)
357                                 return NULL;
358                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
359                 rcu_read_lock_bh();
360                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361         }
362         return r;
363 }
364
365 static struct rtable *rt_cache_get_next(struct seq_file *seq,
366                                         struct rtable *r)
367 {
368         struct rt_cache_iter_state *st = seq->private;
369         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370                 if (dev_net(r->dst.dev) != seq_file_net(seq))
371                         continue;
372                 if (r->rt_genid == st->genid)
373                         break;
374         }
375         return r;
376 }
377
378 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379 {
380         struct rtable *r = rt_cache_get_first(seq);
381
382         if (r)
383                 while (pos && (r = rt_cache_get_next(seq, r)))
384                         --pos;
385         return pos ? NULL : r;
386 }
387
388 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
389 {
390         struct rt_cache_iter_state *st = seq->private;
391         if (*pos)
392                 return rt_cache_get_idx(seq, *pos - 1);
393         st->genid = rt_genid(seq_file_net(seq));
394         return SEQ_START_TOKEN;
395 }
396
397 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
398 {
399         struct rtable *r;
400
401         if (v == SEQ_START_TOKEN)
402                 r = rt_cache_get_first(seq);
403         else
404                 r = rt_cache_get_next(seq, v);
405         ++*pos;
406         return r;
407 }
408
409 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
410 {
411         if (v && v != SEQ_START_TOKEN)
412                 rcu_read_unlock_bh();
413 }
414
415 static int rt_cache_seq_show(struct seq_file *seq, void *v)
416 {
417         if (v == SEQ_START_TOKEN)
418                 seq_printf(seq, "%-127s\n",
419                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
420                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
421                            "HHUptod\tSpecDst");
422         else {
423                 struct rtable *r = v;
424                 struct neighbour *n;
425                 int len, HHUptod;
426
427                 rcu_read_lock();
428                 n = dst_get_neighbour(&r->dst);
429                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430                 rcu_read_unlock();
431
432                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434                         r->dst.dev ? r->dst.dev->name : "*",
435                         (__force u32)r->rt_dst,
436                         (__force u32)r->rt_gateway,
437                         r->rt_flags, atomic_read(&r->dst.__refcnt),
438                         r->dst.__use, 0, (__force u32)r->rt_src,
439                         dst_metric_advmss(&r->dst) + 40,
440                         dst_metric(&r->dst, RTAX_WINDOW),
441                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442                               dst_metric(&r->dst, RTAX_RTTVAR)),
443                         r->rt_key_tos,
444                         -1,
445                         HHUptod,
446                         r->rt_spec_dst, &len);
447
448                 seq_printf(seq, "%*s\n", 127 - len, "");
449         }
450         return 0;
451 }
452
453 static const struct seq_operations rt_cache_seq_ops = {
454         .start  = rt_cache_seq_start,
455         .next   = rt_cache_seq_next,
456         .stop   = rt_cache_seq_stop,
457         .show   = rt_cache_seq_show,
458 };
459
460 static int rt_cache_seq_open(struct inode *inode, struct file *file)
461 {
462         return seq_open_net(inode, file, &rt_cache_seq_ops,
463                         sizeof(struct rt_cache_iter_state));
464 }
465
466 static const struct file_operations rt_cache_seq_fops = {
467         .owner   = THIS_MODULE,
468         .open    = rt_cache_seq_open,
469         .read    = seq_read,
470         .llseek  = seq_lseek,
471         .release = seq_release_net,
472 };
473
474
475 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
476 {
477         int cpu;
478
479         if (*pos == 0)
480                 return SEQ_START_TOKEN;
481
482         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
483                 if (!cpu_possible(cpu))
484                         continue;
485                 *pos = cpu+1;
486                 return &per_cpu(rt_cache_stat, cpu);
487         }
488         return NULL;
489 }
490
491 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
492 {
493         int cpu;
494
495         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
496                 if (!cpu_possible(cpu))
497                         continue;
498                 *pos = cpu+1;
499                 return &per_cpu(rt_cache_stat, cpu);
500         }
501         return NULL;
502
503 }
504
505 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
506 {
507
508 }
509
510 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
511 {
512         struct rt_cache_stat *st = v;
513
514         if (v == SEQ_START_TOKEN) {
515                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
516                 return 0;
517         }
518
519         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
520                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
521                    dst_entries_get_slow(&ipv4_dst_ops),
522                    st->in_hit,
523                    st->in_slow_tot,
524                    st->in_slow_mc,
525                    st->in_no_route,
526                    st->in_brd,
527                    st->in_martian_dst,
528                    st->in_martian_src,
529
530                    st->out_hit,
531                    st->out_slow_tot,
532                    st->out_slow_mc,
533
534                    st->gc_total,
535                    st->gc_ignored,
536                    st->gc_goal_miss,
537                    st->gc_dst_overflow,
538                    st->in_hlist_search,
539                    st->out_hlist_search
540                 );
541         return 0;
542 }
543
544 static const struct seq_operations rt_cpu_seq_ops = {
545         .start  = rt_cpu_seq_start,
546         .next   = rt_cpu_seq_next,
547         .stop   = rt_cpu_seq_stop,
548         .show   = rt_cpu_seq_show,
549 };
550
551
552 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
553 {
554         return seq_open(file, &rt_cpu_seq_ops);
555 }
556
557 static const struct file_operations rt_cpu_seq_fops = {
558         .owner   = THIS_MODULE,
559         .open    = rt_cpu_seq_open,
560         .read    = seq_read,
561         .llseek  = seq_lseek,
562         .release = seq_release,
563 };
564
565 #ifdef CONFIG_IP_ROUTE_CLASSID
566 static int rt_acct_proc_show(struct seq_file *m, void *v)
567 {
568         struct ip_rt_acct *dst, *src;
569         unsigned int i, j;
570
571         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
572         if (!dst)
573                 return -ENOMEM;
574
575         for_each_possible_cpu(i) {
576                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
577                 for (j = 0; j < 256; j++) {
578                         dst[j].o_bytes   += src[j].o_bytes;
579                         dst[j].o_packets += src[j].o_packets;
580                         dst[j].i_bytes   += src[j].i_bytes;
581                         dst[j].i_packets += src[j].i_packets;
582                 }
583         }
584
585         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
586         kfree(dst);
587         return 0;
588 }
589
590 static int rt_acct_proc_open(struct inode *inode, struct file *file)
591 {
592         return single_open(file, rt_acct_proc_show, NULL);
593 }
594
595 static const struct file_operations rt_acct_proc_fops = {
596         .owner          = THIS_MODULE,
597         .open           = rt_acct_proc_open,
598         .read           = seq_read,
599         .llseek         = seq_lseek,
600         .release        = single_release,
601 };
602 #endif
603
604 static int __net_init ip_rt_do_proc_init(struct net *net)
605 {
606         struct proc_dir_entry *pde;
607
608         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
609                         &rt_cache_seq_fops);
610         if (!pde)
611                 goto err1;
612
613         pde = proc_create("rt_cache", S_IRUGO,
614                           net->proc_net_stat, &rt_cpu_seq_fops);
615         if (!pde)
616                 goto err2;
617
618 #ifdef CONFIG_IP_ROUTE_CLASSID
619         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
620         if (!pde)
621                 goto err3;
622 #endif
623         return 0;
624
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626 err3:
627         remove_proc_entry("rt_cache", net->proc_net_stat);
628 #endif
629 err2:
630         remove_proc_entry("rt_cache", net->proc_net);
631 err1:
632         return -ENOMEM;
633 }
634
635 static void __net_exit ip_rt_do_proc_exit(struct net *net)
636 {
637         remove_proc_entry("rt_cache", net->proc_net_stat);
638         remove_proc_entry("rt_cache", net->proc_net);
639 #ifdef CONFIG_IP_ROUTE_CLASSID
640         remove_proc_entry("rt_acct", net->proc_net);
641 #endif
642 }
643
644 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
645         .init = ip_rt_do_proc_init,
646         .exit = ip_rt_do_proc_exit,
647 };
648
649 static int __init ip_rt_proc_init(void)
650 {
651         return register_pernet_subsys(&ip_rt_proc_ops);
652 }
653
654 #else
655 static inline int ip_rt_proc_init(void)
656 {
657         return 0;
658 }
659 #endif /* CONFIG_PROC_FS */
660
661 static inline void rt_free(struct rtable *rt)
662 {
663         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
664 }
665
666 static inline void rt_drop(struct rtable *rt)
667 {
668         ip_rt_put(rt);
669         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
670 }
671
672 static inline int rt_fast_clean(struct rtable *rth)
673 {
674         /* Kill broadcast/multicast entries very aggressively, if they
675            collide in the hash table with more useful entries */
676         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
677                 rt_is_input_route(rth) && rth->dst.rt_next;
678 }
679
680 static inline int rt_valuable(struct rtable *rth)
681 {
682         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
683                 (rth->peer && rth->peer->pmtu_expires);
684 }
685
686 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
687 {
688         unsigned long age;
689         int ret = 0;
690
691         if (atomic_read(&rth->dst.__refcnt))
692                 goto out;
693
694         age = jiffies - rth->dst.lastuse;
695         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696             (age <= tmo2 && rt_valuable(rth)))
697                 goto out;
698         ret = 1;
699 out:    return ret;
700 }
701
702 /* Bits of score are:
703  * 31: very valuable
704  * 30: not quite useless
705  * 29..0: usage counter
706  */
707 static inline u32 rt_score(struct rtable *rt)
708 {
709         u32 score = jiffies - rt->dst.lastuse;
710
711         score = ~score & ~(3<<30);
712
713         if (rt_valuable(rt))
714                 score |= (1<<31);
715
716         if (rt_is_output_route(rt) ||
717             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718                 score |= (1<<30);
719
720         return score;
721 }
722
723 static inline bool rt_caching(const struct net *net)
724 {
725         return net->ipv4.current_rt_cache_rebuild_count <=
726                 net->ipv4.sysctl_rt_cache_rebuild_count;
727 }
728
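/*
 * The two comparison helpers below are branchless: XOR of a pair of fields
 * is zero only when the fields are equal, so OR-ing all the XOR results and
 * testing against zero compares every field at once.  A minimal sketch of
 * the same idiom with two hypothetical fields a and b:
 *
 *	equal = ((a1 ^ a2) | (b1 ^ b2)) == 0;
 */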
729 static inline bool compare_hash_inputs(const struct rtable *rt1,
730                                        const struct rtable *rt2)
731 {
732         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
733                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
734                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735 }
736
737 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738 {
739         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741                 (rt1->rt_mark ^ rt2->rt_mark) |
742                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
743                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
744                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
745 }
746
747 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
748 {
749         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
750 }
751
752 static inline int rt_is_expired(struct rtable *rth)
753 {
754         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755 }
756
757 /*
758  * Perform a full scan of the hash table and free all entries.
759  * Can be called by a softirq or a process.
760  * In the latter case, we want to reschedule if necessary.
761  */
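/*
 * The flush works in two phases per bucket: matching entries are unlinked
 * onto a private list while the bucket spinlock is held, and only after the
 * lock is dropped are they passed to rt_free(), which defers the actual
 * freeing to an RCU-bh grace period so lockless readers are never left with
 * a dangling pointer.
 */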
762 static void rt_do_flush(struct net *net, int process_context)
763 {
764         unsigned int i;
765         struct rtable *rth, *next;
766
767         for (i = 0; i <= rt_hash_mask; i++) {
768                 struct rtable __rcu **pprev;
769                 struct rtable *list;
770
771                 if (process_context && need_resched())
772                         cond_resched();
773                 rth = rcu_access_pointer(rt_hash_table[i].chain);
774                 if (!rth)
775                         continue;
776
777                 spin_lock_bh(rt_hash_lock_addr(i));
778
779                 list = NULL;
780                 pprev = &rt_hash_table[i].chain;
781                 rth = rcu_dereference_protected(*pprev,
782                         lockdep_is_held(rt_hash_lock_addr(i)));
783
784                 while (rth) {
785                         next = rcu_dereference_protected(rth->dst.rt_next,
786                                 lockdep_is_held(rt_hash_lock_addr(i)));
787
788                         if (!net ||
789                             net_eq(dev_net(rth->dst.dev), net)) {
790                                 rcu_assign_pointer(*pprev, next);
791                                 rcu_assign_pointer(rth->dst.rt_next, list);
792                                 list = rth;
793                         } else {
794                                 pprev = &rth->dst.rt_next;
795                         }
796                         rth = next;
797                 }
798
799                 spin_unlock_bh(rt_hash_lock_addr(i));
800
801                 for (; list; list = next) {
802                         next = rcu_dereference_protected(list->dst.rt_next, 1);
803                         rt_free(list);
804                 }
805         }
806 }
807
808 /*
809  * While freeing expired entries, we compute average chain length
810  * and standard deviation, using fixed-point arithmetic.
811  * This is to get an estimate of rt_chain_length_max:
812  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
813  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
814  */
815
816 #define FRACT_BITS 3
817 #define ONE (1UL << FRACT_BITS)
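/*
 * Worked example: with FRACT_BITS = 3, ONE is 8, so has_noalias() below
 * contributes 8 per distinctly-hashed entry and the sums in rt_check_expire()
 * carry three fractional bits.  An average of 2.5 entries per scanned chain
 * is accumulated as avg = 20, and the final (avg + 4*sd) >> FRACT_BITS
 * converts the estimate back to whole entries.
 */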
818
819 /*
820  * Given a hash chain and an item in this hash chain,
821  * determine whether a previous entry has the same hash_inputs
822  * (but differs on tos, mark or oif)
823  * Returns 0 if an alias is found.
824  * Returns ONE if rth has no alias before itself.
825  */
826 static int has_noalias(const struct rtable *head, const struct rtable *rth)
827 {
828         const struct rtable *aux = head;
829
830         while (aux != rth) {
831                 if (compare_hash_inputs(aux, rth))
832                         return 0;
833                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
834         }
835         return ONE;
836 }
837
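/*
 * rt_check_expire() scans only a slice of the table on each run.  The slice
 * is sized so that the whole table is covered roughly once per
 * ip_rt_gc_timeout:
 *
 *	goal = (delta << rt_hash_log) / ip_rt_gc_timeout
 *
 * i.e. (elapsed time / gc timeout) * number of buckets, clamped to the table
 * size, with the static "rover" remembering where the previous scan stopped.
 */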
838 static void rt_check_expire(void)
839 {
840         static unsigned int rover;
841         unsigned int i = rover, goal;
842         struct rtable *rth;
843         struct rtable __rcu **rthp;
844         unsigned long samples = 0;
845         unsigned long sum = 0, sum2 = 0;
846         unsigned long delta;
847         u64 mult;
848
849         delta = jiffies - expires_ljiffies;
850         expires_ljiffies = jiffies;
851         mult = ((u64)delta) << rt_hash_log;
852         if (ip_rt_gc_timeout > 1)
853                 do_div(mult, ip_rt_gc_timeout);
854         goal = (unsigned int)mult;
855         if (goal > rt_hash_mask)
856                 goal = rt_hash_mask + 1;
857         for (; goal > 0; goal--) {
858                 unsigned long tmo = ip_rt_gc_timeout;
859                 unsigned long length;
860
861                 i = (i + 1) & rt_hash_mask;
862                 rthp = &rt_hash_table[i].chain;
863
864                 if (need_resched())
865                         cond_resched();
866
867                 samples++;
868
869                 if (rcu_dereference_raw(*rthp) == NULL)
870                         continue;
871                 length = 0;
872                 spin_lock_bh(rt_hash_lock_addr(i));
873                 while ((rth = rcu_dereference_protected(*rthp,
874                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875                         prefetch(rth->dst.rt_next);
876                         if (rt_is_expired(rth)) {
877                                 *rthp = rth->dst.rt_next;
878                                 rt_free(rth);
879                                 continue;
880                         }
881                         if (rth->dst.expires) {
882                                 /* Entry is expired even if it is in use */
883                                 if (time_before_eq(jiffies, rth->dst.expires)) {
884 nofree:
885                                         tmo >>= 1;
886                                         rthp = &rth->dst.rt_next;
887                                         /*
888                                          * We only count entries on
889                                          * a chain with equal hash inputs once,
890                                          * so that entries for different QoS
891                                          * levels and other non-hash-input
892                                          * attributes don't unfairly skew
893                                          * the length computation
894                                          */
895                                         length += has_noalias(rt_hash_table[i].chain, rth);
896                                         continue;
897                                 }
898                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899                                 goto nofree;
900
901                         /* Cleanup aged off entries. */
902                         *rthp = rth->dst.rt_next;
903                         rt_free(rth);
904                 }
905                 spin_unlock_bh(rt_hash_lock_addr(i));
906                 sum += length;
907                 sum2 += length*length;
908         }
909         if (samples) {
910                 unsigned long avg = sum / samples;
911                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912                 rt_chain_length_max = max_t(unsigned long,
913                                         ip_rt_gc_elasticity,
914                                         (avg + 4*sd) >> FRACT_BITS);
915         }
916         rover = i;
917 }
918
919 /*
920  * rt_worker_func() is run in process context.
921  * We call rt_check_expire() to scan part of the hash table.
922  */
923 static void rt_worker_func(struct work_struct *work)
924 {
925         rt_check_expire();
926         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
927 }
928
929 /*
930  * Perturbation of rt_genid by a small quantity [1..256].
931  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
932  * many times (2^24) without reusing a recent rt_genid.
933  * Jenkins hash is strong enough that little changes of rt_genid are OK.
934  */
935 static void rt_cache_invalidate(struct net *net)
936 {
937         unsigned char shuffle;
938
939         get_random_bytes(&shuffle, sizeof(shuffle));
940         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941         redirect_genid++;
942         inetpeer_invalidate_tree(AF_INET);
943 }
944
945 /*
946  * delay < 0  : invalidate cache (fast : entries will be deleted later)
947  * delay >= 0 : invalidate & flush cache (can be long)
948  */
949 void rt_cache_flush(struct net *net, int delay)
950 {
951         rt_cache_invalidate(net);
952         if (delay >= 0)
953                 rt_do_flush(net, !in_softirq());
954 }
955
956 /* Flush previous cache invalidated entries from the cache */
957 void rt_cache_flush_batch(struct net *net)
958 {
959         rt_do_flush(net, !in_softirq());
960 }
961
962 static void rt_emergency_hash_rebuild(struct net *net)
963 {
964         if (net_ratelimit())
965                 printk(KERN_WARNING "Route hash chain too long!\n");
966         rt_cache_invalidate(net);
967 }
968
969 /*
970    Short description of GC goals.
971
972    We want to build an algorithm that keeps the routing cache
973    at an equilibrium point, where the number of aged-off entries
974    is approximately equal to the number of newly generated ones.
975
976    The current expiration strength is the variable "expire".
977    We try to adjust it dynamically, so that when the network
978    is idle, expire is large enough to keep enough warm entries,
979    and when load increases, it is reduced to limit the cache size.
980  */
981
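/*
 * rt_garbage_collect() first computes how many entries it would like to drop
 * ("goal") as the excess over ip_rt_gc_elasticity entries per bucket, adjusts
 * a long-term "equilibrium" target, and then sweeps the buckets with
 * rt_may_expire(), halving the eviction timeout as it walks down each chain
 * so that entries deeper in a chain (and thus longer chains) are reclaimed
 * more aggressively.
 */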
982 static int rt_garbage_collect(struct dst_ops *ops)
983 {
984         static unsigned long expire = RT_GC_TIMEOUT;
985         static unsigned long last_gc;
986         static int rover;
987         static int equilibrium;
988         struct rtable *rth;
989         struct rtable __rcu **rthp;
990         unsigned long now = jiffies;
991         int goal;
992         int entries = dst_entries_get_fast(&ipv4_dst_ops);
993
994         /*
995          * Garbage collection is pretty expensive,
996          * do not run it too frequently.
997          */
998
999         RT_CACHE_STAT_INC(gc_total);
1000
1001         if (now - last_gc < ip_rt_gc_min_interval &&
1002             entries < ip_rt_max_size) {
1003                 RT_CACHE_STAT_INC(gc_ignored);
1004                 goto out;
1005         }
1006
1007         entries = dst_entries_get_slow(&ipv4_dst_ops);
1008         /* Calculate the number of entries which we want to expire now. */
1009         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1010         if (goal <= 0) {
1011                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1012                         equilibrium = ipv4_dst_ops.gc_thresh;
1013                 goal = entries - equilibrium;
1014                 if (goal > 0) {
1015                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1016                         goal = entries - equilibrium;
1017                 }
1018         } else {
1019                 /* We are in a dangerous area. Try to reduce the cache really
1020                  * aggressively.
1021                  */
1022                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1023                 equilibrium = entries - goal;
1024         }
1025
1026         if (now - last_gc >= ip_rt_gc_min_interval)
1027                 last_gc = now;
1028
1029         if (goal <= 0) {
1030                 equilibrium += goal;
1031                 goto work_done;
1032         }
1033
1034         do {
1035                 int i, k;
1036
1037                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1038                         unsigned long tmo = expire;
1039
1040                         k = (k + 1) & rt_hash_mask;
1041                         rthp = &rt_hash_table[k].chain;
1042                         spin_lock_bh(rt_hash_lock_addr(k));
1043                         while ((rth = rcu_dereference_protected(*rthp,
1044                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1045                                 if (!rt_is_expired(rth) &&
1046                                         !rt_may_expire(rth, tmo, expire)) {
1047                                         tmo >>= 1;
1048                                         rthp = &rth->dst.rt_next;
1049                                         continue;
1050                                 }
1051                                 *rthp = rth->dst.rt_next;
1052                                 rt_free(rth);
1053                                 goal--;
1054                         }
1055                         spin_unlock_bh(rt_hash_lock_addr(k));
1056                         if (goal <= 0)
1057                                 break;
1058                 }
1059                 rover = k;
1060
1061                 if (goal <= 0)
1062                         goto work_done;
1063
1064                 /* The goal is not achieved. We stop the process if:
1065
1066                    - expire is reduced to zero; otherwise, expire is halved.
1067                    - the table is not full.
1068                    - we are called from an interrupt.
1069                    - the jiffies check is just a fallback/debug loop breaker;
1070                      we will not spin here for a long time in any case.
1071                  */
1072
1073                 RT_CACHE_STAT_INC(gc_goal_miss);
1074
1075                 if (expire == 0)
1076                         break;
1077
1078                 expire >>= 1;
1079
1080                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1081                         goto out;
1082         } while (!in_softirq() && time_before_eq(jiffies, now));
1083
1084         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1085                 goto out;
1086         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1087                 goto out;
1088         if (net_ratelimit())
1089                 printk(KERN_WARNING "dst cache overflow\n");
1090         RT_CACHE_STAT_INC(gc_dst_overflow);
1091         return 1;
1092
1093 work_done:
1094         expire += ip_rt_gc_min_interval;
1095         if (expire > ip_rt_gc_timeout ||
1096             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1097             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1098                 expire = ip_rt_gc_timeout;
1099 out:    return 0;
1100 }
1101
1102 /*
1103  * Returns the number of entries in a hash chain that have different hash_inputs.
1104  */
1105 static int slow_chain_length(const struct rtable *head)
1106 {
1107         int length = 0;
1108         const struct rtable *rth = head;
1109
1110         while (rth) {
1111                 length += has_noalias(head, rth);
1112                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1113         }
1114         return length >> FRACT_BITS;
1115 }
1116
1117 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1118 {
1119         struct neigh_table *tbl = &arp_tbl;
1120         static const __be32 inaddr_any = 0;
1121         struct net_device *dev = dst->dev;
1122         const __be32 *pkey = daddr;
1123         struct neighbour *n;
1124
1125 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1126         if (dev->type == ARPHRD_ATM)
1127                 tbl = clip_tbl_hook;
1128 #endif
1129         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1130                 pkey = &inaddr_any;
1131
1132         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1133         if (n)
1134                 return n;
1135         return neigh_create(tbl, pkey, dev);
1136 }
1137
1138 static int rt_bind_neighbour(struct rtable *rt)
1139 {
1140         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1141         if (IS_ERR(n))
1142                 return PTR_ERR(n);
1143         dst_set_neighbour(&rt->dst, n);
1144
1145         return 0;
1146 }
1147
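/*
 * rt_intern_hash() inserts a new route into the bucket selected by "hash".
 * If an equivalent entry (same keys, same netns) is already cached it is
 * promoted to the head of the chain and reused, and the new route is dropped.
 * While walking the chain the function also remembers the unreferenced entry
 * with the lowest rt_score() as an eviction candidate, and if the chain grows
 * past rt_chain_length_max it triggers an emergency cache rebuild via a
 * genid bump.
 */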
1148 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1149                                      struct sk_buff *skb, int ifindex)
1150 {
1151         struct rtable   *rth, *cand;
1152         struct rtable __rcu **rthp, **candp;
1153         unsigned long   now;
1154         u32             min_score;
1155         int             chain_length;
1156         int attempts = !in_softirq();
1157
1158 restart:
1159         chain_length = 0;
1160         min_score = ~(u32)0;
1161         cand = NULL;
1162         candp = NULL;
1163         now = jiffies;
1164
1165         if (!rt_caching(dev_net(rt->dst.dev))) {
1166                 /*
1167                  * If we're not caching, just tell the caller we
1168                  * were successful and don't touch the route.  The
1169                  * caller holds the sole reference to the cache entry, and
1170                  * it will be released when the caller is done with it.
1171                  * If we drop it here, the callers have no way to resolve routes
1172                  * when we're not caching.  Instead, just point *rp at rt, so
1173                  * the caller gets a single use out of the route.
1174                  * Note that we do rt_free on this new route entry, so that
1175                  * once its refcount hits zero, we are still able to reap it
1176                  * (Thanks Alexey)
1177                  * Note: To avoid expensive rcu stuff for this uncached dst,
1178                  * we set DST_NOCACHE so that dst_release() can free dst without
1179                  * waiting for a grace period.
1180                  */
1181
1182                 rt->dst.flags |= DST_NOCACHE;
1183                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1184                         int err = rt_bind_neighbour(rt);
1185                         if (err) {
1186                                 if (net_ratelimit())
1187                                         printk(KERN_WARNING
1188                                             "Neighbour table failure & not caching routes.\n");
1189                                 ip_rt_put(rt);
1190                                 return ERR_PTR(err);
1191                         }
1192                 }
1193
1194                 goto skip_hashing;
1195         }
1196
1197         rthp = &rt_hash_table[hash].chain;
1198
1199         spin_lock_bh(rt_hash_lock_addr(hash));
1200         while ((rth = rcu_dereference_protected(*rthp,
1201                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1202                 if (rt_is_expired(rth)) {
1203                         *rthp = rth->dst.rt_next;
1204                         rt_free(rth);
1205                         continue;
1206                 }
1207                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1208                         /* Put it first */
1209                         *rthp = rth->dst.rt_next;
1210                         /*
1211                          * Since lookup is lockfree, the deletion
1212                          * must be visible to another weakly ordered CPU before
1213                          * the insertion at the start of the hash chain.
1214                          */
1215                         rcu_assign_pointer(rth->dst.rt_next,
1216                                            rt_hash_table[hash].chain);
1217                         /*
1218                          * Since lookup is lockfree, the update writes
1219                          * must be ordered for consistency on SMP.
1220                          */
1221                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1222
1223                         dst_use(&rth->dst, now);
1224                         spin_unlock_bh(rt_hash_lock_addr(hash));
1225
1226                         rt_drop(rt);
1227                         if (skb)
1228                                 skb_dst_set(skb, &rth->dst);
1229                         return rth;
1230                 }
1231
1232                 if (!atomic_read(&rth->dst.__refcnt)) {
1233                         u32 score = rt_score(rth);
1234
1235                         if (score <= min_score) {
1236                                 cand = rth;
1237                                 candp = rthp;
1238                                 min_score = score;
1239                         }
1240                 }
1241
1242                 chain_length++;
1243
1244                 rthp = &rth->dst.rt_next;
1245         }
1246
1247         if (cand) {
1248                 /* ip_rt_gc_elasticity used to be the average chain length;
1249                  * when exceeded, gc becomes really aggressive.
1250                  *
1251                  * The second limit is less certain. At the moment it allows
1252                  * only 2 entries per bucket. We will see.
1253                  */
1254                 if (chain_length > ip_rt_gc_elasticity) {
1255                         *candp = cand->dst.rt_next;
1256                         rt_free(cand);
1257                 }
1258         } else {
1259                 if (chain_length > rt_chain_length_max &&
1260                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1261                         struct net *net = dev_net(rt->dst.dev);
1262                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1263                         if (!rt_caching(net)) {
1264                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1265                                         rt->dst.dev->name, num);
1266                         }
1267                         rt_emergency_hash_rebuild(net);
1268                         spin_unlock_bh(rt_hash_lock_addr(hash));
1269
1270                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1271                                         ifindex, rt_genid(net));
1272                         goto restart;
1273                 }
1274         }
1275
1276         /* Try to bind the route to an ARP neighbour only if it is an output
1277            route or on the unicast forwarding path.
1278          */
1279         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1280                 int err = rt_bind_neighbour(rt);
1281                 if (err) {
1282                         spin_unlock_bh(rt_hash_lock_addr(hash));
1283
1284                         if (err != -ENOBUFS) {
1285                                 rt_drop(rt);
1286                                 return ERR_PTR(err);
1287                         }
1288
1289                         /* Neighbour tables are full and nothing
1290                            can be released. Try to shrink the route cache;
1291                            it most likely holds some neighbour records.
1292                          */
1293                         if (attempts-- > 0) {
1294                                 int saved_elasticity = ip_rt_gc_elasticity;
1295                                 int saved_int = ip_rt_gc_min_interval;
1296                                 ip_rt_gc_elasticity     = 1;
1297                                 ip_rt_gc_min_interval   = 0;
1298                                 rt_garbage_collect(&ipv4_dst_ops);
1299                                 ip_rt_gc_min_interval   = saved_int;
1300                                 ip_rt_gc_elasticity     = saved_elasticity;
1301                                 goto restart;
1302                         }
1303
1304                         if (net_ratelimit())
1305                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1306                         rt_drop(rt);
1307                         return ERR_PTR(-ENOBUFS);
1308                 }
1309         }
1310
1311         rt->dst.rt_next = rt_hash_table[hash].chain;
1312
1313         /*
1314          * Since lookup is lockfree, we must make sure
1315          * previous writes to rt are committed to memory
1316          * before making rt visible to other CPUS.
1317          */
1318         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1319
1320         spin_unlock_bh(rt_hash_lock_addr(hash));
1321
1322 skip_hashing:
1323         if (skb)
1324                 skb_dst_set(skb, &rt->dst);
1325         return rt;
1326 }
1327
1328 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1329
1330 static u32 rt_peer_genid(void)
1331 {
1332         return atomic_read(&__rt_peer_genid);
1333 }
1334
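/*
 * rt_bind_peer() attaches the long-lived inet_peer entry for the destination
 * to the route.  The cmpxchg() on rt->peer guarantees that only one binding
 * wins if several CPUs race here; the losing caller releases the peer it
 * looked up.
 */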
1335 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1336 {
1337         struct inet_peer *peer;
1338
1339         peer = inet_getpeer_v4(daddr, create);
1340
1341         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1342                 inet_putpeer(peer);
1343         else
1344                 rt->rt_peer_genid = rt_peer_genid();
1345 }
1346
1347 /*
1348  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1349  * we can still generate some output.
1350  * Random ID selection looks a bit dangerous because we have no chance of
1351  * selecting an ID that is unique within a reasonable period of time.
1352  * But a broken packet identifier may be better than no packet at all.
1353  */
1354 static void ip_select_fb_ident(struct iphdr *iph)
1355 {
1356         static DEFINE_SPINLOCK(ip_fb_id_lock);
1357         static u32 ip_fallback_id;
1358         u32 salt;
1359
1360         spin_lock_bh(&ip_fb_id_lock);
1361         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1362         iph->id = htons(salt & 0xFFFF);
1363         ip_fallback_id = salt;
1364         spin_unlock_bh(&ip_fb_id_lock);
1365 }
1366
1367 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1368 {
1369         struct rtable *rt = (struct rtable *) dst;
1370
1371         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1372                 if (rt->peer == NULL)
1373                         rt_bind_peer(rt, rt->rt_dst, 1);
1374
1375                 /* If a peer is attached to the destination, it is never detached,
1376                    so we need not grab a lock to dereference it.
1377                  */
1378                 if (rt->peer) {
1379                         iph->id = htons(inet_getid(rt->peer, more));
1380                         return;
1381                 }
1382         } else if (!rt)
1383                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1384                        __builtin_return_address(0));
1385
1386         ip_select_fb_ident(iph);
1387 }
1388 EXPORT_SYMBOL(__ip_select_ident);
1389
1390 static void rt_del(unsigned hash, struct rtable *rt)
1391 {
1392         struct rtable __rcu **rthp;
1393         struct rtable *aux;
1394
1395         rthp = &rt_hash_table[hash].chain;
1396         spin_lock_bh(rt_hash_lock_addr(hash));
1397         ip_rt_put(rt);
1398         while ((aux = rcu_dereference_protected(*rthp,
1399                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1400                 if (aux == rt || rt_is_expired(aux)) {
1401                         *rthp = aux->dst.rt_next;
1402                         rt_free(aux);
1403                         continue;
1404                 }
1405                 rthp = &aux->dst.rt_next;
1406         }
1407         spin_unlock_bh(rt_hash_lock_addr(hash));
1408 }
1409
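/*
 * check_peer_redir() applies a gateway learned from an ICMP redirect (stored
 * in peer->redirect_learned.a4) to the route: it swaps in a neighbour entry
 * for the new gateway, restores the old gateway if the lookup fails, and only
 * marks the route RTCF_REDIRECTED once the neighbour is valid.
 */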
1410 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1411 {
1412         struct rtable *rt = (struct rtable *) dst;
1413         __be32 orig_gw = rt->rt_gateway;
1414         struct neighbour *n, *old_n;
1415
1416         dst_confirm(&rt->dst);
1417
1418         rt->rt_gateway = peer->redirect_learned.a4;
1419
1420         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1421         if (IS_ERR(n)) {
1422                 rt->rt_gateway = orig_gw;
1423                 return;
1424         }
1425         old_n = xchg(&rt->dst._neighbour, n);
1426         if (old_n)
1427                 neigh_release(old_n);
1428         if (!(n->nud_state & NUD_VALID)) {
1429                 neigh_event_send(n, NULL);
1430         } else {
1431                 rt->rt_flags |= RTCF_REDIRECTED;
1432                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1433         }
1434 }
1435
1436 /* called in rcu_read_lock() section */
1437 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1438                     __be32 saddr, struct net_device *dev)
1439 {
1440         int s, i;
1441         struct in_device *in_dev = __in_dev_get_rcu(dev);
1442         __be32 skeys[2] = { saddr, 0 };
1443         int    ikeys[2] = { dev->ifindex, 0 };
1444         struct inet_peer *peer;
1445         struct net *net;
1446
1447         if (!in_dev)
1448                 return;
1449
1450         net = dev_net(dev);
1451         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1452             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1453             ipv4_is_zeronet(new_gw))
1454                 goto reject_redirect;
1455
1456         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1457                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1458                         goto reject_redirect;
1459                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1460                         goto reject_redirect;
1461         } else {
1462                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1463                         goto reject_redirect;
1464         }
1465
1466         for (s = 0; s < 2; s++) {
1467                 for (i = 0; i < 2; i++) {
1468                         unsigned int hash;
1469                         struct rtable __rcu **rthp;
1470                         struct rtable *rt;
1471
1472                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1473
1474                         rthp = &rt_hash_table[hash].chain;
1475
1476                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1477                                 rthp = &rt->dst.rt_next;
1478
1479                                 if (rt->rt_key_dst != daddr ||
1480                                     rt->rt_key_src != skeys[s] ||
1481                                     rt->rt_oif != ikeys[i] ||
1482                                     rt_is_input_route(rt) ||
1483                                     rt_is_expired(rt) ||
1484                                     !net_eq(dev_net(rt->dst.dev), net) ||
1485                                     rt->dst.error ||
1486                                     rt->dst.dev != dev ||
1487                                     rt->rt_gateway != old_gw)
1488                                         continue;
1489
1490                                 if (!rt->peer)
1491                                         rt_bind_peer(rt, rt->rt_dst, 1);
1492
1493                                 peer = rt->peer;
1494                                 if (peer) {
1495                                         if (peer->redirect_learned.a4 != new_gw ||
1496                                             peer->redirect_genid != redirect_genid) {
1497                                                 peer->redirect_learned.a4 = new_gw;
1498                                                 peer->redirect_genid = redirect_genid;
1499                                                 atomic_inc(&__rt_peer_genid);
1500                                         }
1501                                         check_peer_redir(&rt->dst, peer);
1502                                 }
1503                         }
1504                 }
1505         }
1506         return;
1507
1508 reject_redirect:
1509 #ifdef CONFIG_IP_ROUTE_VERBOSE
1510         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1511                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1512                         "  Advised path = %pI4 -> %pI4\n",
1513                        &old_gw, dev->name, &new_gw,
1514                        &saddr, &daddr);
1515 #endif
1516         ;
1517 }
1518
1519 static bool peer_pmtu_expired(struct inet_peer *peer)
1520 {
1521         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1522
1523         return orig &&
1524                time_after_eq(jiffies, orig) &&
1525                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1526 }
1527
1528 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1529 {
1530         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1531
1532         return orig &&
1533                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1534 }
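     /*
      * Both helpers above claim a pending pmtu_expires value atomically:
      * cmpxchg()-ing it back to zero guarantees that only one caller gets
      * a "true" result for a given learned PMTU, so the original MTU
      * (pmtu_orig) is restored exactly once.  peer_pmtu_expired()
      * additionally requires that the expiry time has actually passed.
      */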
1535
1536 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1537 {
1538         struct rtable *rt = (struct rtable *)dst;
1539         struct dst_entry *ret = dst;
1540
1541         if (rt) {
1542                 if (dst->obsolete > 0) {
1543                         ip_rt_put(rt);
1544                         ret = NULL;
1545                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1546                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1547                                                 rt->rt_oif,
1548                                                 rt_genid(dev_net(dst->dev)));
1549                         rt_del(hash, rt);
1550                         ret = NULL;
1551                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1552                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1553                 }
1554         }
1555         return ret;
1556 }
1557
1558 /*
1559  * Algorithm:
1560  *      1. The first ip_rt_redirect_number redirects are sent
1561  *         with exponential backoff, then we stop sending them at all,
1562  *         assuming that the host ignores our redirects.
1563  *      2. If we did not see packets requiring redirects
1564  *         during ip_rt_redirect_silence, we assume that the host
1565  *         forgot redirected route and start to send redirects again.
1566  *
1567  * This algorithm is much cheaper and more intelligent than dumb load limiting
1568  * in icmp.c.
1569  *
1570  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1571  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1572  */
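     /*
      * Worked example of the schedule above, assuming the default sysctls
      * defined near the top of this file (ip_rt_redirect_load = HZ/50,
      * ip_rt_redirect_number = 9, ip_rt_redirect_silence = (HZ/50) << 10):
      * after the n-th redirect the next one is only sent once jiffies
      * passes rate_last + (ip_rt_redirect_load << n), so the gaps double
      * from ~40ms up to ~5s; after 9 unanswered redirects we stop, and a
      * quiet spell longer than ~20s resets rate_tokens so the cycle can
      * start again.
      */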
1573
1574 void ip_rt_send_redirect(struct sk_buff *skb)
1575 {
1576         struct rtable *rt = skb_rtable(skb);
1577         struct in_device *in_dev;
1578         struct inet_peer *peer;
1579         int log_martians;
1580
1581         rcu_read_lock();
1582         in_dev = __in_dev_get_rcu(rt->dst.dev);
1583         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1584                 rcu_read_unlock();
1585                 return;
1586         }
1587         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1588         rcu_read_unlock();
1589
1590         if (!rt->peer)
1591                 rt_bind_peer(rt, rt->rt_dst, 1);
1592         peer = rt->peer;
1593         if (!peer) {
1594                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1595                 return;
1596         }
1597
1598         /* No redirected packets during ip_rt_redirect_silence;
1599          * reset the algorithm.
1600          */
1601         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1602                 peer->rate_tokens = 0;
1603
1604         /* Too many ignored redirects; do not send anything.
1605          * Set peer->rate_last to the last seen redirected packet.
1606          */
1607         if (peer->rate_tokens >= ip_rt_redirect_number) {
1608                 peer->rate_last = jiffies;
1609                 return;
1610         }
1611
1612         /* Check for load limit; set rate_last to the latest sent
1613          * redirect.
1614          */
1615         if (peer->rate_tokens == 0 ||
1616             time_after(jiffies,
1617                        (peer->rate_last +
1618                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1619                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1620                 peer->rate_last = jiffies;
1621                 ++peer->rate_tokens;
1622 #ifdef CONFIG_IP_ROUTE_VERBOSE
1623                 if (log_martians &&
1624                     peer->rate_tokens == ip_rt_redirect_number &&
1625                     net_ratelimit())
1626                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1627                                &ip_hdr(skb)->saddr, rt->rt_iif,
1628                                 &rt->rt_dst, &rt->rt_gateway);
1629 #endif
1630         }
1631 }
1632
1633 static int ip_error(struct sk_buff *skb)
1634 {
1635         struct rtable *rt = skb_rtable(skb);
1636         struct inet_peer *peer;
1637         unsigned long now;
1638         bool send;
1639         int code;
1640
1641         switch (rt->dst.error) {
1642         case EINVAL:
1643         default:
1644                 goto out;
1645         case EHOSTUNREACH:
1646                 code = ICMP_HOST_UNREACH;
1647                 break;
1648         case ENETUNREACH:
1649                 code = ICMP_NET_UNREACH;
1650                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1651                                 IPSTATS_MIB_INNOROUTES);
1652                 break;
1653         case EACCES:
1654                 code = ICMP_PKT_FILTERED;
1655                 break;
1656         }
1657
1658         if (!rt->peer)
1659                 rt_bind_peer(rt, rt->rt_dst, 1);
1660         peer = rt->peer;
1661
1662         send = true;
1663         if (peer) {
1664                 now = jiffies;
1665                 peer->rate_tokens += now - peer->rate_last;
1666                 if (peer->rate_tokens > ip_rt_error_burst)
1667                         peer->rate_tokens = ip_rt_error_burst;
1668                 peer->rate_last = now;
1669                 if (peer->rate_tokens >= ip_rt_error_cost)
1670                         peer->rate_tokens -= ip_rt_error_cost;
1671                 else
1672                         send = false;
1673         }
1674         if (send)
1675                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1676
1677 out:    kfree_skb(skb);
1678         return 0;
1679 }
1680
1681 /*
1682  *      The last two values are not from the RFC but
1683  *      are needed for AMPRnet AX.25 paths.
1684  */
1685
1686 static const unsigned short mtu_plateau[] =
1687 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1688
1689 static inline unsigned short guess_mtu(unsigned short old_mtu)
1690 {
1691         int i;
1692
1693         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1694                 if (old_mtu > mtu_plateau[i])
1695                         return mtu_plateau[i];
1696         return 68;
1697 }
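     /*
      * Example: guess_mtu() returns the largest plateau strictly below the
      * failing size, so guess_mtu(1500) -> 1492, guess_mtu(576) -> 296,
      * and anything at or below 128 falls through to the 68-byte minimum.
      */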
1698
1699 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1700                                  unsigned short new_mtu,
1701                                  struct net_device *dev)
1702 {
1703         unsigned short old_mtu = ntohs(iph->tot_len);
1704         unsigned short est_mtu = 0;
1705         struct inet_peer *peer;
1706
1707         peer = inet_getpeer_v4(iph->daddr, 1);
1708         if (peer) {
1709                 unsigned short mtu = new_mtu;
1710
1711                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1712                         /* BSD 4.2 derived systems incorrectly adjust
1713                          * tot_len by the IP header length, and report
1714                          * a zero MTU in the ICMP message.
1715                          */
1716                         if (mtu == 0 &&
1717                             old_mtu >= 68 + (iph->ihl << 2))
1718                                 old_mtu -= iph->ihl << 2;
1719                         mtu = guess_mtu(old_mtu);
1720                 }
1721
1722                 if (mtu < ip_rt_min_pmtu)
1723                         mtu = ip_rt_min_pmtu;
1724                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1725                         unsigned long pmtu_expires;
1726
1727                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1728                         if (!pmtu_expires)
1729                                 pmtu_expires = 1UL;
1730
1731                         est_mtu = mtu;
1732                         peer->pmtu_learned = mtu;
1733                         peer->pmtu_expires = pmtu_expires;
1734                         atomic_inc(&__rt_peer_genid);
1735                 }
1736
1737                 inet_putpeer(peer);
1738         }
1739         return est_mtu ? : new_mtu;
1740 }
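     /*
      * Worked example of the BSD 4.2 workaround above (illustrative
      * numbers only): an ICMP frag-needed quoting tot_len = 1520, ihl = 5
      * and a next-hop MTU of 0 gives old_mtu = 1520 - 20 = 1500, so
      * guess_mtu() picks 1492; that value (raised to ip_rt_min_pmtu if
      * below it) is stored in the peer as pmtu_learned with a fresh
      * pmtu_expires and returned as est_mtu.  If nothing new is learned,
      * the caller's new_mtu is returned unchanged (est_mtu ? : new_mtu).
      */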
1741
1742 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1743 {
1744         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1745
1746         if (!expires)
1747                 return;
1748         if (time_before(jiffies, expires)) {
1749                 u32 orig_dst_mtu = dst_mtu(dst);
1750                 if (peer->pmtu_learned < orig_dst_mtu) {
1751                         if (!peer->pmtu_orig)
1752                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1753                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1754                 }
1755         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1756                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1757 }
1758
1759 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1760 {
1761         struct rtable *rt = (struct rtable *) dst;
1762         struct inet_peer *peer;
1763
1764         dst_confirm(dst);
1765
1766         if (!rt->peer)
1767                 rt_bind_peer(rt, rt->rt_dst, 1);
1768         peer = rt->peer;
1769         if (peer) {
1770                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1771
1772                 if (mtu < ip_rt_min_pmtu)
1773                         mtu = ip_rt_min_pmtu;
1774                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1775
1776                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1777                         if (!pmtu_expires)
1778                                 pmtu_expires = 1UL;
1779
1780                         peer->pmtu_learned = mtu;
1781                         peer->pmtu_expires = pmtu_expires;
1782
1783                         atomic_inc(&__rt_peer_genid);
1784                         rt->rt_peer_genid = rt_peer_genid();
1785                 }
1786                 check_peer_pmtu(dst, peer);
1787         }
1788 }
1789
1790
1791 static void ipv4_validate_peer(struct rtable *rt)
1792 {
1793         if (rt->rt_peer_genid != rt_peer_genid()) {
1794                 struct inet_peer *peer;
1795
1796                 if (!rt->peer)
1797                         rt_bind_peer(rt, rt->rt_dst, 0);
1798
1799                 peer = rt->peer;
1800                 if (peer) {
1801                         check_peer_pmtu(&rt->dst, peer);
1802
1803                         if (peer->redirect_genid != redirect_genid)
1804                                 peer->redirect_learned.a4 = 0;
1805                         if (peer->redirect_learned.a4 &&
1806                             peer->redirect_learned.a4 != rt->rt_gateway)
1807                                 check_peer_redir(&rt->dst, peer);
1808                 }
1809
1810                 rt->rt_peer_genid = rt_peer_genid();
1811         }
1812 }
1813
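     /*
      * dst_ops->check hook: a cached route is reported as still usable
      * only while its rt_genid matches the current per-namespace
      * generation (bumping that generation invalidates the whole cache in
      * one go), and ipv4_validate_peer() above re-applies any PMTU or
      * redirect state learned through the bound inet_peer since the entry
      * was cached.
      */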
1814 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1815 {
1816         struct rtable *rt = (struct rtable *) dst;
1817
1818         if (rt_is_expired(rt))
1819                 return NULL;
1820         ipv4_validate_peer(rt);
1821         return dst;
1822 }
1823
1824 static void ipv4_dst_destroy(struct dst_entry *dst)
1825 {
1826         struct rtable *rt = (struct rtable *) dst;
1827         struct inet_peer *peer = rt->peer;
1828
1829         if (rt->fi) {
1830                 fib_info_put(rt->fi);
1831                 rt->fi = NULL;
1832         }
1833         if (peer) {
1834                 rt->peer = NULL;
1835                 inet_putpeer(peer);
1836         }
1837 }
1838
1839
1840 static void ipv4_link_failure(struct sk_buff *skb)
1841 {
1842         struct rtable *rt;
1843
1844         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1845
1846         rt = skb_rtable(skb);
1847         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1848                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1849 }
1850
1851 static int ip_rt_bug(struct sk_buff *skb)
1852 {
1853         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1854                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1855                 skb->dev ? skb->dev->name : "?");
1856         kfree_skb(skb);
1857         WARN_ON(1);
1858         return 0;
1859 }
1860
1861 /*
1862    We do not cache the source address of the outgoing interface,
1863    because it is used only by the IP RR, TS and SRR options,
1864    so it is out of the fast path.
1865
1866    BTW remember: "addr" is allowed to be unaligned
1867    in IP options!
1868  */
1869
1870 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1871 {
1872         __be32 src;
1873
1874         if (rt_is_output_route(rt))
1875                 src = ip_hdr(skb)->saddr;
1876         else {
1877                 struct fib_result res;
1878                 struct flowi4 fl4;
1879                 struct iphdr *iph;
1880
1881                 iph = ip_hdr(skb);
1882
1883                 memset(&fl4, 0, sizeof(fl4));
1884                 fl4.daddr = iph->daddr;
1885                 fl4.saddr = iph->saddr;
1886                 fl4.flowi4_tos = RT_TOS(iph->tos);
1887                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1888                 fl4.flowi4_iif = skb->dev->ifindex;
1889                 fl4.flowi4_mark = skb->mark;
1890
1891                 rcu_read_lock();
1892                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1893                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1894                 else
1895                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1896                                         RT_SCOPE_UNIVERSE);
1897                 rcu_read_unlock();
1898         }
1899         memcpy(addr, &src, 4);
1900 }
1901
1902 #ifdef CONFIG_IP_ROUTE_CLASSID
1903 static void set_class_tag(struct rtable *rt, u32 tag)
1904 {
1905         if (!(rt->dst.tclassid & 0xFFFF))
1906                 rt->dst.tclassid |= tag & 0xFFFF;
1907         if (!(rt->dst.tclassid & 0xFFFF0000))
1908                 rt->dst.tclassid |= tag & 0xFFFF0000;
1909 }
1910 #endif
1911
1912 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1913 {
1914         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1915
1916         if (advmss == 0) {
1917                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1918                                ip_rt_min_advmss);
1919                 if (advmss > 65535 - 40)
1920                         advmss = 65535 - 40;
1921         }
1922         return advmss;
1923 }
1924
1925 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1926 {
1927         const struct rtable *rt = (const struct rtable *) dst;
1928         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1929
1930         if (mtu && rt_is_output_route(rt))
1931                 return mtu;
1932
1933         mtu = dst->dev->mtu;
1934
1935         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1936
1937                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1938                         mtu = 576;
1939         }
1940
1941         if (mtu > IP_MAX_MTU)
1942                 mtu = IP_MAX_MTU;
1943
1944         return mtu;
1945 }
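     /*
      * Summary of the MTU selection above: an explicit RTAX_MTU metric is
      * returned as-is for output routes; otherwise the device MTU is used,
      * capped at 576 when the metric is locked and the route is gatewayed
      * (rt_gateway != rt_dst), and never more than IP_MAX_MTU.
      */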
1946
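     /*
      * Metric placement: when an inet_peer can be bound (it is created on
      * demand only for flows flagged FLOWI_FLAG_PRECOW_METRICS), the
      * route's metrics live in the shared, writable peer->metrics array,
      * seeded from the FIB metrics on first use; without a peer the route
      * uses the fib_info metrics read-only, taking a reference via rt->fi
      * when they are not the defaults.
      */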
1947 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1948                             struct fib_info *fi)
1949 {
1950         struct inet_peer *peer;
1951         int create = 0;
1952
1953         /* If a peer entry exists for this destination, we must hook
1954          * it up in order to get at cached metrics.
1955          */
1956         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1957                 create = 1;
1958
1959         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1960         if (peer) {
1961                 rt->rt_peer_genid = rt_peer_genid();
1962                 if (inet_metrics_new(peer))
1963                         memcpy(peer->metrics, fi->fib_metrics,
1964                                sizeof(u32) * RTAX_MAX);
1965                 dst_init_metrics(&rt->dst, peer->metrics, false);
1966
1967                 check_peer_pmtu(&rt->dst, peer);
1968                 if (peer->redirect_genid != redirect_genid)
1969                         peer->redirect_learned.a4 = 0;
1970                 if (peer->redirect_learned.a4 &&
1971                     peer->redirect_learned.a4 != rt->rt_gateway) {
1972                         rt->rt_gateway = peer->redirect_learned.a4;
1973                         rt->rt_flags |= RTCF_REDIRECTED;
1974                 }
1975         } else {
1976                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1977                         rt->fi = fi;
1978                         atomic_inc(&fi->fib_clntref);
1979                 }
1980                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1981         }
1982 }
1983
1984 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1985                            const struct fib_result *res,
1986                            struct fib_info *fi, u16 type, u32 itag)
1987 {
1988         struct dst_entry *dst = &rt->dst;
1989
1990         if (fi) {
1991                 if (FIB_RES_GW(*res) &&
1992                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1993                         rt->rt_gateway = FIB_RES_GW(*res);
1994                 rt_init_metrics(rt, fl4, fi);
1995 #ifdef CONFIG_IP_ROUTE_CLASSID
1996                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1997 #endif
1998         }
1999
2000         if (dst_mtu(dst) > IP_MAX_MTU)
2001                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
2002         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
2003                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
2004
2005 #ifdef CONFIG_IP_ROUTE_CLASSID
2006 #ifdef CONFIG_IP_MULTIPLE_TABLES
2007         set_class_tag(rt, fib_rules_tclass(res));
2008 #endif
2009         set_class_tag(rt, itag);
2010 #endif
2011 }
2012
2013 static struct rtable *rt_dst_alloc(struct net_device *dev,
2014                                    bool nopolicy, bool noxfrm)
2015 {
2016         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2017                          DST_HOST |
2018                          (nopolicy ? DST_NOPOLICY : 0) |
2019                          (noxfrm ? DST_NOXFRM : 0));
2020 }
2021
2022 /* called in rcu_read_lock() section */
2023 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2024                                 u8 tos, struct net_device *dev, int our)
2025 {
2026         unsigned int hash;
2027         struct rtable *rth;
2028         __be32 spec_dst;
2029         struct in_device *in_dev = __in_dev_get_rcu(dev);
2030         u32 itag = 0;
2031         int err;
2032
2033         /* Primary sanity checks. */
2034
2035         if (in_dev == NULL)
2036                 return -EINVAL;
2037
2038         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2039             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2040                 goto e_inval;
2041
2042         if (ipv4_is_zeronet(saddr)) {
2043                 if (!ipv4_is_local_multicast(daddr))
2044                         goto e_inval;
2045                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2046         } else {
2047                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2048                                           &itag);
2049                 if (err < 0)
2050                         goto e_err;
2051         }
2052         rth = rt_dst_alloc(init_net.loopback_dev,
2053                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2054         if (!rth)
2055                 goto e_nobufs;
2056
2057 #ifdef CONFIG_IP_ROUTE_CLASSID
2058         rth->dst.tclassid = itag;
2059 #endif
2060         rth->dst.output = ip_rt_bug;
2061
2062         rth->rt_key_dst = daddr;
2063         rth->rt_key_src = saddr;
2064         rth->rt_genid   = rt_genid(dev_net(dev));
2065         rth->rt_flags   = RTCF_MULTICAST;
2066         rth->rt_type    = RTN_MULTICAST;
2067         rth->rt_key_tos = tos;
2068         rth->rt_dst     = daddr;
2069         rth->rt_src     = saddr;
2070         rth->rt_route_iif = dev->ifindex;
2071         rth->rt_iif     = dev->ifindex;
2072         rth->rt_oif     = 0;
2073         rth->rt_mark    = skb->mark;
2074         rth->rt_gateway = daddr;
2075         rth->rt_spec_dst= spec_dst;
2076         rth->rt_peer_genid = 0;
2077         rth->peer = NULL;
2078         rth->fi = NULL;
2079         if (our) {
2080                 rth->dst.input= ip_local_deliver;
2081                 rth->rt_flags |= RTCF_LOCAL;
2082         }
2083
2084 #ifdef CONFIG_IP_MROUTE
2085         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2086                 rth->dst.input = ip_mr_input;
2087 #endif
2088         RT_CACHE_STAT_INC(in_slow_mc);
2089
2090         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2091         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2092         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2093
2094 e_nobufs:
2095         return -ENOBUFS;
2096 e_inval:
2097         return -EINVAL;
2098 e_err:
2099         return err;
2100 }
2101
2102
2103 static void ip_handle_martian_source(struct net_device *dev,
2104                                      struct in_device *in_dev,
2105                                      struct sk_buff *skb,
2106                                      __be32 daddr,
2107                                      __be32 saddr)
2108 {
2109         RT_CACHE_STAT_INC(in_martian_src);
2110 #ifdef CONFIG_IP_ROUTE_VERBOSE
2111         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2112                 /*
2113                  *      RFC1812 recommendation: if the source is martian,
2114                  *      the only hint is the MAC header.
2115                  */
2116                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2117                         &daddr, &saddr, dev->name);
2118                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2119                         int i;
2120                         const unsigned char *p = skb_mac_header(skb);
2121                         printk(KERN_WARNING "ll header: ");
2122                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2123                                 printk("%02x", *p);
2124                                 if (i < (dev->hard_header_len - 1))
2125                                         printk(":");
2126                         }
2127                         printk("\n");
2128                 }
2129         }
2130 #endif
2131 }
2132
2133 /* called in rcu_read_lock() section */
2134 static int __mkroute_input(struct sk_buff *skb,
2135                            const struct fib_result *res,
2136                            struct in_device *in_dev,
2137                            __be32 daddr, __be32 saddr, u32 tos,
2138                            struct rtable **result)
2139 {
2140         struct rtable *rth;
2141         int err;
2142         struct in_device *out_dev;
2143         unsigned int flags = 0;
2144         __be32 spec_dst;
2145         u32 itag;
2146
2147         /* get a working reference to the output device */
2148         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2149         if (out_dev == NULL) {
2150                 if (net_ratelimit())
2151                         printk(KERN_CRIT "Bug in ip_route_input" \
2152                                "_slow(). Please, report\n");
2153                 return -EINVAL;
2154         }
2155
2156
2157         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2158                                   in_dev->dev, &spec_dst, &itag);
2159         if (err < 0) {
2160                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2161                                          saddr);
2162
2163                 goto cleanup;
2164         }
2165
2166         if (err)
2167                 flags |= RTCF_DIRECTSRC;
2168
2169         if (out_dev == in_dev && err &&
2170             (IN_DEV_SHARED_MEDIA(out_dev) ||
2171              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2172                 flags |= RTCF_DOREDIRECT;
2173
2174         if (skb->protocol != htons(ETH_P_IP)) {
2175                 /* Not IP (i.e. ARP). Do not create a route if it is
2176                  * invalid for proxy arp. DNAT routes are always valid.
2177                  *
2178                  * The proxy arp feature has been extended to allow ARP
2179                  * replies back to the same interface, to support
2180                  * Private VLAN switch technologies. See arp.c.
2181                  */
2182                 if (out_dev == in_dev &&
2183                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2184                         err = -EINVAL;
2185                         goto cleanup;
2186                 }
2187         }
2188
2189         rth = rt_dst_alloc(out_dev->dev,
2190                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2191                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2192         if (!rth) {
2193                 err = -ENOBUFS;
2194                 goto cleanup;
2195         }
2196
2197         rth->rt_key_dst = daddr;
2198         rth->rt_key_src = saddr;
2199         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2200         rth->rt_flags = flags;
2201         rth->rt_type = res->type;
2202         rth->rt_key_tos = tos;
2203         rth->rt_dst     = daddr;
2204         rth->rt_src     = saddr;
2205         rth->rt_route_iif = in_dev->dev->ifindex;
2206         rth->rt_iif     = in_dev->dev->ifindex;
2207         rth->rt_oif     = 0;
2208         rth->rt_mark    = skb->mark;
2209         rth->rt_gateway = daddr;
2210         rth->rt_spec_dst= spec_dst;
2211         rth->rt_peer_genid = 0;
2212         rth->peer = NULL;
2213         rth->fi = NULL;
2214
2215         rth->dst.input = ip_forward;
2216         rth->dst.output = ip_output;
2217
2218         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2219
2220         *result = rth;
2221         err = 0;
2222  cleanup:
2223         return err;
2224 }
2225
2226 static int ip_mkroute_input(struct sk_buff *skb,
2227                             struct fib_result *res,
2228                             const struct flowi4 *fl4,
2229                             struct in_device *in_dev,
2230                             __be32 daddr, __be32 saddr, u32 tos)
2231 {
2232         struct rtable* rth = NULL;
2233         int err;
2234         unsigned hash;
2235
2236 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2237         if (res->fi && res->fi->fib_nhs > 1)
2238                 fib_select_multipath(res);
2239 #endif
2240
2241         /* create a routing cache entry */
2242         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2243         if (err)
2244                 return err;
2245
2246         /* put it into the cache */
2247         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2248                        rt_genid(dev_net(rth->dst.dev)));
2249         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2250         if (IS_ERR(rth))
2251                 return PTR_ERR(rth);
2252         return 0;
2253 }
2254
2255 /*
2256  *      NOTE. We drop all packets that have local source
2257  *      addresses, because every properly looped back packet
2258  *      must have the correct destination already attached by the output routine.
2259  *
2260  *      This approach solves two big problems:
2261  *      1. Non-simplex devices are handled properly.
2262  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2263  *      Called with rcu_read_lock().
2264  */
2265
2266 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2267                                u8 tos, struct net_device *dev)
2268 {
2269         struct fib_result res;
2270         struct in_device *in_dev = __in_dev_get_rcu(dev);
2271         struct flowi4   fl4;
2272         unsigned        flags = 0;
2273         u32             itag = 0;
2274         struct rtable * rth;
2275         unsigned        hash;
2276         __be32          spec_dst;
2277         int             err = -EINVAL;
2278         struct net    * net = dev_net(dev);
2279
2280         /* IP on this device is disabled. */
2281
2282         if (!in_dev)
2283                 goto out;
2284
2285         /* Check for the most weird martians, which cannot be detected
2286            by fib_lookup.
2287          */
2288
2289         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2290             ipv4_is_loopback(saddr))
2291                 goto martian_source;
2292
2293         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2294                 goto brd_input;
2295
2296         /* Accept zero addresses only to limited broadcast;
2297          * I do not even know whether to fix it or not. Waiting for complaints :-)
2298          */
2299         if (ipv4_is_zeronet(saddr))
2300                 goto martian_source;
2301
2302         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2303                 goto martian_destination;
2304
2305         /*
2306          *      Now we are ready to route the packet.
2307          */
2308         fl4.flowi4_oif = 0;
2309         fl4.flowi4_iif = dev->ifindex;
2310         fl4.flowi4_mark = skb->mark;
2311         fl4.flowi4_tos = tos;
2312         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2313         fl4.daddr = daddr;
2314         fl4.saddr = saddr;
2315         err = fib_lookup(net, &fl4, &res);
2316         if (err != 0) {
2317                 if (!IN_DEV_FORWARD(in_dev))
2318                         goto e_hostunreach;
2319                 goto no_route;
2320         }
2321
2322         RT_CACHE_STAT_INC(in_slow_tot);
2323
2324         if (res.type == RTN_BROADCAST)
2325                 goto brd_input;
2326
2327         if (res.type == RTN_LOCAL) {
2328                 err = fib_validate_source(skb, saddr, daddr, tos,
2329                                           net->loopback_dev->ifindex,
2330                                           dev, &spec_dst, &itag);
2331                 if (err < 0)
2332                         goto martian_source_keep_err;
2333                 if (err)
2334                         flags |= RTCF_DIRECTSRC;
2335                 spec_dst = daddr;
2336                 goto local_input;
2337         }
2338
2339         if (!IN_DEV_FORWARD(in_dev))
2340                 goto e_hostunreach;
2341         if (res.type != RTN_UNICAST)
2342                 goto martian_destination;
2343
2344         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2345 out:    return err;
2346
2347 brd_input:
2348         if (skb->protocol != htons(ETH_P_IP))
2349                 goto e_inval;
2350
2351         if (ipv4_is_zeronet(saddr))
2352                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2353         else {
2354                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2355                                           &itag);
2356                 if (err < 0)
2357                         goto martian_source_keep_err;
2358                 if (err)
2359                         flags |= RTCF_DIRECTSRC;
2360         }
2361         flags |= RTCF_BROADCAST;
2362         res.type = RTN_BROADCAST;
2363         RT_CACHE_STAT_INC(in_brd);
2364
2365 local_input:
2366         rth = rt_dst_alloc(net->loopback_dev,
2367                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2368         if (!rth)
2369                 goto e_nobufs;
2370
2371         rth->dst.input= ip_local_deliver;
2372         rth->dst.output= ip_rt_bug;
2373 #ifdef CONFIG_IP_ROUTE_CLASSID
2374         rth->dst.tclassid = itag;
2375 #endif
2376
2377         rth->rt_key_dst = daddr;
2378         rth->rt_key_src = saddr;
2379         rth->rt_genid = rt_genid(net);
2380         rth->rt_flags   = flags|RTCF_LOCAL;
2381         rth->rt_type    = res.type;
2382         rth->rt_key_tos = tos;
2383         rth->rt_dst     = daddr;
2384         rth->rt_src     = saddr;
2385 #ifdef CONFIG_IP_ROUTE_CLASSID
2386         rth->dst.tclassid = itag;
2387 #endif
2388         rth->rt_route_iif = dev->ifindex;
2389         rth->rt_iif     = dev->ifindex;
2390         rth->rt_oif     = 0;
2391         rth->rt_mark    = skb->mark;
2392         rth->rt_gateway = daddr;
2393         rth->rt_spec_dst= spec_dst;
2394         rth->rt_peer_genid = 0;
2395         rth->peer = NULL;
2396         rth->fi = NULL;
2397         if (res.type == RTN_UNREACHABLE) {
2398                 rth->dst.input= ip_error;
2399                 rth->dst.error= -err;
2400                 rth->rt_flags   &= ~RTCF_LOCAL;
2401         }
2402         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2403         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2404         err = 0;
2405         if (IS_ERR(rth))
2406                 err = PTR_ERR(rth);
2407         goto out;
2408
2409 no_route:
2410         RT_CACHE_STAT_INC(in_no_route);
2411         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2412         res.type = RTN_UNREACHABLE;
2413         if (err == -ESRCH)
2414                 err = -ENETUNREACH;
2415         goto local_input;
2416
2417         /*
2418          *      Do not cache martian addresses: they should be logged (RFC1812)
2419          */
2420 martian_destination:
2421         RT_CACHE_STAT_INC(in_martian_dst);
2422 #ifdef CONFIG_IP_ROUTE_VERBOSE
2423         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2424                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2425                         &daddr, &saddr, dev->name);
2426 #endif
2427
2428 e_hostunreach:
2429         err = -EHOSTUNREACH;
2430         goto out;
2431
2432 e_inval:
2433         err = -EINVAL;
2434         goto out;
2435
2436 e_nobufs:
2437         err = -ENOBUFS;
2438         goto out;
2439
2440 martian_source:
2441         err = -EINVAL;
2442 martian_source_keep_err:
2443         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2444         goto out;
2445 }
2446
2447 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2448                            u8 tos, struct net_device *dev, bool noref)
2449 {
2450         struct rtable * rth;
2451         unsigned        hash;
2452         int iif = dev->ifindex;
2453         struct net *net;
2454         int res;
2455
2456         net = dev_net(dev);
2457
2458         rcu_read_lock();
2459
2460         if (!rt_caching(net))
2461                 goto skip_cache;
2462
2463         tos &= IPTOS_RT_MASK;
2464         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2465
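             /*
              * Cache lookup: the four key fields are compared by XOR-ing
              * each against the lookup key and OR-ing the results, so a
              * single compare against zero confirms dst, src, iif and tos
              * all match before the mark, namespace and generation checks.
              */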
2466         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2467              rth = rcu_dereference(rth->dst.rt_next)) {
2468                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2469                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2470                      (rth->rt_route_iif ^ iif) |
2471                      (rth->rt_key_tos ^ tos)) == 0 &&
2472                     rth->rt_mark == skb->mark &&
2473                     net_eq(dev_net(rth->dst.dev), net) &&
2474                     !rt_is_expired(rth)) {
2475                         ipv4_validate_peer(rth);
2476                         if (noref) {
2477                                 dst_use_noref(&rth->dst, jiffies);
2478                                 skb_dst_set_noref(skb, &rth->dst);
2479                         } else {
2480                                 dst_use(&rth->dst, jiffies);
2481                                 skb_dst_set(skb, &rth->dst);
2482                         }
2483                         RT_CACHE_STAT_INC(in_hit);
2484                         rcu_read_unlock();
2485                         return 0;
2486                 }
2487                 RT_CACHE_STAT_INC(in_hlist_search);
2488         }
2489
2490 skip_cache:
2491         /* Multicast recognition logic was moved from the route cache to here.
2492            The problem was that too many Ethernet cards have broken/missing
2493            hardware multicast filters :-( As a result, a host on a multicasting
2494            network acquires a lot of useless route cache entries, e.g. from
2495            SDR messages from all over the world. Now we try to get rid of them.
2496            Really, provided the software IP multicast filter is organized
2497            reasonably (at least, hashed), it does not result in a slowdown
2498            compared with route cache reject entries.
2499            Note that multicast routers are not affected, because
2500            a route cache entry is created eventually.
2501          */
2502         if (ipv4_is_multicast(daddr)) {
2503                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2504
2505                 if (in_dev) {
2506                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2507                                                   ip_hdr(skb)->protocol);
2508                         if (our
2509 #ifdef CONFIG_IP_MROUTE
2510                                 ||
2511                             (!ipv4_is_local_multicast(daddr) &&
2512                              IN_DEV_MFORWARD(in_dev))
2513 #endif
2514                            ) {
2515                                 int res = ip_route_input_mc(skb, daddr, saddr,
2516                                                             tos, dev, our);
2517                                 rcu_read_unlock();
2518                                 return res;
2519                         }
2520                 }
2521                 rcu_read_unlock();
2522                 return -EINVAL;
2523         }
2524         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2525         rcu_read_unlock();
2526         return res;
2527 }
2528 EXPORT_SYMBOL(ip_route_input_common);
2529
2530 /* called with rcu_read_lock() */
2531 static struct rtable *__mkroute_output(const struct fib_result *res,
2532                                        const struct flowi4 *fl4,
2533                                        __be32 orig_daddr, __be32 orig_saddr,
2534                                        int orig_oif, __u8 orig_rtos,
2535                                        struct net_device *dev_out,
2536                                        unsigned int flags)
2537 {
2538         struct fib_info *fi = res->fi;
2539         struct in_device *in_dev;
2540         u16 type = res->type;
2541         struct rtable *rth;
2542
2543         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2544                 return ERR_PTR(-EINVAL);
2545
2546         if (ipv4_is_lbcast(fl4->daddr))
2547                 type = RTN_BROADCAST;
2548         else if (ipv4_is_multicast(fl4->daddr))
2549                 type = RTN_MULTICAST;
2550         else if (ipv4_is_zeronet(fl4->daddr))
2551                 return ERR_PTR(-EINVAL);
2552
2553         if (dev_out->flags & IFF_LOOPBACK)
2554                 flags |= RTCF_LOCAL;
2555
2556         in_dev = __in_dev_get_rcu(dev_out);
2557         if (!in_dev)
2558                 return ERR_PTR(-EINVAL);
2559
2560         if (type == RTN_BROADCAST) {
2561                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2562                 fi = NULL;
2563         } else if (type == RTN_MULTICAST) {
2564                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2565                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2566                                      fl4->flowi4_proto))
2567                         flags &= ~RTCF_LOCAL;
2568                 /* If a multicast route does not exist, use the
2569                  * default one, but do not gateway in this case.
2570                  * Yes, it is a hack.
2571                  */
2572                 if (fi && res->prefixlen < 4)
2573                         fi = NULL;
2574         }
2575
2576         rth = rt_dst_alloc(dev_out,
2577                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2578                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2579         if (!rth)
2580                 return ERR_PTR(-ENOBUFS);
2581
2582         rth->dst.output = ip_output;
2583
2584         rth->rt_key_dst = orig_daddr;
2585         rth->rt_key_src = orig_saddr;
2586         rth->rt_genid = rt_genid(dev_net(dev_out));
2587         rth->rt_flags   = flags;
2588         rth->rt_type    = type;
2589         rth->rt_key_tos = orig_rtos;
2590         rth->rt_dst     = fl4->daddr;
2591         rth->rt_src     = fl4->saddr;
2592         rth->rt_route_iif = 0;
2593         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2594         rth->rt_oif     = orig_oif;
2595         rth->rt_mark    = fl4->flowi4_mark;
2596         rth->rt_gateway = fl4->daddr;
2597         rth->rt_spec_dst= fl4->saddr;
2598         rth->rt_peer_genid = 0;
2599         rth->peer = NULL;
2600         rth->fi = NULL;
2601
2602         RT_CACHE_STAT_INC(out_slow_tot);
2603
2604         if (flags & RTCF_LOCAL) {
2605                 rth->dst.input = ip_local_deliver;
2606                 rth->rt_spec_dst = fl4->daddr;
2607         }
2608         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2609                 rth->rt_spec_dst = fl4->saddr;
2610                 if (flags & RTCF_LOCAL &&
2611                     !(dev_out->flags & IFF_LOOPBACK)) {
2612                         rth->dst.output = ip_mc_output;
2613                         RT_CACHE_STAT_INC(out_slow_mc);
2614                 }
2615 #ifdef CONFIG_IP_MROUTE
2616                 if (type == RTN_MULTICAST) {
2617                         if (IN_DEV_MFORWARD(in_dev) &&
2618                             !ipv4_is_local_multicast(fl4->daddr)) {
2619                                 rth->dst.input = ip_mr_input;
2620                                 rth->dst.output = ip_mc_output;
2621                         }
2622                 }
2623 #endif
2624         }
2625
2626         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2627
2628         return rth;
2629 }
2630
2631 /*
2632  * Major route resolver routine.
2633  * Called with rcu_read_lock().
2634  */
2635
2636 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2637 {
2638         struct net_device *dev_out = NULL;
2639         __u8 tos = RT_FL_TOS(fl4);
2640         unsigned int flags = 0;
2641         struct fib_result res;
2642         struct rtable *rth;
2643         __be32 orig_daddr;
2644         __be32 orig_saddr;
2645         int orig_oif;
2646
2647         res.fi          = NULL;
2648 #ifdef CONFIG_IP_MULTIPLE_TABLES
2649         res.r           = NULL;
2650 #endif
2651
2652         orig_daddr = fl4->daddr;
2653         orig_saddr = fl4->saddr;
2654         orig_oif = fl4->flowi4_oif;
2655
2656         fl4->flowi4_iif = net->loopback_dev->ifindex;
2657         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2658         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2659                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2660
2661         rcu_read_lock();
2662         if (fl4->saddr) {
2663                 rth = ERR_PTR(-EINVAL);
2664                 if (ipv4_is_multicast(fl4->saddr) ||
2665                     ipv4_is_lbcast(fl4->saddr) ||
2666                     ipv4_is_zeronet(fl4->saddr))
2667                         goto out;
2668
2669                 /* I removed check for oif == dev_out->oif here.
2670                    It was wrong for two reasons:
2671                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2672                       is assigned to multiple interfaces.
2673                    2. Moreover, we are allowed to send packets with saddr
2674                       of another iface. --ANK
2675                  */
2676
2677                 if (fl4->flowi4_oif == 0 &&
2678                     (ipv4_is_multicast(fl4->daddr) ||
2679                      ipv4_is_lbcast(fl4->daddr))) {
2680                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2681                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2682                         if (dev_out == NULL)
2683                                 goto out;
2684
2685                         /* Special hack: the user can direct multicasts
2686                            and limited broadcast via the necessary interface
2687                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2688                            This hack is not just for fun; it allows
2689                            vic, vat and friends to work.
2690                            They bind a socket to loopback, set ttl to zero
2691                            and expect that it will work.
2692                            From the viewpoint of the routing cache they are broken,
2693                            because we are not allowed to build a multicast path
2694                            with a loopback source addr (look, the routing cache
2695                            cannot know that ttl is zero, so the packet
2696                            will not leave this host and the route is valid).
2697                            Luckily, this hack is a good workaround.
2698                          */
2699
2700                         fl4->flowi4_oif = dev_out->ifindex;
2701                         goto make_route;
2702                 }
2703
2704                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2705                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2706                         if (!__ip_dev_find(net, fl4->saddr, false))
2707                                 goto out;
2708                 }
2709         }
2710
2711
2712         if (fl4->flowi4_oif) {
2713                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2714                 rth = ERR_PTR(-ENODEV);
2715                 if (dev_out == NULL)
2716                         goto out;
2717
2718                 /* RACE: Check return value of inet_select_addr instead. */
2719                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2720                         rth = ERR_PTR(-ENETUNREACH);
2721                         goto out;
2722                 }
2723                 if (ipv4_is_local_multicast(fl4->daddr) ||
2724                     ipv4_is_lbcast(fl4->daddr)) {
2725                         if (!fl4->saddr)
2726                                 fl4->saddr = inet_select_addr(dev_out, 0,
2727                                                               RT_SCOPE_LINK);
2728                         goto make_route;
2729                 }
2730                 if (fl4->saddr) {
2731                         if (ipv4_is_multicast(fl4->daddr))
2732                                 fl4->saddr = inet_select_addr(dev_out, 0,
2733                                                               fl4->flowi4_scope);
2734                         else if (!fl4->daddr)
2735                                 fl4->saddr = inet_select_addr(dev_out, 0,
2736                                                               RT_SCOPE_HOST);
2737                 }
2738         }
2739
2740         if (!fl4->daddr) {
2741                 fl4->daddr = fl4->saddr;
2742                 if (!fl4->daddr)
2743                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2744                 dev_out = net->loopback_dev;
2745                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2746                 res.type = RTN_LOCAL;
2747                 flags |= RTCF_LOCAL;
2748                 goto make_route;
2749         }
2750
2751         if (fib_lookup(net, fl4, &res)) {
2752                 res.fi = NULL;
2753                 if (fl4->flowi4_oif) {
2754                         /* Apparently, routing tables are wrong. Assume,
2755                            that the destination is on link.
2756
2757                            WHY? DW.
2758                            Because we are allowed to send to iface
2759                            even if it has NO routes and NO assigned
2760                            addresses. When oif is specified, routing
2761                            tables are looked up with only one purpose:
2762                            to catch if destination is gatewayed, rather than
2763                            direct. Moreover, if MSG_DONTROUTE is set,
2764                            we send packet, ignoring both routing tables
2765                            and ifaddr state. --ANK
2766
2767
2768                            We could make it even if oif is unknown,
2769                            likely IPv6, but we do not.
2770                          */
2771
2772                         if (fl4->saddr == 0)
2773                                 fl4->saddr = inet_select_addr(dev_out, 0,
2774                                                               RT_SCOPE_LINK);
2775                         res.type = RTN_UNICAST;
2776                         goto make_route;
2777                 }
2778                 rth = ERR_PTR(-ENETUNREACH);
2779                 goto out;
2780         }
2781
2782         if (res.type == RTN_LOCAL) {
2783                 if (!fl4->saddr) {
2784                         if (res.fi->fib_prefsrc)
2785                                 fl4->saddr = res.fi->fib_prefsrc;
2786                         else
2787                                 fl4->saddr = fl4->daddr;
2788                 }
2789                 dev_out = net->loopback_dev;
2790                 fl4->flowi4_oif = dev_out->ifindex;
2791                 res.fi = NULL;
2792                 flags |= RTCF_LOCAL;
2793                 goto make_route;
2794         }
2795
2796 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2797         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2798                 fib_select_multipath(&res);
2799         else
2800 #endif
2801         if (!res.prefixlen &&
2802             res.table->tb_num_default > 1 &&
2803             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2804                 fib_select_default(&res);
2805
2806         if (!fl4->saddr)
2807                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2808
2809         dev_out = FIB_RES_DEV(res);
2810         fl4->flowi4_oif = dev_out->ifindex;
2811
2812
2813 make_route:
2814         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2815                                tos, dev_out, flags);
2816         if (!IS_ERR(rth)) {
2817                 unsigned int hash;
2818
2819                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2820                                rt_genid(dev_net(dev_out)));
2821                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2822         }
2823
2824 out:
2825         rcu_read_unlock();
2826         return rth;
2827 }
2828
2829 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2830 {
2831         struct rtable *rth;
2832         unsigned int hash;
2833
2834         if (!rt_caching(net))
2835                 goto slow_output;
2836
2837         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2838
2839         rcu_read_lock_bh();
2840         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2841                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2842                 if (rth->rt_key_dst == flp4->daddr &&
2843                     rth->rt_key_src == flp4->saddr &&
2844                     rt_is_output_route(rth) &&
2845                     rth->rt_oif == flp4->flowi4_oif &&
2846                     rth->rt_mark == flp4->flowi4_mark &&
2847                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2848                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2849                     net_eq(dev_net(rth->dst.dev), net) &&
2850                     !rt_is_expired(rth)) {
2851                         ipv4_validate_peer(rth);
2852                         dst_use(&rth->dst, jiffies);
2853                         RT_CACHE_STAT_INC(out_hit);
2854                         rcu_read_unlock_bh();
2855                         if (!flp4->saddr)
2856                                 flp4->saddr = rth->rt_src;
2857                         if (!flp4->daddr)
2858                                 flp4->daddr = rth->rt_dst;
2859                         return rth;
2860                 }
2861                 RT_CACHE_STAT_INC(out_hlist_search);
2862         }
2863         rcu_read_unlock_bh();
2864
2865 slow_output:
2866         return ip_route_output_slow(net, flp4);
2867 }
2868 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2869
2870 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2871 {
2872         return NULL;
2873 }
2874
2875 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2876 {
2877         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2878
2879         return mtu ? : dst->dev->mtu;
2880 }
2881
2882 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2883 {
2884 }
2885
2886 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2887                                           unsigned long old)
2888 {
2889         return NULL;
2890 }
2891
2892 static struct dst_ops ipv4_dst_blackhole_ops = {
2893         .family                 =       AF_INET,
2894         .protocol               =       cpu_to_be16(ETH_P_IP),
2895         .destroy                =       ipv4_dst_destroy,
2896         .check                  =       ipv4_blackhole_dst_check,
2897         .mtu                    =       ipv4_blackhole_mtu,
2898         .default_advmss         =       ipv4_default_advmss,
2899         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2900         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2901         .neigh_lookup           =       ipv4_neigh_lookup,
2902 };
2903
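     /*
      * ipv4_blackhole_route() below clones an existing route into a dst
      * whose input and output handlers are dst_discard, keeping the
      * original keys, metrics, peer and fib_info references; anything
      * routed through the clone is silently dropped (presumably for
      * callers, such as the xfrm lookup path, that need to hold a dst
      * they must not yet transmit on).
      */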
2904 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2905 {
2906         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2907         struct rtable *ort = (struct rtable *) dst_orig;
2908
2909         if (rt) {
2910                 struct dst_entry *new = &rt->dst;
2911
2912                 new->__use = 1;
2913                 new->input = dst_discard;
2914                 new->output = dst_discard;
2915                 dst_copy_metrics(new, &ort->dst);
2916
2917                 new->dev = ort->dst.dev;
2918                 if (new->dev)
2919                         dev_hold(new->dev);
2920
2921                 rt->rt_key_dst = ort->rt_key_dst;
2922                 rt->rt_key_src = ort->rt_key_src;
2923                 rt->rt_key_tos = ort->rt_key_tos;
2924                 rt->rt_route_iif = ort->rt_route_iif;
2925                 rt->rt_iif = ort->rt_iif;
2926                 rt->rt_oif = ort->rt_oif;
2927                 rt->rt_mark = ort->rt_mark;
2928
2929                 rt->rt_genid = rt_genid(net);
2930                 rt->rt_flags = ort->rt_flags;
2931                 rt->rt_type = ort->rt_type;
2932                 rt->rt_dst = ort->rt_dst;
2933                 rt->rt_src = ort->rt_src;
2934                 rt->rt_gateway = ort->rt_gateway;
2935                 rt->rt_spec_dst = ort->rt_spec_dst;
2936                 rt->peer = ort->peer;
2937                 if (rt->peer)
2938                         atomic_inc(&rt->peer->refcnt);
2939                 rt->fi = ort->fi;
2940                 if (rt->fi)
2941                         atomic_inc(&rt->fi->fib_clntref);
2942
2943                 dst_free(new);
2944         }
2945
2946         dst_release(dst_orig);
2947
2948         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2949 }
2950
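/*
 * ip_route_output_flow - route lookup plus IPsec policy check.
 *
 * Resolves the flow with __ip_route_output_key() and, if a transport
 * protocol is set in the flow, hands the result to xfrm_lookup() so an
 * IPsec bundle can replace the plain route.
 *
 * Illustrative caller (a sketch, not code from this file; dst_ip, sk
 * and net stand in for the caller's own state):
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= dst_ip,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */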
2951 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2952                                     struct sock *sk)
2953 {
2954         struct rtable *rt = __ip_route_output_key(net, flp4);
2955
2956         if (IS_ERR(rt))
2957                 return rt;
2958
2959         if (flp4->flowi4_proto)
2960                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2961                                                    flowi4_to_flowi(flp4),
2962                                                    sk, 0);
2963
2964         return rt;
2965 }
2966 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2967
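/*
 * rt_fill_info - encode one cached route as an RTM_NEWROUTE message.
 *
 * Emits the rtmsg header plus the RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY/
 * RTA_PREFSRC/RTA_MARK attributes as applicable, the route metrics, and
 * the cache info (id, TCP timestamps, PMTU expiry) taken from the
 * inetpeer.  Multicast input routes may be deferred to ipmr_get_route().
 * On overflow the message is cancelled and -EMSGSIZE is returned.
 */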
2968 static int rt_fill_info(struct net *net,
2969                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2970                         int nowait, unsigned int flags)
2971 {
2972         struct rtable *rt = skb_rtable(skb);
2973         struct rtmsg *r;
2974         struct nlmsghdr *nlh;
2975         unsigned long expires = 0;
2976         const struct inet_peer *peer = rt->peer;
2977         u32 id = 0, ts = 0, tsage = 0, error;
2978
2979         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2980         if (nlh == NULL)
2981                 return -EMSGSIZE;
2982
2983         r = nlmsg_data(nlh);
2984         r->rtm_family    = AF_INET;
2985         r->rtm_dst_len  = 32;
2986         r->rtm_src_len  = 0;
2987         r->rtm_tos      = rt->rt_key_tos;
2988         r->rtm_table    = RT_TABLE_MAIN;
2989         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2990         r->rtm_type     = rt->rt_type;
2991         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2992         r->rtm_protocol = RTPROT_UNSPEC;
2993         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2994         if (rt->rt_flags & RTCF_NOTIFY)
2995                 r->rtm_flags |= RTM_F_NOTIFY;
2996
2997         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2998
2999         if (rt->rt_key_src) {
3000                 r->rtm_src_len = 32;
3001                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
3002         }
3003         if (rt->dst.dev)
3004                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
3005 #ifdef CONFIG_IP_ROUTE_CLASSID
3006         if (rt->dst.tclassid)
3007                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3008 #endif
3009         if (rt_is_input_route(rt))
3010                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3011         else if (rt->rt_src != rt->rt_key_src)
3012                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3013
3014         if (rt->rt_dst != rt->rt_gateway)
3015                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3016
3017         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3018                 goto nla_put_failure;
3019
3020         if (rt->rt_mark)
3021                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3022
3023         error = rt->dst.error;
3024         if (peer) {
3025                 inet_peer_refcheck(rt->peer);
3026                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3027                 if (peer->tcp_ts_stamp) {
3028                         ts = peer->tcp_ts;
3029                         tsage = get_seconds() - peer->tcp_ts_stamp;
3030                 }
3031                 expires = ACCESS_ONCE(peer->pmtu_expires);
3032                 if (expires) {
3033                         if (time_before(jiffies, expires))
3034                                 expires -= jiffies;
3035                         else
3036                                 expires = 0;
3037                 }
3038         }
3039
3040         if (rt_is_input_route(rt)) {
3041 #ifdef CONFIG_IP_MROUTE
3042                 __be32 dst = rt->rt_dst;
3043
3044                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3045                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3046                         int err = ipmr_get_route(net, skb,
3047                                                  rt->rt_src, rt->rt_dst,
3048                                                  r, nowait);
3049                         if (err <= 0) {
3050                                 if (!nowait) {
3051                                         if (err == 0)
3052                                                 return 0;
3053                                         goto nla_put_failure;
3054                                 } else {
3055                                         if (err == -EMSGSIZE)
3056                                                 goto nla_put_failure;
3057                                         error = err;
3058                                 }
3059                         }
3060                 } else
3061 #endif
3062                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3063         }
3064
3065         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3066                                expires, error) < 0)
3067                 goto nla_put_failure;
3068
3069         return nlmsg_end(skb, nlh);
3070
3071 nla_put_failure:
3072         nlmsg_cancel(skb, nlh);
3073         return -EMSGSIZE;
3074 }
3075
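/*
 * inet_rtm_getroute - handle an RTM_GETROUTE request (the kernel side
 * of requests such as "ip route get").
 *
 * Parses the RTA_* attributes, then either simulates an input lookup
 * (when RTA_IIF is given) by feeding a dummy skb to ip_route_input(),
 * or performs an output lookup via ip_route_output_key().  The result
 * is encoded with rt_fill_info() and unicast back to the requester.
 */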
3076 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3077 {
3078         struct net *net = sock_net(in_skb->sk);
3079         struct rtmsg *rtm;
3080         struct nlattr *tb[RTA_MAX+1];
3081         struct rtable *rt = NULL;
3082         __be32 dst = 0;
3083         __be32 src = 0;
3084         u32 iif;
3085         int err;
3086         int mark;
3087         struct sk_buff *skb;
3088
3089         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3090         if (err < 0)
3091                 goto errout;
3092
3093         rtm = nlmsg_data(nlh);
3094
3095         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3096         if (skb == NULL) {
3097                 err = -ENOBUFS;
3098                 goto errout;
3099         }
3100
3101         /* Reserve room for dummy headers; this skb can pass
3102            through a good chunk of the routing engine.
3103          */
3104         skb_reset_mac_header(skb);
3105         skb_reset_network_header(skb);
3106
3107         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3108         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3109         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3110
3111         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3112         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3113         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3114         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3115
3116         if (iif) {
3117                 struct net_device *dev;
3118
3119                 dev = __dev_get_by_index(net, iif);
3120                 if (dev == NULL) {
3121                         err = -ENODEV;
3122                         goto errout_free;
3123                 }
3124
3125                 skb->protocol   = htons(ETH_P_IP);
3126                 skb->dev        = dev;
3127                 skb->mark       = mark;
3128                 local_bh_disable();
3129                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3130                 local_bh_enable();
3131
3132                 rt = skb_rtable(skb);
3133                 if (err == 0 && rt->dst.error)
3134                         err = -rt->dst.error;
3135         } else {
3136                 struct flowi4 fl4 = {
3137                         .daddr = dst,
3138                         .saddr = src,
3139                         .flowi4_tos = rtm->rtm_tos,
3140                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3141                         .flowi4_mark = mark,
3142                 };
3143                 rt = ip_route_output_key(net, &fl4);
3144
3145                 err = 0;
3146                 if (IS_ERR(rt))
3147                         err = PTR_ERR(rt);
3148         }
3149
3150         if (err)
3151                 goto errout_free;
3152
3153         skb_dst_set(skb, &rt->dst);
3154         if (rtm->rtm_flags & RTM_F_NOTIFY)
3155                 rt->rt_flags |= RTCF_NOTIFY;
3156
3157         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3158                            RTM_NEWROUTE, 0, 0);
3159         if (err <= 0)
3160                 goto errout_free;
3161
3162         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3163 errout:
3164         return err;
3165
3166 errout_free:
3167         kfree_skb(skb);
3168         goto errout;
3169 }
3170
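/*
 * ip_rt_dump - netlink dump callback walking the whole route cache.
 *
 * Iterates every hash bucket and chain entry belonging to this netns,
 * skipping expired entries, and emits one RTM_NEWROUTE message per
 * route.  cb->args[0]/[1] record the bucket and chain index so a
 * partially filled skb can be resumed on the next dump call.
 */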
3171 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3172 {
3173         struct rtable *rt;
3174         int h, s_h;
3175         int idx, s_idx;
3176         struct net *net;
3177
3178         net = sock_net(skb->sk);
3179
3180         s_h = cb->args[0];
3181         if (s_h < 0)
3182                 s_h = 0;
3183         s_idx = idx = cb->args[1];
3184         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3185                 if (!rt_hash_table[h].chain)
3186                         continue;
3187                 rcu_read_lock_bh();
3188                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3189                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3190                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3191                                 continue;
3192                         if (rt_is_expired(rt))
3193                                 continue;
3194                         skb_dst_set_noref(skb, &rt->dst);
3195                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3196                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3197                                          1, NLM_F_MULTI) <= 0) {
3198                                 skb_dst_drop(skb);
3199                                 rcu_read_unlock_bh();
3200                                 goto done;
3201                         }
3202                         skb_dst_drop(skb);
3203                 }
3204                 rcu_read_unlock_bh();
3205         }
3206
3207 done:
3208         cb->args[0] = h;
3209         cb->args[1] = idx;
3210         return skb->len;
3211 }
3212
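/*
 * ip_rt_multicast_event - multicast configuration on @in_dev changed;
 * simply flush the routing cache of its namespace.
 */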
3213 void ip_rt_multicast_event(struct in_device *in_dev)
3214 {
3215         rt_cache_flush(dev_net(in_dev->dev), 0);
3216 }
3217
3218 #ifdef CONFIG_SYSCTL
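/*
 * Handler for the "flush" sysctl: a write parses the value as a flush
 * delay and passes it to rt_cache_flush() for the owning netns (taken
 * from ->extra1); reads are rejected with -EINVAL.
 */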
3219 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3220                                         void __user *buffer,
3221                                         size_t *lenp, loff_t *ppos)
3222 {
3223         if (write) {
3224                 int flush_delay;
3225                 ctl_table ctl;
3226                 struct net *net;
3227
3228                 memcpy(&ctl, __ctl, sizeof(ctl));
3229                 ctl.data = &flush_delay;
3230                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3231
3232                 net = (struct net *)__ctl->extra1;
3233                 rt_cache_flush(net, flush_delay);
3234                 return 0;
3235         }
3236
3237         return -EINVAL;
3238 }
3239
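/*
 * Global tunables exported under /proc/sys/net/ipv4/route/.  All are
 * plain integers; the time-based entries (gc_min_interval, gc_timeout,
 * gc_interval, mtu_expires) are stored in jiffies and converted by the
 * proc_dointvec_jiffies / proc_dointvec_ms_jiffies handlers.
 */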
3240 static ctl_table ipv4_route_table[] = {
3241         {
3242                 .procname       = "gc_thresh",
3243                 .data           = &ipv4_dst_ops.gc_thresh,
3244                 .maxlen         = sizeof(int),
3245                 .mode           = 0644,
3246                 .proc_handler   = proc_dointvec,
3247         },
3248         {
3249                 .procname       = "max_size",
3250                 .data           = &ip_rt_max_size,
3251                 .maxlen         = sizeof(int),
3252                 .mode           = 0644,
3253                 .proc_handler   = proc_dointvec,
3254         },
3255         {
3256                 /*  Deprecated. Use gc_min_interval_ms */
3257
3258                 .procname       = "gc_min_interval",
3259                 .data           = &ip_rt_gc_min_interval,
3260                 .maxlen         = sizeof(int),
3261                 .mode           = 0644,
3262                 .proc_handler   = proc_dointvec_jiffies,
3263         },
3264         {
3265                 .procname       = "gc_min_interval_ms",
3266                 .data           = &ip_rt_gc_min_interval,
3267                 .maxlen         = sizeof(int),
3268                 .mode           = 0644,
3269                 .proc_handler   = proc_dointvec_ms_jiffies,
3270         },
3271         {
3272                 .procname       = "gc_timeout",
3273                 .data           = &ip_rt_gc_timeout,
3274                 .maxlen         = sizeof(int),
3275                 .mode           = 0644,
3276                 .proc_handler   = proc_dointvec_jiffies,
3277         },
3278         {
3279                 .procname       = "gc_interval",
3280                 .data           = &ip_rt_gc_interval,
3281                 .maxlen         = sizeof(int),
3282                 .mode           = 0644,
3283                 .proc_handler   = proc_dointvec_jiffies,
3284         },
3285         {
3286                 .procname       = "redirect_load",
3287                 .data           = &ip_rt_redirect_load,
3288                 .maxlen         = sizeof(int),
3289                 .mode           = 0644,
3290                 .proc_handler   = proc_dointvec,
3291         },
3292         {
3293                 .procname       = "redirect_number",
3294                 .data           = &ip_rt_redirect_number,
3295                 .maxlen         = sizeof(int),
3296                 .mode           = 0644,
3297                 .proc_handler   = proc_dointvec,
3298         },
3299         {
3300                 .procname       = "redirect_silence",
3301                 .data           = &ip_rt_redirect_silence,
3302                 .maxlen         = sizeof(int),
3303                 .mode           = 0644,
3304                 .proc_handler   = proc_dointvec,
3305         },
3306         {
3307                 .procname       = "error_cost",
3308                 .data           = &ip_rt_error_cost,
3309                 .maxlen         = sizeof(int),
3310                 .mode           = 0644,
3311                 .proc_handler   = proc_dointvec,
3312         },
3313         {
3314                 .procname       = "error_burst",
3315                 .data           = &ip_rt_error_burst,
3316                 .maxlen         = sizeof(int),
3317                 .mode           = 0644,
3318                 .proc_handler   = proc_dointvec,
3319         },
3320         {
3321                 .procname       = "gc_elasticity",
3322                 .data           = &ip_rt_gc_elasticity,
3323                 .maxlen         = sizeof(int),
3324                 .mode           = 0644,
3325                 .proc_handler   = proc_dointvec,
3326         },
3327         {
3328                 .procname       = "mtu_expires",
3329                 .data           = &ip_rt_mtu_expires,
3330                 .maxlen         = sizeof(int),
3331                 .mode           = 0644,
3332                 .proc_handler   = proc_dointvec_jiffies,
3333         },
3334         {
3335                 .procname       = "min_pmtu",
3336                 .data           = &ip_rt_min_pmtu,
3337                 .maxlen         = sizeof(int),
3338                 .mode           = 0644,
3339                 .proc_handler   = proc_dointvec,
3340         },
3341         {
3342                 .procname       = "min_adv_mss",
3343                 .data           = &ip_rt_min_advmss,
3344                 .maxlen         = sizeof(int),
3345                 .mode           = 0644,
3346                 .proc_handler   = proc_dointvec,
3347         },
3348         { }
3349 };
3350
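/*
 * Skeleton tables registered early from ip_static_sysctl_init() so that
 * the net.ipv4.route and net.ipv4.neigh directories exist independently
 * of the rest of the ipv4 init ordering (see the comment above
 * ip_static_sysctl_init() below).
 */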
3351 static struct ctl_table empty[1];
3352
3353 static struct ctl_table ipv4_skeleton[] =
3354 {
3355         { .procname = "route",
3356           .mode = 0555, .child = ipv4_route_table},
3357         { .procname = "neigh",
3358           .mode = 0555, .child = empty},
3359         { }
3360 };
3361
3362 static __net_initdata struct ctl_path ipv4_path[] = {
3363         { .procname = "net", },
3364         { .procname = "ipv4", },
3365         { },
3366 };
3367
3368 static struct ctl_table ipv4_route_flush_table[] = {
3369         {
3370                 .procname       = "flush",
3371                 .maxlen         = sizeof(int),
3372                 .mode           = 0200,
3373                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3374         },
3375         { },
3376 };
3377
3378 static __net_initdata struct ctl_path ipv4_route_path[] = {
3379         { .procname = "net", },
3380         { .procname = "ipv4", },
3381         { .procname = "route", },
3382         { },
3383 };
3384
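/*
 * Per-namespace registration of the "flush" sysctl.  Every netns other
 * than init_net gets its own kmemdup'd copy of ipv4_route_flush_table
 * so that the extra1 slot can point at the owning struct net.
 */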
3385 static __net_init int sysctl_route_net_init(struct net *net)
3386 {
3387         struct ctl_table *tbl;
3388
3389         tbl = ipv4_route_flush_table;
3390         if (!net_eq(net, &init_net)) {
3391                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3392                 if (tbl == NULL)
3393                         goto err_dup;
3394         }
3395         tbl[0].extra1 = net;
3396
3397         net->ipv4.route_hdr =
3398                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3399         if (net->ipv4.route_hdr == NULL)
3400                 goto err_reg;
3401         return 0;
3402
3403 err_reg:
3404         if (tbl != ipv4_route_flush_table)
3405                 kfree(tbl);
3406 err_dup:
3407         return -ENOMEM;
3408 }
3409
3410 static __net_exit void sysctl_route_net_exit(struct net *net)
3411 {
3412         struct ctl_table *tbl;
3413
3414         tbl = net->ipv4.route_hdr->ctl_table_arg;
3415         unregister_net_sysctl_table(net->ipv4.route_hdr);
3416         BUG_ON(tbl == ipv4_route_flush_table);
3417         kfree(tbl);
3418 }
3419
3420 static __net_initdata struct pernet_operations sysctl_route_ops = {
3421         .init = sysctl_route_net_init,
3422         .exit = sysctl_route_net_exit,
3423 };
3424 #endif
3425
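/*
 * Seed the per-namespace rt_genid and dev_addr_genid counters with
 * random values; bumping rt_genid later is what invalidates every
 * cached route in the namespace at once.
 */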
3426 static __net_init int rt_genid_init(struct net *net)
3427 {
3428         get_random_bytes(&net->ipv4.rt_genid,
3429                          sizeof(net->ipv4.rt_genid));
3430         get_random_bytes(&net->ipv4.dev_addr_genid,
3431                          sizeof(net->ipv4.dev_addr_genid));
3432         return 0;
3433 }
3434
3435 static __net_initdata struct pernet_operations rt_genid_ops = {
3436         .init = rt_genid_init,
3437 };
3438
3439
3440 #ifdef CONFIG_IP_ROUTE_CLASSID
3441 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3442 #endif /* CONFIG_IP_ROUTE_CLASSID */
3443
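/*
 * "rhash_entries=N" on the kernel command line overrides the size of
 * the route cache hash table computed in ip_rt_init().
 */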
3444 static __initdata unsigned long rhash_entries;
3445 static int __init set_rhash_entries(char *str)
3446 {
3447         if (!str)
3448                 return 0;
3449         rhash_entries = simple_strtoul(str, &str, 0);
3450         return 1;
3451 }
3452 __setup("rhash_entries=", set_rhash_entries);
3453
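/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing layer.
 *
 * Sets up the dst slab caches and entry counters, allocates the route
 * cache hash table (sized from rhash_entries or available memory),
 * initialises devinet and the FIB, starts the periodic expiry work,
 * and registers the /proc files, sysctl tables and the RTM_GETROUTE
 * handler.
 */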
3454 int __init ip_rt_init(void)
3455 {
3456         int rc = 0;
3457
3458 #ifdef CONFIG_IP_ROUTE_CLASSID
3459         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3460         if (!ip_rt_acct)
3461                 panic("IP: failed to allocate ip_rt_acct\n");
3462 #endif
3463
3464         ipv4_dst_ops.kmem_cachep =
3465                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3466                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3467
3468         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3469
3470         if (dst_entries_init(&ipv4_dst_ops) < 0)
3471                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3472
3473         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3474                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3475
3476         rt_hash_table = (struct rt_hash_bucket *)
3477                 alloc_large_system_hash("IP route cache",
3478                                         sizeof(struct rt_hash_bucket),
3479                                         rhash_entries,
3480                                         (totalram_pages >= 128 * 1024) ?
3481                                         15 : 17,
3482                                         0,
3483                                         &rt_hash_log,
3484                                         &rt_hash_mask,
3485                                         rhash_entries ? 0 : 512 * 1024);
3486         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3487         rt_hash_lock_init();
3488
3489         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3490         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3491
3492         devinet_init();
3493         ip_fib_init();
3494
3495         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3496         expires_ljiffies = jiffies;
3497         schedule_delayed_work(&expires_work,
3498                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3499
3500         if (ip_rt_proc_init())
3501                 printk(KERN_ERR "Unable to create route proc files\n");
3502 #ifdef CONFIG_XFRM
3503         xfrm_init();
3504         xfrm4_init(ip_rt_max_size);
3505 #endif
3506         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3507
3508 #ifdef CONFIG_SYSCTL
3509         register_pernet_subsys(&sysctl_route_ops);
3510 #endif
3511         register_pernet_subsys(&rt_genid_ops);
3512         return rc;
3513 }
3514
3515 #ifdef CONFIG_SYSCTL
3516 /*
3517  * We really need to sanitize the damn ipv4 init order, then all
3518  * this nonsense will go away.
3519  */
3520 void __init ip_static_sysctl_init(void)
3521 {
3522         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3523 }
3524 #endif