1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD,
35  *                                      though our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <linux/prefetch.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112 #include <net/atmclip.h>
113 #include <net/secure_seq.h>
114
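/*
 * RT_FL_TOS() masks a flow's TOS down to the bits that matter for route
 * lookup, while preserving the RTO_ONLINK flag that is carried in the
 * same field.
 */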
115 #define RT_FL_TOS(oldflp4) \
116         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
117
118 #define IP_MAX_MTU      0xFFF0
119
120 #define RT_GC_TIMEOUT (300*HZ)
121
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
125 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
126 static int ip_rt_redirect_number __read_mostly  = 9;
127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost __read_mostly       = HZ;
130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
131 static int ip_rt_gc_elasticity __read_mostly    = 8;
132 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
134 static int ip_rt_min_advmss __read_mostly       = 256;
135 static int rt_chain_length_max __read_mostly    = 20;
136 static int redirect_genid;
137
138 static struct delayed_work expires_work;
139 static unsigned long expires_ljiffies;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
147 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
148 static void              ipv4_dst_destroy(struct dst_entry *dst);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
155                             int how)
156 {
157 }
158
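/*
 * Copy-on-write for dst metrics: bind the route to its inet_peer if needed
 * and switch dst->_metrics to the peer's writable metrics array, copying
 * the old (read-only) values into it the first time.
 */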
159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
160 {
161         struct rtable *rt = (struct rtable *) dst;
162         struct inet_peer *peer;
163         u32 *p = NULL;
164
165         if (!rt->peer)
166                 rt_bind_peer(rt, rt->rt_dst, 1);
167
168         peer = rt->peer;
169         if (peer) {
170                 u32 *old_p = __DST_METRICS_PTR(old);
171                 unsigned long prev, new;
172
173                 p = peer->metrics;
174                 if (inet_metrics_new(peer))
175                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
176
177                 new = (unsigned long) p;
178                 prev = cmpxchg(&dst->_metrics, old, new);
179
180                 if (prev != old) {
181                         p = __DST_METRICS_PTR(prev);
182                         if (prev & DST_METRICS_READ_ONLY)
183                                 p = NULL;
184                 } else {
185                         if (rt->fi) {
186                                 fib_info_put(rt->fi);
187                                 rt->fi = NULL;
188                         }
189                 }
190         }
191         return p;
192 }
193
194 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
195
196 static struct dst_ops ipv4_dst_ops = {
197         .family =               AF_INET,
198         .protocol =             cpu_to_be16(ETH_P_IP),
199         .gc =                   rt_garbage_collect,
200         .check =                ipv4_dst_check,
201         .default_advmss =       ipv4_default_advmss,
202         .mtu =                  ipv4_mtu,
203         .cow_metrics =          ipv4_cow_metrics,
204         .destroy =              ipv4_dst_destroy,
205         .ifdown =               ipv4_dst_ifdown,
206         .negative_advice =      ipv4_negative_advice,
207         .link_failure =         ipv4_link_failure,
208         .update_pmtu =          ip_rt_update_pmtu,
209         .local_out =            __ip_local_out,
210         .neigh_lookup =         ipv4_neigh_lookup,
211 };
212
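/*
 * ip_tos2prio[] maps the IPv4 TOS bits (indexed as IPTOS_TOS(tos) >> 1 by
 * rt_tos2priority()) to a packet scheduler band; ECN_OR_COST(class) expands
 * to the same TC_PRIO_##class, so adjacent TOS values share a band.  For
 * example, IPTOS_LOWDELAY (0x10) indexes slot 8 and yields
 * TC_PRIO_INTERACTIVE.
 */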
213 #define ECN_OR_COST(class)      TC_PRIO_##class
214
215 const __u8 ip_tos2prio[16] = {
216         TC_PRIO_BESTEFFORT,
217         ECN_OR_COST(BESTEFFORT),
218         TC_PRIO_BESTEFFORT,
219         ECN_OR_COST(BESTEFFORT),
220         TC_PRIO_BULK,
221         ECN_OR_COST(BULK),
222         TC_PRIO_BULK,
223         ECN_OR_COST(BULK),
224         TC_PRIO_INTERACTIVE,
225         ECN_OR_COST(INTERACTIVE),
226         TC_PRIO_INTERACTIVE,
227         ECN_OR_COST(INTERACTIVE),
228         TC_PRIO_INTERACTIVE_BULK,
229         ECN_OR_COST(INTERACTIVE_BULK),
230         TC_PRIO_INTERACTIVE_BULK,
231         ECN_OR_COST(INTERACTIVE_BULK)
232 };
233
234
235 /*
236  * Route cache.
237  */
238
239 /* The locking scheme is rather straightforward:
240  *
241  * 1) Read-Copy Update protects the buckets of the central route hash.
242  * 2) Only writers remove entries, and they hold the lock
243  *    as they look at rtable reference counts.
244  * 3) Only readers acquire references to rtable entries,
245  *    they do so with atomic increments and with the
246  *    lock held.
247  */
248
249 struct rt_hash_bucket {
250         struct rtable __rcu     *chain;
251 };
252
253 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
254         defined(CONFIG_PROVE_LOCKING)
255 /*
256  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
257  * The size of this table is a power of two and depends on the number of CPUs.
258  * (On lockdep we have a quite big spinlock_t, so keep the size down there.)
259  */
260 #ifdef CONFIG_LOCKDEP
261 # define RT_HASH_LOCK_SZ        256
262 #else
263 # if NR_CPUS >= 32
264 #  define RT_HASH_LOCK_SZ       4096
265 # elif NR_CPUS >= 16
266 #  define RT_HASH_LOCK_SZ       2048
267 # elif NR_CPUS >= 8
268 #  define RT_HASH_LOCK_SZ       1024
269 # elif NR_CPUS >= 4
270 #  define RT_HASH_LOCK_SZ       512
271 # else
272 #  define RT_HASH_LOCK_SZ       256
273 # endif
274 #endif
275
276 static spinlock_t       *rt_hash_locks;
277 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
278
279 static __init void rt_hash_lock_init(void)
280 {
281         int i;
282
283         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
284                         GFP_KERNEL);
285         if (!rt_hash_locks)
286                 panic("IP: failed to allocate rt_hash_locks\n");
287
288         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
289                 spin_lock_init(&rt_hash_locks[i]);
290 }
291 #else
292 # define rt_hash_lock_addr(slot) NULL
293
294 static inline void rt_hash_lock_init(void)
295 {
296 }
297 #endif
298
299 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
300 static unsigned                 rt_hash_mask __read_mostly;
301 static unsigned int             rt_hash_log  __read_mostly;
302
303 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
304 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
305
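/*
 * Hash a route cache key: destination, source and interface index are
 * folded together by jhash, seeded with the per-namespace generation id,
 * and the result is masked down to a bucket index.
 */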
306 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
307                                    int genid)
308 {
309         return jhash_3words((__force u32)daddr, (__force u32)saddr,
310                             idx, genid)
311                 & rt_hash_mask;
312 }
313
314 static inline int rt_genid(struct net *net)
315 {
316         return atomic_read(&net->ipv4.rt_genid);
317 }
318
319 #ifdef CONFIG_PROC_FS
320 struct rt_cache_iter_state {
321         struct seq_net_private p;
322         int bucket;
323         int genid;
324 };
325
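/*
 * /proc seq_file iterator over the route cache.  Buckets are walked from
 * the top of the table down; rcu_read_lock_bh() is held while a chain is
 * being traversed, and entries from other namespaces or from an older
 * cache generation are skipped.
 */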
326 static struct rtable *rt_cache_get_first(struct seq_file *seq)
327 {
328         struct rt_cache_iter_state *st = seq->private;
329         struct rtable *r = NULL;
330
331         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
332                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
333                         continue;
334                 rcu_read_lock_bh();
335                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
336                 while (r) {
337                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
338                             r->rt_genid == st->genid)
339                                 return r;
340                         r = rcu_dereference_bh(r->dst.rt_next);
341                 }
342                 rcu_read_unlock_bh();
343         }
344         return r;
345 }
346
347 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
348                                           struct rtable *r)
349 {
350         struct rt_cache_iter_state *st = seq->private;
351
352         r = rcu_dereference_bh(r->dst.rt_next);
353         while (!r) {
354                 rcu_read_unlock_bh();
355                 do {
356                         if (--st->bucket < 0)
357                                 return NULL;
358                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
359                 rcu_read_lock_bh();
360                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
361         }
362         return r;
363 }
364
365 static struct rtable *rt_cache_get_next(struct seq_file *seq,
366                                         struct rtable *r)
367 {
368         struct rt_cache_iter_state *st = seq->private;
369         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
370                 if (dev_net(r->dst.dev) != seq_file_net(seq))
371                         continue;
372                 if (r->rt_genid == st->genid)
373                         break;
374         }
375         return r;
376 }
377
378 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
379 {
380         struct rtable *r = rt_cache_get_first(seq);
381
382         if (r)
383                 while (pos && (r = rt_cache_get_next(seq, r)))
384                         --pos;
385         return pos ? NULL : r;
386 }
387
388 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
389 {
390         struct rt_cache_iter_state *st = seq->private;
391         if (*pos)
392                 return rt_cache_get_idx(seq, *pos - 1);
393         st->genid = rt_genid(seq_file_net(seq));
394         return SEQ_START_TOKEN;
395 }
396
397 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
398 {
399         struct rtable *r;
400
401         if (v == SEQ_START_TOKEN)
402                 r = rt_cache_get_first(seq);
403         else
404                 r = rt_cache_get_next(seq, v);
405         ++*pos;
406         return r;
407 }
408
409 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
410 {
411         if (v && v != SEQ_START_TOKEN)
412                 rcu_read_unlock_bh();
413 }
414
415 static int rt_cache_seq_show(struct seq_file *seq, void *v)
416 {
417         if (v == SEQ_START_TOKEN)
418                 seq_printf(seq, "%-127s\n",
419                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
420                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
421                            "HHUptod\tSpecDst");
422         else {
423                 struct rtable *r = v;
424                 struct neighbour *n;
425                 int len, HHUptod;
426
427                 rcu_read_lock();
428                 n = dst_get_neighbour(&r->dst);
429                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
430                 rcu_read_unlock();
431
432                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
433                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
434                         r->dst.dev ? r->dst.dev->name : "*",
435                         (__force u32)r->rt_dst,
436                         (__force u32)r->rt_gateway,
437                         r->rt_flags, atomic_read(&r->dst.__refcnt),
438                         r->dst.__use, 0, (__force u32)r->rt_src,
439                         dst_metric_advmss(&r->dst) + 40,
440                         dst_metric(&r->dst, RTAX_WINDOW),
441                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
442                               dst_metric(&r->dst, RTAX_RTTVAR)),
443                         r->rt_key_tos,
444                         -1,
445                         HHUptod,
446                         r->rt_spec_dst, &len);
447
448                 seq_printf(seq, "%*s\n", 127 - len, "");
449         }
450         return 0;
451 }
452
453 static const struct seq_operations rt_cache_seq_ops = {
454         .start  = rt_cache_seq_start,
455         .next   = rt_cache_seq_next,
456         .stop   = rt_cache_seq_stop,
457         .show   = rt_cache_seq_show,
458 };
459
460 static int rt_cache_seq_open(struct inode *inode, struct file *file)
461 {
462         return seq_open_net(inode, file, &rt_cache_seq_ops,
463                         sizeof(struct rt_cache_iter_state));
464 }
465
466 static const struct file_operations rt_cache_seq_fops = {
467         .owner   = THIS_MODULE,
468         .open    = rt_cache_seq_open,
469         .read    = seq_read,
470         .llseek  = seq_lseek,
471         .release = seq_release_net,
472 };
473
474
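/*
 * Per-cpu cache statistics (registered as "rt_cache" under
 * net->proc_net_stat).  Position 0 carries the header token; for the
 * remaining positions *pos - 1 is used as the CPU index.
 */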
475 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
476 {
477         int cpu;
478
479         if (*pos == 0)
480                 return SEQ_START_TOKEN;
481
482         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
483                 if (!cpu_possible(cpu))
484                         continue;
485                 *pos = cpu+1;
486                 return &per_cpu(rt_cache_stat, cpu);
487         }
488         return NULL;
489 }
490
491 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
492 {
493         int cpu;
494
495         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
496                 if (!cpu_possible(cpu))
497                         continue;
498                 *pos = cpu+1;
499                 return &per_cpu(rt_cache_stat, cpu);
500         }
501         return NULL;
502
503 }
504
505 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
506 {
507
508 }
509
510 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
511 {
512         struct rt_cache_stat *st = v;
513
514         if (v == SEQ_START_TOKEN) {
515                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
516                 return 0;
517         }
518
519         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
520                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
521                    dst_entries_get_slow(&ipv4_dst_ops),
522                    st->in_hit,
523                    st->in_slow_tot,
524                    st->in_slow_mc,
525                    st->in_no_route,
526                    st->in_brd,
527                    st->in_martian_dst,
528                    st->in_martian_src,
529
530                    st->out_hit,
531                    st->out_slow_tot,
532                    st->out_slow_mc,
533
534                    st->gc_total,
535                    st->gc_ignored,
536                    st->gc_goal_miss,
537                    st->gc_dst_overflow,
538                    st->in_hlist_search,
539                    st->out_hlist_search
540                 );
541         return 0;
542 }
543
544 static const struct seq_operations rt_cpu_seq_ops = {
545         .start  = rt_cpu_seq_start,
546         .next   = rt_cpu_seq_next,
547         .stop   = rt_cpu_seq_stop,
548         .show   = rt_cpu_seq_show,
549 };
550
551
552 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
553 {
554         return seq_open(file, &rt_cpu_seq_ops);
555 }
556
557 static const struct file_operations rt_cpu_seq_fops = {
558         .owner   = THIS_MODULE,
559         .open    = rt_cpu_seq_open,
560         .read    = seq_read,
561         .llseek  = seq_lseek,
562         .release = seq_release,
563 };
564
565 #ifdef CONFIG_IP_ROUTE_CLASSID
566 static int rt_acct_proc_show(struct seq_file *m, void *v)
567 {
568         struct ip_rt_acct *dst, *src;
569         unsigned int i, j;
570
571         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
572         if (!dst)
573                 return -ENOMEM;
574
575         for_each_possible_cpu(i) {
576                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
577                 for (j = 0; j < 256; j++) {
578                         dst[j].o_bytes   += src[j].o_bytes;
579                         dst[j].o_packets += src[j].o_packets;
580                         dst[j].i_bytes   += src[j].i_bytes;
581                         dst[j].i_packets += src[j].i_packets;
582                 }
583         }
584
585         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
586         kfree(dst);
587         return 0;
588 }
589
590 static int rt_acct_proc_open(struct inode *inode, struct file *file)
591 {
592         return single_open(file, rt_acct_proc_show, NULL);
593 }
594
595 static const struct file_operations rt_acct_proc_fops = {
596         .owner          = THIS_MODULE,
597         .open           = rt_acct_proc_open,
598         .read           = seq_read,
599         .llseek         = seq_lseek,
600         .release        = single_release,
601 };
602 #endif
603
604 static int __net_init ip_rt_do_proc_init(struct net *net)
605 {
606         struct proc_dir_entry *pde;
607
608         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
609                         &rt_cache_seq_fops);
610         if (!pde)
611                 goto err1;
612
613         pde = proc_create("rt_cache", S_IRUGO,
614                           net->proc_net_stat, &rt_cpu_seq_fops);
615         if (!pde)
616                 goto err2;
617
618 #ifdef CONFIG_IP_ROUTE_CLASSID
619         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
620         if (!pde)
621                 goto err3;
622 #endif
623         return 0;
624
625 #ifdef CONFIG_IP_ROUTE_CLASSID
626 err3:
627         remove_proc_entry("rt_cache", net->proc_net_stat);
628 #endif
629 err2:
630         remove_proc_entry("rt_cache", net->proc_net);
631 err1:
632         return -ENOMEM;
633 }
634
635 static void __net_exit ip_rt_do_proc_exit(struct net *net)
636 {
637         remove_proc_entry("rt_cache", net->proc_net_stat);
638         remove_proc_entry("rt_cache", net->proc_net);
639 #ifdef CONFIG_IP_ROUTE_CLASSID
640         remove_proc_entry("rt_acct", net->proc_net);
641 #endif
642 }
643
644 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
645         .init = ip_rt_do_proc_init,
646         .exit = ip_rt_do_proc_exit,
647 };
648
649 static int __init ip_rt_proc_init(void)
650 {
651         return register_pernet_subsys(&ip_rt_proc_ops);
652 }
653
654 #else
655 static inline int ip_rt_proc_init(void)
656 {
657         return 0;
658 }
659 #endif /* CONFIG_PROC_FS */
660
661 static inline void rt_free(struct rtable *rt)
662 {
663         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
664 }
665
666 static inline void rt_drop(struct rtable *rt)
667 {
668         ip_rt_put(rt);
669         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
670 }
671
672 static inline int rt_fast_clean(struct rtable *rth)
673 {
674         /* Kill broadcast/multicast entries very aggressively, if they
675            collide in the hash table with more useful entries */
676         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
677                 rt_is_input_route(rth) && rth->dst.rt_next;
678 }
679
680 static inline int rt_valuable(struct rtable *rth)
681 {
682         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
683                 (rth->peer && rth->peer->pmtu_expires);
684 }
685
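/*
 * May a cache entry be evicted?  Entries that are still referenced never
 * expire; unreferenced ones get tmo1 jiffies of grace (unless
 * rt_fast_clean() marks them as expendable broadcast/multicast clutter)
 * and tmo2 jiffies if rt_valuable() says they are worth holding on to.
 */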
686 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
687 {
688         unsigned long age;
689         int ret = 0;
690
691         if (atomic_read(&rth->dst.__refcnt))
692                 goto out;
693
694         age = jiffies - rth->dst.lastuse;
695         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
696             (age <= tmo2 && rt_valuable(rth)))
697                 goto out;
698         ret = 1;
699 out:    return ret;
700 }
701
702 /* Bits of score are:
703  * 31: very valuable
704  * 30: not quite useless
705  * 29..0: usage counter
706  */
707 static inline u32 rt_score(struct rtable *rt)
708 {
709         u32 score = jiffies - rt->dst.lastuse;
710
711         score = ~score & ~(3<<30);
712
713         if (rt_valuable(rt))
714                 score |= (1<<31);
715
716         if (rt_is_output_route(rt) ||
717             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
718                 score |= (1<<30);
719
720         return score;
721 }
722
723 static inline bool rt_caching(const struct net *net)
724 {
725         return net->ipv4.current_rt_cache_rebuild_count <=
726                 net->ipv4.sysctl_rt_cache_rebuild_count;
727 }
728
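/*
 * compare_hash_inputs() checks only the primary hash inputs (destination,
 * source and input interface); compare_keys() below compares the full
 * lookup key, adding mark, TOS and oif.  Both use xor/or so the comparison
 * is branch-free.
 */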
729 static inline bool compare_hash_inputs(const struct rtable *rt1,
730                                        const struct rtable *rt2)
731 {
732         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
733                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
734                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
735 }
736
737 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
738 {
739         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
740                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
741                 (rt1->rt_mark ^ rt2->rt_mark) |
742                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
743                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
744                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
745 }
746
747 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
748 {
749         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
750 }
751
752 static inline int rt_is_expired(struct rtable *rth)
753 {
754         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
755 }
756
757 /*
758  * Perform a full scan of the hash table and free all entries.
759  * Can be called by a softirq or a process.
760  * In the latter case, we want to reschedule if necessary.
761  */
762 static void rt_do_flush(struct net *net, int process_context)
763 {
764         unsigned int i;
765         struct rtable *rth, *next;
766
767         for (i = 0; i <= rt_hash_mask; i++) {
768                 struct rtable __rcu **pprev;
769                 struct rtable *list;
770
771                 if (process_context && need_resched())
772                         cond_resched();
773                 rth = rcu_access_pointer(rt_hash_table[i].chain);
774                 if (!rth)
775                         continue;
776
777                 spin_lock_bh(rt_hash_lock_addr(i));
778
779                 list = NULL;
780                 pprev = &rt_hash_table[i].chain;
781                 rth = rcu_dereference_protected(*pprev,
782                         lockdep_is_held(rt_hash_lock_addr(i)));
783
784                 while (rth) {
785                         next = rcu_dereference_protected(rth->dst.rt_next,
786                                 lockdep_is_held(rt_hash_lock_addr(i)));
787
788                         if (!net ||
789                             net_eq(dev_net(rth->dst.dev), net)) {
790                                 rcu_assign_pointer(*pprev, next);
791                                 rcu_assign_pointer(rth->dst.rt_next, list);
792                                 list = rth;
793                         } else {
794                                 pprev = &rth->dst.rt_next;
795                         }
796                         rth = next;
797                 }
798
799                 spin_unlock_bh(rt_hash_lock_addr(i));
800
801                 for (; list; list = next) {
802                         next = rcu_dereference_protected(list->dst.rt_next, 1);
803                         rt_free(list);
804                 }
805         }
806 }
807
808 /*
809  * While freeing expired entries, we compute average chain length
810  * and standard deviation, using fixed-point arithmetic.
811  * This is to obtain an estimate of rt_chain_length_max:
812  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
813  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
814  */
815
816 #define FRACT_BITS 3
817 #define ONE (1UL << FRACT_BITS)
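
/*
 * Example: with FRACT_BITS == 3, each distinct entry on a chain is counted
 * as ONE == 8, so an average of 2.5 entries per chain is carried as the
 * fixed-point value 20 and shifted back down by FRACT_BITS when
 * rt_chain_length_max is recomputed in rt_check_expire().
 */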
818
819 /*
820  * Given a hash chain and an item in this hash chain,
821  * find if a previous entry has the same hash_inputs
822  * (but differs on tos, mark or oif)
823  * Returns 0 if an alias is found.
824  * Returns ONE if rth has no alias before itself.
825  */
826 static int has_noalias(const struct rtable *head, const struct rtable *rth)
827 {
828         const struct rtable *aux = head;
829
830         while (aux != rth) {
831                 if (compare_hash_inputs(aux, rth))
832                         return 0;
833                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
834         }
835         return ONE;
836 }
837
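/*
 * rt_check_expire() scans a slice of the hash table sized so that the
 * whole table is covered roughly once per ip_rt_gc_timeout, evicting
 * stale entries and feeding the chain-length statistics described above.
 */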
838 static void rt_check_expire(void)
839 {
840         static unsigned int rover;
841         unsigned int i = rover, goal;
842         struct rtable *rth;
843         struct rtable __rcu **rthp;
844         unsigned long samples = 0;
845         unsigned long sum = 0, sum2 = 0;
846         unsigned long delta;
847         u64 mult;
848
849         delta = jiffies - expires_ljiffies;
850         expires_ljiffies = jiffies;
851         mult = ((u64)delta) << rt_hash_log;
852         if (ip_rt_gc_timeout > 1)
853                 do_div(mult, ip_rt_gc_timeout);
854         goal = (unsigned int)mult;
855         if (goal > rt_hash_mask)
856                 goal = rt_hash_mask + 1;
857         for (; goal > 0; goal--) {
858                 unsigned long tmo = ip_rt_gc_timeout;
859                 unsigned long length;
860
861                 i = (i + 1) & rt_hash_mask;
862                 rthp = &rt_hash_table[i].chain;
863
864                 if (need_resched())
865                         cond_resched();
866
867                 samples++;
868
869                 if (rcu_dereference_raw(*rthp) == NULL)
870                         continue;
871                 length = 0;
872                 spin_lock_bh(rt_hash_lock_addr(i));
873                 while ((rth = rcu_dereference_protected(*rthp,
874                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
875                         prefetch(rth->dst.rt_next);
876                         if (rt_is_expired(rth)) {
877                                 *rthp = rth->dst.rt_next;
878                                 rt_free(rth);
879                                 continue;
880                         }
881                         if (rth->dst.expires) {
882                                 /* Entry is expired even if it is in use */
883                                 if (time_before_eq(jiffies, rth->dst.expires)) {
884 nofree:
885                                         tmo >>= 1;
886                                         rthp = &rth->dst.rt_next;
887                                         /*
888                                          * We only count entries on
889                                          * a chain with equal hash inputs once,
890                                          * so that entries for different QoS
891                                          * levels and other non-hash-input
892                                          * attributes don't unfairly skew
893                                          * the length computation.
894                                          */
895                                         length += has_noalias(rt_hash_table[i].chain, rth);
896                                         continue;
897                                 }
898                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
899                                 goto nofree;
900
901                         /* Cleanup aged off entries. */
902                         *rthp = rth->dst.rt_next;
903                         rt_free(rth);
904                 }
905                 spin_unlock_bh(rt_hash_lock_addr(i));
906                 sum += length;
907                 sum2 += length*length;
908         }
909         if (samples) {
910                 unsigned long avg = sum / samples;
911                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
912                 rt_chain_length_max = max_t(unsigned long,
913                                         ip_rt_gc_elasticity,
914                                         (avg + 4*sd) >> FRACT_BITS);
915         }
916         rover = i;
917 }
918
919 /*
920  * rt_worker_func() is run in process context.
921  * we call rt_check_expire() to scan part of the hash table
922  */
923 static void rt_worker_func(struct work_struct *work)
924 {
925         rt_check_expire();
926         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
927 }
928
929 /*
930  * Perturbation of rt_genid by a small quantity [1..256].
931  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
932  * many times (2^24) without reusing a recent rt_genid.
933  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
934  */
935 static void rt_cache_invalidate(struct net *net)
936 {
937         unsigned char shuffle;
938
939         get_random_bytes(&shuffle, sizeof(shuffle));
940         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
941         redirect_genid++;
942         inetpeer_invalidate_tree(AF_INET);
943 }
944
945 /*
946  * delay < 0  : invalidate cache (fast : entries will be deleted later)
947  * delay >= 0 : invalidate & flush cache (can be long)
948  */
949 void rt_cache_flush(struct net *net, int delay)
950 {
951         rt_cache_invalidate(net);
952         if (delay >= 0)
953                 rt_do_flush(net, !in_softirq());
954 }
955
956 /* Flush previous cache invalidated entries from the cache */
957 void rt_cache_flush_batch(struct net *net)
958 {
959         rt_do_flush(net, !in_softirq());
960 }
961
962 static void rt_emergency_hash_rebuild(struct net *net)
963 {
964         if (net_ratelimit())
965                 printk(KERN_WARNING "Route hash chain too long!\n");
966         rt_cache_invalidate(net);
967 }
968
969 /*
970    Short description of GC goals.
971
972    We want to build an algorithm which keeps the routing cache
973    at some equilibrium point, where the number of aged-off entries
974    is kept approximately equal to the number of newly generated ones.
975
976    The current expiration strength is the variable "expire".
977    We try to adjust it dynamically, so that when networking
978    is idle expire is large enough to keep enough warm entries,
979    and when load increases it shrinks to limit the cache size.
980  */
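/*
 * Concretely, the first-pass goal below is
 *      entries - ip_rt_gc_elasticity * <number of hash buckets>;
 * only when that is positive do we enter the "dangerous area" branch that
 * trims the cache aggressively.
 */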
981
982 static int rt_garbage_collect(struct dst_ops *ops)
983 {
984         static unsigned long expire = RT_GC_TIMEOUT;
985         static unsigned long last_gc;
986         static int rover;
987         static int equilibrium;
988         struct rtable *rth;
989         struct rtable __rcu **rthp;
990         unsigned long now = jiffies;
991         int goal;
992         int entries = dst_entries_get_fast(&ipv4_dst_ops);
993
994         /*
995          * Garbage collection is pretty expensive,
996          * do not make it too frequently.
997          */
998
999         RT_CACHE_STAT_INC(gc_total);
1000
1001         if (now - last_gc < ip_rt_gc_min_interval &&
1002             entries < ip_rt_max_size) {
1003                 RT_CACHE_STAT_INC(gc_ignored);
1004                 goto out;
1005         }
1006
1007         entries = dst_entries_get_slow(&ipv4_dst_ops);
1008         /* Calculate the number of entries which we want to expire now. */
1009         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1010         if (goal <= 0) {
1011                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1012                         equilibrium = ipv4_dst_ops.gc_thresh;
1013                 goal = entries - equilibrium;
1014                 if (goal > 0) {
1015                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1016                         goal = entries - equilibrium;
1017                 }
1018         } else {
1019                 /* We are in a dangerous area. Try to reduce the cache really
1020                  * aggressively.
1021                  */
1022                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1023                 equilibrium = entries - goal;
1024         }
1025
1026         if (now - last_gc >= ip_rt_gc_min_interval)
1027                 last_gc = now;
1028
1029         if (goal <= 0) {
1030                 equilibrium += goal;
1031                 goto work_done;
1032         }
1033
1034         do {
1035                 int i, k;
1036
1037                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1038                         unsigned long tmo = expire;
1039
1040                         k = (k + 1) & rt_hash_mask;
1041                         rthp = &rt_hash_table[k].chain;
1042                         spin_lock_bh(rt_hash_lock_addr(k));
1043                         while ((rth = rcu_dereference_protected(*rthp,
1044                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1045                                 if (!rt_is_expired(rth) &&
1046                                         !rt_may_expire(rth, tmo, expire)) {
1047                                         tmo >>= 1;
1048                                         rthp = &rth->dst.rt_next;
1049                                         continue;
1050                                 }
1051                                 *rthp = rth->dst.rt_next;
1052                                 rt_free(rth);
1053                                 goal--;
1054                         }
1055                         spin_unlock_bh(rt_hash_lock_addr(k));
1056                         if (goal <= 0)
1057                                 break;
1058                 }
1059                 rover = k;
1060
1061                 if (goal <= 0)
1062                         goto work_done;
1063
1064                 /* The goal was not achieved. We stop the process if:
1065
1066                    - expire was reduced to zero; otherwise, expire is halved.
1067                    - the table is not full.
1068                    - we are called from interrupt context.
1069                    - the jiffies check is just a fallback/debug loop breaker.
1070                      We will not spin here for a long time in any case.
1071                  */
1072
1073                 RT_CACHE_STAT_INC(gc_goal_miss);
1074
1075                 if (expire == 0)
1076                         break;
1077
1078                 expire >>= 1;
1079
1080                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1081                         goto out;
1082         } while (!in_softirq() && time_before_eq(jiffies, now));
1083
1084         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1085                 goto out;
1086         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1087                 goto out;
1088         if (net_ratelimit())
1089                 printk(KERN_WARNING "dst cache overflow\n");
1090         RT_CACHE_STAT_INC(gc_dst_overflow);
1091         return 1;
1092
1093 work_done:
1094         expire += ip_rt_gc_min_interval;
1095         if (expire > ip_rt_gc_timeout ||
1096             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1097             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1098                 expire = ip_rt_gc_timeout;
1099 out:    return 0;
1100 }
1101
1102 /*
1103  * Returns the number of entries in a hash chain that have different hash_inputs.
1104  */
1105 static int slow_chain_length(const struct rtable *head)
1106 {
1107         int length = 0;
1108         const struct rtable *rth = head;
1109
1110         while (rth) {
1111                 length += has_noalias(head, rth);
1112                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1113         }
1114         return length >> FRACT_BITS;
1115 }
1116
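/*
 * Resolve the neighbour for a route: ARP by default, the CLIP table for
 * ATM devices, with loopback and point-to-point interfaces keyed on
 * 0.0.0.0.
 */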
1117 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1118 {
1119         struct neigh_table *tbl = &arp_tbl;
1120         static const __be32 inaddr_any = 0;
1121         struct net_device *dev = dst->dev;
1122         const __be32 *pkey = daddr;
1123         struct neighbour *n;
1124
1125 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1126         if (dev->type == ARPHRD_ATM)
1127                 tbl = clip_tbl_hook;
1128 #endif
1129         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1130                 pkey = &inaddr_any;
1131
1132         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1133         if (n)
1134                 return n;
1135         return neigh_create(tbl, pkey, dev);
1136 }
1137
1138 static int rt_bind_neighbour(struct rtable *rt)
1139 {
1140         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1141         if (IS_ERR(n))
1142                 return PTR_ERR(n);
1143         dst_set_neighbour(&rt->dst, n);
1144
1145         return 0;
1146 }
1147
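/*
 * Insert a freshly created route into cache bucket 'hash'.  An existing
 * entry with the same keys wins and is moved to the front of the chain;
 * otherwise the new entry is linked in at the head, possibly after
 * evicting the lowest-scoring unreferenced entry on an overlong chain.
 */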
1148 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1149                                      struct sk_buff *skb, int ifindex)
1150 {
1151         struct rtable   *rth, *cand;
1152         struct rtable __rcu **rthp, **candp;
1153         unsigned long   now;
1154         u32             min_score;
1155         int             chain_length;
1156         int attempts = !in_softirq();
1157
1158 restart:
1159         chain_length = 0;
1160         min_score = ~(u32)0;
1161         cand = NULL;
1162         candp = NULL;
1163         now = jiffies;
1164
1165         if (!rt_caching(dev_net(rt->dst.dev))) {
1166                 /*
1167                  * If we're not caching, just tell the caller we
1168                  * were successful and don't touch the route.  The
1169                  * caller holds the sole reference to the cache entry, and
1170                  * it will be released when the caller is done with it.
1171                  * If we drop it here, the callers have no way to resolve routes
1172                  * when we're not caching.  Instead, just point *rp at rt, so
1173                  * the caller gets a single use out of the route.
1174                  * Note that we do rt_free on this new route entry, so that
1175                  * once its refcount hits zero, we are still able to reap it
1176                  * (thanks Alexey).
1177                  * Note: to avoid expensive RCU machinery for this uncached dst,
1178                  * we set DST_NOCACHE so that dst_release() can free the dst without
1179                  * waiting for a grace period.
1180                  */
1181
1182                 rt->dst.flags |= DST_NOCACHE;
1183                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1184                         int err = rt_bind_neighbour(rt);
1185                         if (err) {
1186                                 if (net_ratelimit())
1187                                         printk(KERN_WARNING
1188                                             "Neighbour table failure & not caching routes.\n");
1189                                 ip_rt_put(rt);
1190                                 return ERR_PTR(err);
1191                         }
1192                 }
1193
1194                 goto skip_hashing;
1195         }
1196
1197         rthp = &rt_hash_table[hash].chain;
1198
1199         spin_lock_bh(rt_hash_lock_addr(hash));
1200         while ((rth = rcu_dereference_protected(*rthp,
1201                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1202                 if (rt_is_expired(rth)) {
1203                         *rthp = rth->dst.rt_next;
1204                         rt_free(rth);
1205                         continue;
1206                 }
1207                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1208                         /* Put it first */
1209                         *rthp = rth->dst.rt_next;
1210                         /*
1211                          * Since lookup is lockfree, the deletion
1212                          * must be visible to another weakly ordered CPU before
1213                          * the insertion at the start of the hash chain.
1214                          */
1215                         rcu_assign_pointer(rth->dst.rt_next,
1216                                            rt_hash_table[hash].chain);
1217                         /*
1218                          * Since lookup is lockfree, the update writes
1219                          * must be ordered for consistency on SMP.
1220                          */
1221                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1222
1223                         dst_use(&rth->dst, now);
1224                         spin_unlock_bh(rt_hash_lock_addr(hash));
1225
1226                         rt_drop(rt);
1227                         if (skb)
1228                                 skb_dst_set(skb, &rth->dst);
1229                         return rth;
1230                 }
1231
1232                 if (!atomic_read(&rth->dst.__refcnt)) {
1233                         u32 score = rt_score(rth);
1234
1235                         if (score <= min_score) {
1236                                 cand = rth;
1237                                 candp = rthp;
1238                                 min_score = score;
1239                         }
1240                 }
1241
1242                 chain_length++;
1243
1244                 rthp = &rth->dst.rt_next;
1245         }
1246
1247         if (cand) {
1248                 /* ip_rt_gc_elasticity used to be the average chain length;
1249                  * when it is exceeded, gc becomes really aggressive.
1250                  *
1251                  * The second limit is less certain. At the moment it allows
1252                  * only 2 entries per bucket. We will see.
1253                  */
1254                 if (chain_length > ip_rt_gc_elasticity) {
1255                         *candp = cand->dst.rt_next;
1256                         rt_free(cand);
1257                 }
1258         } else {
1259                 if (chain_length > rt_chain_length_max &&
1260                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1261                         struct net *net = dev_net(rt->dst.dev);
1262                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1263                         if (!rt_caching(net)) {
1264                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1265                                         rt->dst.dev->name, num);
1266                         }
1267                         rt_emergency_hash_rebuild(net);
1268                         spin_unlock_bh(rt_hash_lock_addr(hash));
1269
1270                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1271                                         ifindex, rt_genid(net));
1272                         goto restart;
1273                 }
1274         }
1275
1276         /* Try to bind the route to ARP only if it is an output
1277            route or a unicast forwarding path.
1278          */
1279         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1280                 int err = rt_bind_neighbour(rt);
1281                 if (err) {
1282                         spin_unlock_bh(rt_hash_lock_addr(hash));
1283
1284                         if (err != -ENOBUFS) {
1285                                 rt_drop(rt);
1286                                 return ERR_PTR(err);
1287                         }
1288
1289                         /* Neighbour tables are full and nothing
1290                            can be released. Try to shrink the route cache;
1291                            it most likely holds some neighbour records.
1292                          */
1293                         if (attempts-- > 0) {
1294                                 int saved_elasticity = ip_rt_gc_elasticity;
1295                                 int saved_int = ip_rt_gc_min_interval;
1296                                 ip_rt_gc_elasticity     = 1;
1297                                 ip_rt_gc_min_interval   = 0;
1298                                 rt_garbage_collect(&ipv4_dst_ops);
1299                                 ip_rt_gc_min_interval   = saved_int;
1300                                 ip_rt_gc_elasticity     = saved_elasticity;
1301                                 goto restart;
1302                         }
1303
1304                         if (net_ratelimit())
1305                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1306                         rt_drop(rt);
1307                         return ERR_PTR(-ENOBUFS);
1308                 }
1309         }
1310
1311         rt->dst.rt_next = rt_hash_table[hash].chain;
1312
1313         /*
1314          * Since lookup is lockfree, we must make sure
1315          * previous writes to rt are committed to memory
1316          * before making rt visible to other CPUS.
1317          */
1318         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1319
1320         spin_unlock_bh(rt_hash_lock_addr(hash));
1321
1322 skip_hashing:
1323         if (skb)
1324                 skb_dst_set(skb, &rt->dst);
1325         return rt;
1326 }
1327
1328 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1329
1330 static u32 rt_peer_genid(void)
1331 {
1332         return atomic_read(&__rt_peer_genid);
1333 }
1334
1335 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1336 {
1337         struct inet_peer *peer;
1338
1339         peer = inet_getpeer_v4(daddr, create);
1340
1341         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1342                 inet_putpeer(peer);
1343         else
1344                 rt->rt_peer_genid = rt_peer_genid();
1345 }
1346
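/*
 * IP identification generators: a small hashed array of counters, each
 * paired with a jiffies timestamp that is used to add random noise when a
 * bucket has been idle (see ip_idents_reserve() below).
 */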
1347 #define IP_IDENTS_SZ 2048u
1348 struct ip_ident_bucket {
1349         atomic_t        id;
1350         u32             stamp32;
1351 };
1352
1353 static struct ip_ident_bucket *ip_idents __read_mostly;
1354
1355 /* In order to protect privacy, we add a perturbation to identifiers
1356  * if one generator is seldom used. This makes it hard for an attacker
1357  * to infer how many packets were sent between two points in time.
1358  */
1359 u32 ip_idents_reserve(u32 hash, int segs)
1360 {
1361         struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
1362         u32 old = ACCESS_ONCE(bucket->stamp32);
1363         u32 now = (u32)jiffies;
1364         u32 delta = 0;
1365
1366         if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
1367                 u64 x = random32();
1368
1369                 x *= (now - old);
1370                 delta = (u32)(x >> 32);
1371         }
1372
1373         return atomic_add_return(segs + delta, &bucket->id) - segs;
1374 }
1375 EXPORT_SYMBOL(ip_idents_reserve);
1376
1377 void __ip_select_ident(struct iphdr *iph, int segs)
1378 {
1379         static u32 ip_idents_hashrnd __read_mostly;
1380         static bool hashrnd_initialized = false;
1381         u32 hash, id;
1382
1383         if (unlikely(!hashrnd_initialized)) {
1384                 hashrnd_initialized = true;
1385                 get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
1386         }
1387
1388         hash = jhash_3words((__force u32)iph->daddr,
1389                             (__force u32)iph->saddr,
1390                             iph->protocol,
1391                             ip_idents_hashrnd);
1392         id = ip_idents_reserve(hash, segs);
1393         iph->id = htons(id);
1394 }
1395 EXPORT_SYMBOL(__ip_select_ident);
1396
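/*
 * Unlink one specific entry from its hash chain, dropping any expired
 * entries encountered on the way, and release the reference held on it.
 */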
1397 static void rt_del(unsigned hash, struct rtable *rt)
1398 {
1399         struct rtable __rcu **rthp;
1400         struct rtable *aux;
1401
1402         rthp = &rt_hash_table[hash].chain;
1403         spin_lock_bh(rt_hash_lock_addr(hash));
1404         ip_rt_put(rt);
1405         while ((aux = rcu_dereference_protected(*rthp,
1406                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1407                 if (aux == rt || rt_is_expired(aux)) {
1408                         *rthp = aux->dst.rt_next;
1409                         rt_free(aux);
1410                         continue;
1411                 }
1412                 rthp = &aux->dst.rt_next;
1413         }
1414         spin_unlock_bh(rt_hash_lock_addr(hash));
1415 }
1416
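/*
 * Apply a gateway learned from an ICMP redirect and recorded on the
 * inet_peer: switch rt_gateway, rebind the route's neighbour to the new
 * gateway, and back out if no neighbour entry can be created.
 */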
1417 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1418 {
1419         struct rtable *rt = (struct rtable *) dst;
1420         __be32 orig_gw = rt->rt_gateway;
1421         struct neighbour *n, *old_n;
1422
1423         dst_confirm(&rt->dst);
1424
1425         rt->rt_gateway = peer->redirect_learned.a4;
1426
1427         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1428         if (IS_ERR(n)) {
1429                 rt->rt_gateway = orig_gw;
1430                 return;
1431         }
1432         old_n = xchg(&rt->dst._neighbour, n);
1433         if (old_n)
1434                 neigh_release(old_n);
1435         if (!(n->nud_state & NUD_VALID)) {
1436                 neigh_event_send(n, NULL);
1437         } else {
1438                 rt->rt_flags |= RTCF_REDIRECTED;
1439                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1440         }
1441 }
1442
1443 /* called in rcu_read_lock() section */
1444 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1445                     __be32 saddr, struct net_device *dev)
1446 {
1447         int s, i;
1448         struct in_device *in_dev = __in_dev_get_rcu(dev);
1449         __be32 skeys[2] = { saddr, 0 };
1450         int    ikeys[2] = { dev->ifindex, 0 };
1451         struct inet_peer *peer;
1452         struct net *net;
1453
1454         if (!in_dev)
1455                 return;
1456
1457         net = dev_net(dev);
1458         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1459             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1460             ipv4_is_zeronet(new_gw))
1461                 goto reject_redirect;
1462
1463         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1464                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1465                         goto reject_redirect;
1466                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1467                         goto reject_redirect;
1468         } else {
1469                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1470                         goto reject_redirect;
1471         }
1472
1473         for (s = 0; s < 2; s++) {
1474                 for (i = 0; i < 2; i++) {
1475                         unsigned int hash;
1476                         struct rtable __rcu **rthp;
1477                         struct rtable *rt;
1478
1479                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1480
1481                         rthp = &rt_hash_table[hash].chain;
1482
1483                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1484                                 rthp = &rt->dst.rt_next;
1485
1486                                 if (rt->rt_key_dst != daddr ||
1487                                     rt->rt_key_src != skeys[s] ||
1488                                     rt->rt_oif != ikeys[i] ||
1489                                     rt_is_input_route(rt) ||
1490                                     rt_is_expired(rt) ||
1491                                     !net_eq(dev_net(rt->dst.dev), net) ||
1492                                     rt->dst.error ||
1493                                     rt->dst.dev != dev ||
1494                                     rt->rt_gateway != old_gw)
1495                                         continue;
1496
1497                                 if (!rt->peer)
1498                                         rt_bind_peer(rt, rt->rt_dst, 1);
1499
1500                                 peer = rt->peer;
1501                                 if (peer) {
1502                                         if (peer->redirect_learned.a4 != new_gw ||
1503                                             peer->redirect_genid != redirect_genid) {
1504                                                 peer->redirect_learned.a4 = new_gw;
1505                                                 peer->redirect_genid = redirect_genid;
1506                                                 atomic_inc(&__rt_peer_genid);
1507                                         }
1508                                         check_peer_redir(&rt->dst, peer);
1509                                 }
1510                         }
1511                 }
1512         }
1513         return;
1514
1515 reject_redirect:
1516 #ifdef CONFIG_IP_ROUTE_VERBOSE
1517         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1518                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1519                         "  Advised path = %pI4 -> %pI4\n",
1520                        &old_gw, dev->name, &new_gw,
1521                        &saddr, &daddr);
1522 #endif
1523         ;
1524 }
1525
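/* peer->pmtu_expires is cleared with cmpxchg() so that, of all concurrent
 * callers, exactly one sees the expiry (or the cleanup) succeed and gets to
 * restore the original MTU metric.
 */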
1526 static bool peer_pmtu_expired(struct inet_peer *peer)
1527 {
1528         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1529
1530         return orig &&
1531                time_after_eq(jiffies, orig) &&
1532                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1533 }
1534
1535 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1536 {
1537         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1538
1539         return orig &&
1540                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1541 }
1542
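/* ->negative_advice callback: drop obsolete entries, unhash redirected ones,
 * and roll an expired learned PMTU back to the original value.
 */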
1543 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1544 {
1545         struct rtable *rt = (struct rtable *)dst;
1546         struct dst_entry *ret = dst;
1547
1548         if (rt) {
1549                 if (dst->obsolete > 0) {
1550                         ip_rt_put(rt);
1551                         ret = NULL;
1552                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1553                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1554                                                 rt->rt_oif,
1555                                                 rt_genid(dev_net(dst->dev)));
1556                         rt_del(hash, rt);
1557                         ret = NULL;
1558                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1559                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1560                 }
1561         }
1562         return ret;
1563 }
1564
1565 /*
1566  * Algorithm:
1567  *      1. The first ip_rt_redirect_number redirects are sent
1568  *         with exponential backoff, after which we stop sending them altogether,
1569  *         assuming that the host ignores our redirects.
1570  *      2. If we do not see packets requiring redirects
1571  *         during ip_rt_redirect_silence, we assume that the host has
1572  *         forgotten the redirected route and start sending redirects again.
1573  *
1574  * This algorithm is much cheaper and more intelligent than dumb load limiting
1575  * in icmp.c.
1576  *
1577  * NOTE. Do not forget to inhibit load limiting for redirects (it is redundant)
1578  * and "frag. needed" (it breaks PMTU discovery) in icmp.c.
1579  */
1580
1581 void ip_rt_send_redirect(struct sk_buff *skb)
1582 {
1583         struct rtable *rt = skb_rtable(skb);
1584         struct in_device *in_dev;
1585         struct inet_peer *peer;
1586         int log_martians;
1587
1588         rcu_read_lock();
1589         in_dev = __in_dev_get_rcu(rt->dst.dev);
1590         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1591                 rcu_read_unlock();
1592                 return;
1593         }
1594         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1595         rcu_read_unlock();
1596
1597         if (!rt->peer)
1598                 rt_bind_peer(rt, rt->rt_dst, 1);
1599         peer = rt->peer;
1600         if (!peer) {
1601                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1602                 return;
1603         }
1604
1605         /* No redirected packets during ip_rt_redirect_silence;
1606          * reset the algorithm.
1607          */
1608         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1609                 peer->rate_tokens = 0;
1610
1611         /* Too many ignored redirects; do not send anything.
1612          * Set dst.rate_last to the last seen redirected packet.
1613          */
1614         if (peer->rate_tokens >= ip_rt_redirect_number) {
1615                 peer->rate_last = jiffies;
1616                 return;
1617         }
1618
1619         /* Check for load limit; set rate_last to the latest sent
1620          * redirect.
1621          */
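        /* With rate_tokens == k, the next redirect is eligible only after
         * rate_last + (ip_rt_redirect_load << k), i.e. the interval doubles
         * with every redirect already sent.
         */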
1622         if (peer->rate_tokens == 0 ||
1623             time_after(jiffies,
1624                        (peer->rate_last +
1625                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1626                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1627                 peer->rate_last = jiffies;
1628                 ++peer->rate_tokens;
1629 #ifdef CONFIG_IP_ROUTE_VERBOSE
1630                 if (log_martians &&
1631                     peer->rate_tokens == ip_rt_redirect_number &&
1632                     net_ratelimit())
1633                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1634                                &ip_hdr(skb)->saddr, rt->rt_iif,
1635                                 &rt->rt_dst, &rt->rt_gateway);
1636 #endif
1637         }
1638 }
1639
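/* Generate an ICMP destination-unreachable error for a packet that matched
 * an error route.  A per-destination token bucket (one token per jiffy, capped
 * at ip_rt_error_burst, ip_rt_error_cost per message) rate-limits the replies.
 */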
1640 static int ip_error(struct sk_buff *skb)
1641 {
1642         struct rtable *rt = skb_rtable(skb);
1643         struct inet_peer *peer;
1644         unsigned long now;
1645         bool send;
1646         int code;
1647
1648         switch (rt->dst.error) {
1649         case EINVAL:
1650         default:
1651                 goto out;
1652         case EHOSTUNREACH:
1653                 code = ICMP_HOST_UNREACH;
1654                 break;
1655         case ENETUNREACH:
1656                 code = ICMP_NET_UNREACH;
1657                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1658                                 IPSTATS_MIB_INNOROUTES);
1659                 break;
1660         case EACCES:
1661                 code = ICMP_PKT_FILTERED;
1662                 break;
1663         }
1664
1665         if (!rt->peer)
1666                 rt_bind_peer(rt, rt->rt_dst, 1);
1667         peer = rt->peer;
1668
1669         send = true;
1670         if (peer) {
1671                 now = jiffies;
1672                 peer->rate_tokens += now - peer->rate_last;
1673                 if (peer->rate_tokens > ip_rt_error_burst)
1674                         peer->rate_tokens = ip_rt_error_burst;
1675                 peer->rate_last = now;
1676                 if (peer->rate_tokens >= ip_rt_error_cost)
1677                         peer->rate_tokens -= ip_rt_error_cost;
1678                 else
1679                         send = false;
1680         }
1681         if (send)
1682                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1683
1684 out:    kfree_skb(skb);
1685         return 0;
1686 }
1687
1688 /*
1689  *      The last two values are not from the RFC but
1690  *      are needed for AMPRnet AX.25 paths.
1691  */
1692
1693 static const unsigned short mtu_plateau[] =
1694 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1695
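/* RFC 1191 style plateau search: return the largest table entry strictly
 * below old_mtu, falling back to 68, the minimum IPv4 MTU.
 */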
1696 static inline unsigned short guess_mtu(unsigned short old_mtu)
1697 {
1698         int i;
1699
1700         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1701                 if (old_mtu > mtu_plateau[i])
1702                         return mtu_plateau[i];
1703         return 68;
1704 }
1705
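/* Handle an ICMP "fragmentation needed" report: sanitise the advertised MTU
 * (guessing from the plateau table when it is bogus, clamping to
 * ip_rt_min_pmtu), record it on the destination's inet_peer with an expiry,
 * and return the estimate, or new_mtu if nothing was learned.
 */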
1706 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1707                                  unsigned short new_mtu,
1708                                  struct net_device *dev)
1709 {
1710         unsigned short old_mtu = ntohs(iph->tot_len);
1711         unsigned short est_mtu = 0;
1712         struct inet_peer *peer;
1713
1714         peer = inet_getpeer_v4(iph->daddr, 1);
1715         if (peer) {
1716                 unsigned short mtu = new_mtu;
1717
1718                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1719                         /* BSD 4.2 derived systems incorrectly adjust
1720                          * tot_len by the IP header length, and report
1721                          * a zero MTU in the ICMP message.
1722                          */
1723                         if (mtu == 0 &&
1724                             old_mtu >= 68 + (iph->ihl << 2))
1725                                 old_mtu -= iph->ihl << 2;
1726                         mtu = guess_mtu(old_mtu);
1727                 }
1728
1729                 if (mtu < ip_rt_min_pmtu)
1730                         mtu = ip_rt_min_pmtu;
1731                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1732                         unsigned long pmtu_expires;
1733
1734                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1735                         if (!pmtu_expires)
1736                                 pmtu_expires = 1UL;
1737
1738                         est_mtu = mtu;
1739                         peer->pmtu_learned = mtu;
1740                         peer->pmtu_expires = pmtu_expires;
1741                         atomic_inc(&__rt_peer_genid);
1742                 }
1743
1744                 inet_putpeer(peer);
1745         }
1746         return est_mtu ? : new_mtu;
1747 }
1748
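/* Propagate the peer's learned PMTU into the route's MTU metric while it is
 * still valid; once it has expired, restore the original MTU (the cmpxchg
 * ensures only one caller performs the restore).
 */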
1749 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1750 {
1751         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1752
1753         if (!expires)
1754                 return;
1755         if (time_before(jiffies, expires)) {
1756                 u32 orig_dst_mtu = dst_mtu(dst);
1757                 if (peer->pmtu_learned < orig_dst_mtu) {
1758                         if (!peer->pmtu_orig)
1759                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1760                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1761                 }
1762         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1763                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1764 }
1765
1766 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1767 {
1768         struct rtable *rt = (struct rtable *) dst;
1769         struct inet_peer *peer;
1770
1771         dst_confirm(dst);
1772
1773         if (!rt->peer)
1774                 rt_bind_peer(rt, rt->rt_dst, 1);
1775         peer = rt->peer;
1776         if (peer) {
1777                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1778
1779                 if (mtu < ip_rt_min_pmtu)
1780                         mtu = ip_rt_min_pmtu;
1781                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1782
1783                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1784                         if (!pmtu_expires)
1785                                 pmtu_expires = 1UL;
1786
1787                         peer->pmtu_learned = mtu;
1788                         peer->pmtu_expires = pmtu_expires;
1789
1790                         atomic_inc(&__rt_peer_genid);
1791                         rt->rt_peer_genid = rt_peer_genid();
1792                 }
1793                 check_peer_pmtu(dst, peer);
1794         }
1795 }
1796
1797
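/* Lazily revalidate a cached route against its inet_peer when the global
 * peer generation counter has moved on: re-apply the learned PMTU and any
 * learned redirect gateway, then record the current generation.
 */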
1798 static void ipv4_validate_peer(struct rtable *rt)
1799 {
1800         if (rt->rt_peer_genid != rt_peer_genid()) {
1801                 struct inet_peer *peer;
1802
1803                 if (!rt->peer)
1804                         rt_bind_peer(rt, rt->rt_dst, 0);
1805
1806                 peer = rt->peer;
1807                 if (peer) {
1808                         check_peer_pmtu(&rt->dst, peer);
1809
1810                         if (peer->redirect_genid != redirect_genid)
1811                                 peer->redirect_learned.a4 = 0;
1812                         if (peer->redirect_learned.a4 &&
1813                             peer->redirect_learned.a4 != rt->rt_gateway)
1814                                 check_peer_redir(&rt->dst, peer);
1815                 }
1816
1817                 rt->rt_peer_genid = rt_peer_genid();
1818         }
1819 }
1820
1821 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1822 {
1823         struct rtable *rt = (struct rtable *) dst;
1824
1825         if (rt_is_expired(rt))
1826                 return NULL;
1827         ipv4_validate_peer(rt);
1828         return dst;
1829 }
1830
1831 static void ipv4_dst_destroy(struct dst_entry *dst)
1832 {
1833         struct rtable *rt = (struct rtable *) dst;
1834         struct inet_peer *peer = rt->peer;
1835
1836         if (rt->fi) {
1837                 fib_info_put(rt->fi);
1838                 rt->fi = NULL;
1839         }
1840         if (peer) {
1841                 rt->peer = NULL;
1842                 inet_putpeer(peer);
1843         }
1844 }
1845
1846
1847 static void ipv4_link_failure(struct sk_buff *skb)
1848 {
1849         struct rtable *rt;
1850
1851         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1852
1853         rt = skb_rtable(skb);
1854         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1855                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1856 }
1857
1858 static int ip_rt_bug(struct sk_buff *skb)
1859 {
1860         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1861                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1862                 skb->dev ? skb->dev->name : "?");
1863         kfree_skb(skb);
1864         WARN_ON(1);
1865         return 0;
1866 }
1867
1868 /*
1869    We do not cache the source address of the outgoing interface,
1870    because it is used only by the IP RR, TS and SRR options,
1871    so it stays out of the fast path.
1872
1873    BTW remember: "addr" may be unaligned when it points
1874    into IP options!
1875  */
1876
1877 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1878 {
1879         __be32 src;
1880
1881         if (rt_is_output_route(rt))
1882                 src = ip_hdr(skb)->saddr;
1883         else {
1884                 struct fib_result res;
1885                 struct flowi4 fl4;
1886                 struct iphdr *iph;
1887
1888                 iph = ip_hdr(skb);
1889
1890                 memset(&fl4, 0, sizeof(fl4));
1891                 fl4.daddr = iph->daddr;
1892                 fl4.saddr = iph->saddr;
1893                 fl4.flowi4_tos = RT_TOS(iph->tos);
1894                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1895                 fl4.flowi4_iif = skb->dev->ifindex;
1896                 fl4.flowi4_mark = skb->mark;
1897
1898                 rcu_read_lock();
1899                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1900                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1901                 else
1902                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1903                                         RT_SCOPE_UNIVERSE);
1904                 rcu_read_unlock();
1905         }
1906         memcpy(addr, &src, 4);
1907 }
1908
1909 #ifdef CONFIG_IP_ROUTE_CLASSID
1910 static void set_class_tag(struct rtable *rt, u32 tag)
1911 {
1912         if (!(rt->dst.tclassid & 0xFFFF))
1913                 rt->dst.tclassid |= tag & 0xFFFF;
1914         if (!(rt->dst.tclassid & 0xFFFF0000))
1915                 rt->dst.tclassid |= tag & 0xFFFF0000;
1916 }
1917 #endif
1918
1919 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1920 {
1921         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1922
1923         if (advmss == 0) {
1924                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1925                                ip_rt_min_advmss);
1926                 if (advmss > 65535 - 40)
1927                         advmss = 65535 - 40;
1928         }
1929         return advmss;
1930 }
1931
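/* Report the route's MTU: prefer an explicit RTAX_MTU metric on output
 * routes, otherwise use the device MTU, clamped to 576 for locked gatewayed
 * routes and to IP_MAX_MTU overall.
 */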
1932 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1933 {
1934         const struct rtable *rt = (const struct rtable *) dst;
1935         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1936
1937         if (mtu && rt_is_output_route(rt))
1938                 return mtu;
1939
1940         mtu = dst->dev->mtu;
1941
1942         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1943
1944                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1945                         mtu = 576;
1946         }
1947
1948         if (mtu > IP_MAX_MTU)
1949                 mtu = IP_MAX_MTU;
1950
1951         return mtu;
1952 }
1953
1954 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1955                             struct fib_info *fi)
1956 {
1957         struct inet_peer *peer;
1958         int create = 0;
1959
1960         /* If a peer entry exists for this destination, we must hook
1961          * it up in order to get at cached metrics.
1962          */
1963         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1964                 create = 1;
1965
1966         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1967         if (peer) {
1968                 rt->rt_peer_genid = rt_peer_genid();
1969                 if (inet_metrics_new(peer))
1970                         memcpy(peer->metrics, fi->fib_metrics,
1971                                sizeof(u32) * RTAX_MAX);
1972                 dst_init_metrics(&rt->dst, peer->metrics, false);
1973
1974                 check_peer_pmtu(&rt->dst, peer);
1975                 if (peer->redirect_genid != redirect_genid)
1976                         peer->redirect_learned.a4 = 0;
1977                 if (peer->redirect_learned.a4 &&
1978                     peer->redirect_learned.a4 != rt->rt_gateway) {
1979                         rt->rt_gateway = peer->redirect_learned.a4;
1980                         rt->rt_flags |= RTCF_REDIRECTED;
1981                 }
1982         } else {
1983                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1984                         rt->fi = fi;
1985                         atomic_inc(&fi->fib_clntref);
1986                 }
1987                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1988         }
1989 }
1990
1991 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1992                            const struct fib_result *res,
1993                            struct fib_info *fi, u16 type, u32 itag)
1994 {
1995         struct dst_entry *dst = &rt->dst;
1996
1997         if (fi) {
1998                 if (FIB_RES_GW(*res) &&
1999                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
2000                         rt->rt_gateway = FIB_RES_GW(*res);
2001                 rt_init_metrics(rt, fl4, fi);
2002 #ifdef CONFIG_IP_ROUTE_CLASSID
2003                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
2004 #endif
2005         }
2006
2007         if (dst_mtu(dst) > IP_MAX_MTU)
2008                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
2009         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
2010                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
2011
2012 #ifdef CONFIG_IP_ROUTE_CLASSID
2013 #ifdef CONFIG_IP_MULTIPLE_TABLES
2014         set_class_tag(rt, fib_rules_tclass(res));
2015 #endif
2016         set_class_tag(rt, itag);
2017 #endif
2018 }
2019
2020 static struct rtable *rt_dst_alloc(struct net_device *dev,
2021                                    bool nopolicy, bool noxfrm)
2022 {
2023         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2024                          DST_HOST |
2025                          (nopolicy ? DST_NOPOLICY : 0) |
2026                          (noxfrm ? DST_NOXFRM : 0));
2027 }
2028
2029 /* called in rcu_read_lock() section */
2030 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2031                                 u8 tos, struct net_device *dev, int our)
2032 {
2033         unsigned int hash;
2034         struct rtable *rth;
2035         __be32 spec_dst;
2036         struct in_device *in_dev = __in_dev_get_rcu(dev);
2037         u32 itag = 0;
2038         int err;
2039
2040         /* Primary sanity checks. */
2041
2042         if (in_dev == NULL)
2043                 return -EINVAL;
2044
2045         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2046             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2047                 goto e_inval;
2048
2049         if (ipv4_is_zeronet(saddr)) {
2050                 if (!ipv4_is_local_multicast(daddr))
2051                         goto e_inval;
2052                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2053         } else {
2054                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2055                                           &itag);
2056                 if (err < 0)
2057                         goto e_err;
2058         }
2059         rth = rt_dst_alloc(init_net.loopback_dev,
2060                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2061         if (!rth)
2062                 goto e_nobufs;
2063
2064 #ifdef CONFIG_IP_ROUTE_CLASSID
2065         rth->dst.tclassid = itag;
2066 #endif
2067         rth->dst.output = ip_rt_bug;
2068
2069         rth->rt_key_dst = daddr;
2070         rth->rt_key_src = saddr;
2071         rth->rt_genid   = rt_genid(dev_net(dev));
2072         rth->rt_flags   = RTCF_MULTICAST;
2073         rth->rt_type    = RTN_MULTICAST;
2074         rth->rt_key_tos = tos;
2075         rth->rt_dst     = daddr;
2076         rth->rt_src     = saddr;
2077         rth->rt_route_iif = dev->ifindex;
2078         rth->rt_iif     = dev->ifindex;
2079         rth->rt_oif     = 0;
2080         rth->rt_mark    = skb->mark;
2081         rth->rt_gateway = daddr;
2082         rth->rt_spec_dst= spec_dst;
2083         rth->rt_peer_genid = 0;
2084         rth->peer = NULL;
2085         rth->fi = NULL;
2086         if (our) {
2087                 rth->dst.input= ip_local_deliver;
2088                 rth->rt_flags |= RTCF_LOCAL;
2089         }
2090
2091 #ifdef CONFIG_IP_MROUTE
2092         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2093                 rth->dst.input = ip_mr_input;
2094 #endif
2095         RT_CACHE_STAT_INC(in_slow_mc);
2096
2097         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2098         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2099         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2100
2101 e_nobufs:
2102         return -ENOBUFS;
2103 e_inval:
2104         return -EINVAL;
2105 e_err:
2106         return err;
2107 }
2108
2109
2110 static void ip_handle_martian_source(struct net_device *dev,
2111                                      struct in_device *in_dev,
2112                                      struct sk_buff *skb,
2113                                      __be32 daddr,
2114                                      __be32 saddr)
2115 {
2116         RT_CACHE_STAT_INC(in_martian_src);
2117 #ifdef CONFIG_IP_ROUTE_VERBOSE
2118         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2119                 /*
2120                  *      RFC1812 recommendation: if the source is martian,
2121                  *      the only hint is the MAC header.
2122                  */
2123                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2124                         &daddr, &saddr, dev->name);
2125                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2126                         int i;
2127                         const unsigned char *p = skb_mac_header(skb);
2128                         printk(KERN_WARNING "ll header: ");
2129                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2130                                 printk("%02x", *p);
2131                                 if (i < (dev->hard_header_len - 1))
2132                                         printk(":");
2133                         }
2134                         printk("\n");
2135                 }
2136         }
2137 #endif
2138 }
2139
2140 /* called in rcu_read_lock() section */
2141 static int __mkroute_input(struct sk_buff *skb,
2142                            const struct fib_result *res,
2143                            struct in_device *in_dev,
2144                            __be32 daddr, __be32 saddr, u32 tos,
2145                            struct rtable **result)
2146 {
2147         struct rtable *rth;
2148         int err;
2149         struct in_device *out_dev;
2150         unsigned int flags = 0;
2151         __be32 spec_dst;
2152         u32 itag = 0;
2153
2154         /* get a working reference to the output device */
2155         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2156         if (out_dev == NULL) {
2157                 if (net_ratelimit())
2158                         printk(KERN_CRIT "Bug in ip_route_input" \
2159                                "_slow(). Please report.\n");
2160                 return -EINVAL;
2161         }
2162
2163
2164         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2165                                   in_dev->dev, &spec_dst, &itag);
2166         if (err < 0) {
2167                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2168                                          saddr);
2169
2170                 goto cleanup;
2171         }
2172
2173         if (err)
2174                 flags |= RTCF_DIRECTSRC;
2175
2176         if (out_dev == in_dev && err &&
2177             (IN_DEV_SHARED_MEDIA(out_dev) ||
2178              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2179                 flags |= RTCF_DOREDIRECT;
2180
2181         if (skb->protocol != htons(ETH_P_IP)) {
2182                 /* Not IP (i.e. ARP). Do not create a route if it is
2183                  * invalid for proxy arp. DNAT routes are always valid.
2184                  *
2185                  * The proxy arp feature has been extended to allow ARP
2186                  * replies back out of the same interface, to support
2187                  * Private VLAN switch technologies. See arp.c.
2188                  */
2189                 if (out_dev == in_dev &&
2190                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2191                         err = -EINVAL;
2192                         goto cleanup;
2193                 }
2194         }
2195
2196         rth = rt_dst_alloc(out_dev->dev,
2197                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2198                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2199         if (!rth) {
2200                 err = -ENOBUFS;
2201                 goto cleanup;
2202         }
2203
2204         rth->rt_key_dst = daddr;
2205         rth->rt_key_src = saddr;
2206         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2207         rth->rt_flags = flags;
2208         rth->rt_type = res->type;
2209         rth->rt_key_tos = tos;
2210         rth->rt_dst     = daddr;
2211         rth->rt_src     = saddr;
2212         rth->rt_route_iif = in_dev->dev->ifindex;
2213         rth->rt_iif     = in_dev->dev->ifindex;
2214         rth->rt_oif     = 0;
2215         rth->rt_mark    = skb->mark;
2216         rth->rt_gateway = daddr;
2217         rth->rt_spec_dst= spec_dst;
2218         rth->rt_peer_genid = 0;
2219         rth->peer = NULL;
2220         rth->fi = NULL;
2221
2222         rth->dst.input = ip_forward;
2223         rth->dst.output = ip_output;
2224
2225         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2226
2227         *result = rth;
2228         err = 0;
2229  cleanup:
2230         return err;
2231 }
2232
2233 static int ip_mkroute_input(struct sk_buff *skb,
2234                             struct fib_result *res,
2235                             const struct flowi4 *fl4,
2236                             struct in_device *in_dev,
2237                             __be32 daddr, __be32 saddr, u32 tos)
2238 {
2239         struct rtable* rth = NULL;
2240         int err;
2241         unsigned hash;
2242
2243 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2244         if (res->fi && res->fi->fib_nhs > 1)
2245                 fib_select_multipath(res);
2246 #endif
2247
2248         /* create a routing cache entry */
2249         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2250         if (err)
2251                 return err;
2252
2253         /* put it into the cache */
2254         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2255                        rt_genid(dev_net(rth->dst.dev)));
2256         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2257         if (IS_ERR(rth))
2258                 return PTR_ERR(rth);
2259         return 0;
2260 }
2261
2262 /*
2263  *      NOTE. We drop all packets that have a local source
2264  *      address, because every properly looped-back packet
2265  *      must already have the correct destination attached by the output routine.
2266  *
2267  *      This approach solves two big problems:
2268  *      1. Non-simplex devices are handled properly.
2269  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2270  *      called with rcu_read_lock()
2271  */
2272
2273 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2274                                u8 tos, struct net_device *dev)
2275 {
2276         struct fib_result res;
2277         struct in_device *in_dev = __in_dev_get_rcu(dev);
2278         struct flowi4   fl4;
2279         unsigned        flags = 0;
2280         u32             itag = 0;
2281         struct rtable * rth;
2282         unsigned        hash;
2283         __be32          spec_dst;
2284         int             err = -EINVAL;
2285         struct net    * net = dev_net(dev);
2286
2287         /* IP on this device is disabled. */
2288
2289         if (!in_dev)
2290                 goto out;
2291
2292         /* Check for the most weird martians, which cannot be detected
2293            by fib_lookup.
2294          */
2295
2296         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2297             ipv4_is_loopback(saddr))
2298                 goto martian_source;
2299
2300         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2301                 goto brd_input;
2302
2303         /* Accept zero addresses only for limited broadcast;
2304          * it is unclear whether this should be fixed. Waiting for complaints :-)
2305          */
2306         if (ipv4_is_zeronet(saddr))
2307                 goto martian_source;
2308
2309         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2310                 goto martian_destination;
2311
2312         /*
2313          *      Now we are ready to route the packet.
2314          */
2315         fl4.flowi4_oif = 0;
2316         fl4.flowi4_iif = dev->ifindex;
2317         fl4.flowi4_mark = skb->mark;
2318         fl4.flowi4_tos = tos;
2319         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2320         fl4.daddr = daddr;
2321         fl4.saddr = saddr;
2322         err = fib_lookup(net, &fl4, &res);
2323         if (err != 0) {
2324                 if (!IN_DEV_FORWARD(in_dev))
2325                         goto e_hostunreach;
2326                 goto no_route;
2327         }
2328
2329         RT_CACHE_STAT_INC(in_slow_tot);
2330
2331         if (res.type == RTN_BROADCAST)
2332                 goto brd_input;
2333
2334         if (res.type == RTN_LOCAL) {
2335                 err = fib_validate_source(skb, saddr, daddr, tos,
2336                                           net->loopback_dev->ifindex,
2337                                           dev, &spec_dst, &itag);
2338                 if (err < 0)
2339                         goto martian_source_keep_err;
2340                 if (err)
2341                         flags |= RTCF_DIRECTSRC;
2342                 spec_dst = daddr;
2343                 goto local_input;
2344         }
2345
2346         if (!IN_DEV_FORWARD(in_dev))
2347                 goto e_hostunreach;
2348         if (res.type != RTN_UNICAST)
2349                 goto martian_destination;
2350
2351         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2352 out:    return err;
2353
2354 brd_input:
2355         if (skb->protocol != htons(ETH_P_IP))
2356                 goto e_inval;
2357
2358         if (ipv4_is_zeronet(saddr))
2359                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2360         else {
2361                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2362                                           &itag);
2363                 if (err < 0)
2364                         goto martian_source_keep_err;
2365                 if (err)
2366                         flags |= RTCF_DIRECTSRC;
2367         }
2368         flags |= RTCF_BROADCAST;
2369         res.type = RTN_BROADCAST;
2370         RT_CACHE_STAT_INC(in_brd);
2371
2372 local_input:
2373         rth = rt_dst_alloc(net->loopback_dev,
2374                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2375         if (!rth)
2376                 goto e_nobufs;
2377
2378         rth->dst.input= ip_local_deliver;
2379         rth->dst.output= ip_rt_bug;
2380 #ifdef CONFIG_IP_ROUTE_CLASSID
2381         rth->dst.tclassid = itag;
2382 #endif
2383
2384         rth->rt_key_dst = daddr;
2385         rth->rt_key_src = saddr;
2386         rth->rt_genid = rt_genid(net);
2387         rth->rt_flags   = flags|RTCF_LOCAL;
2388         rth->rt_type    = res.type;
2389         rth->rt_key_tos = tos;
2390         rth->rt_dst     = daddr;
2391         rth->rt_src     = saddr;
2392 #ifdef CONFIG_IP_ROUTE_CLASSID
2393         rth->dst.tclassid = itag;
2394 #endif
2395         rth->rt_route_iif = dev->ifindex;
2396         rth->rt_iif     = dev->ifindex;
2397         rth->rt_oif     = 0;
2398         rth->rt_mark    = skb->mark;
2399         rth->rt_gateway = daddr;
2400         rth->rt_spec_dst= spec_dst;
2401         rth->rt_peer_genid = 0;
2402         rth->peer = NULL;
2403         rth->fi = NULL;
2404         if (res.type == RTN_UNREACHABLE) {
2405                 rth->dst.input= ip_error;
2406                 rth->dst.error= -err;
2407                 rth->rt_flags   &= ~RTCF_LOCAL;
2408         }
2409         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2410         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2411         err = 0;
2412         if (IS_ERR(rth))
2413                 err = PTR_ERR(rth);
2414         goto out;
2415
2416 no_route:
2417         RT_CACHE_STAT_INC(in_no_route);
2418         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2419         res.type = RTN_UNREACHABLE;
2420         if (err == -ESRCH)
2421                 err = -ENETUNREACH;
2422         goto local_input;
2423
2424         /*
2425          *      Do not cache martian addresses: they should be logged (RFC1812)
2426          */
2427 martian_destination:
2428         RT_CACHE_STAT_INC(in_martian_dst);
2429 #ifdef CONFIG_IP_ROUTE_VERBOSE
2430         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2431                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2432                         &daddr, &saddr, dev->name);
2433 #endif
2434
2435 e_hostunreach:
2436         err = -EHOSTUNREACH;
2437         goto out;
2438
2439 e_inval:
2440         err = -EINVAL;
2441         goto out;
2442
2443 e_nobufs:
2444         err = -ENOBUFS;
2445         goto out;
2446
2447 martian_source:
2448         err = -EINVAL;
2449 martian_source_keep_err:
2450         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2451         goto out;
2452 }
2453
2454 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2455                            u8 tos, struct net_device *dev, bool noref)
2456 {
2457         struct rtable * rth;
2458         unsigned        hash;
2459         int iif = dev->ifindex;
2460         struct net *net;
2461         int res;
2462
2463         net = dev_net(dev);
2464
2465         rcu_read_lock();
2466
2467         if (!rt_caching(net))
2468                 goto skip_cache;
2469
2470         tos &= IPTOS_RT_MASK;
2471         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2472
2473         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2474              rth = rcu_dereference(rth->dst.rt_next)) {
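                /* Branch-free key compare: XOR each lookup key with the
                 * cached value and OR the results; the condition is zero
                 * only if daddr, saddr, iif and tos all match exactly.
                 */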
2475                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2476                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2477                      (rth->rt_route_iif ^ iif) |
2478                      (rth->rt_key_tos ^ tos)) == 0 &&
2479                     rth->rt_mark == skb->mark &&
2480                     net_eq(dev_net(rth->dst.dev), net) &&
2481                     !rt_is_expired(rth)) {
2482                         ipv4_validate_peer(rth);
2483                         if (noref) {
2484                                 dst_use_noref(&rth->dst, jiffies);
2485                                 skb_dst_set_noref(skb, &rth->dst);
2486                         } else {
2487                                 dst_use(&rth->dst, jiffies);
2488                                 skb_dst_set(skb, &rth->dst);
2489                         }
2490                         RT_CACHE_STAT_INC(in_hit);
2491                         rcu_read_unlock();
2492                         return 0;
2493                 }
2494                 RT_CACHE_STAT_INC(in_hlist_search);
2495         }
2496
2497 skip_cache:
2498         /* Multicast recognition logic was moved from the route cache to here.
2499            The problem was that too many Ethernet cards have broken/missing
2500            hardware multicast filters :-( As a result, a host on a multicast
2501            network acquires a lot of useless route cache entries, e.g. from
2502            SDR messages from all over the world. Now we try to get rid of them.
2503            Provided the software IP multicast filter is organized
2504            reasonably (at least hashed), this does not cause a slowdown
2505            compared with route cache reject entries.
2506            Note that multicast routers are not affected, because
2507            a route cache entry is created for them eventually.
2508          */
2509         if (ipv4_is_multicast(daddr)) {
2510                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2511
2512                 if (in_dev) {
2513                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2514                                                   ip_hdr(skb)->protocol);
2515                         if (our
2516 #ifdef CONFIG_IP_MROUTE
2517                                 ||
2518                             (!ipv4_is_local_multicast(daddr) &&
2519                              IN_DEV_MFORWARD(in_dev))
2520 #endif
2521                            ) {
2522                                 int res = ip_route_input_mc(skb, daddr, saddr,
2523                                                             tos, dev, our);
2524                                 rcu_read_unlock();
2525                                 return res;
2526                         }
2527                 }
2528                 rcu_read_unlock();
2529                 return -EINVAL;
2530         }
2531         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2532         rcu_read_unlock();
2533         return res;
2534 }
2535 EXPORT_SYMBOL(ip_route_input_common);
2536
2537 /* called with rcu_read_lock() */
2538 static struct rtable *__mkroute_output(const struct fib_result *res,
2539                                        const struct flowi4 *fl4,
2540                                        __be32 orig_daddr, __be32 orig_saddr,
2541                                        int orig_oif, __u8 orig_rtos,
2542                                        struct net_device *dev_out,
2543                                        unsigned int flags)
2544 {
2545         struct fib_info *fi = res->fi;
2546         struct in_device *in_dev;
2547         u16 type = res->type;
2548         struct rtable *rth;
2549
2550         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2551                 return ERR_PTR(-EINVAL);
2552
2553         if (ipv4_is_lbcast(fl4->daddr))
2554                 type = RTN_BROADCAST;
2555         else if (ipv4_is_multicast(fl4->daddr))
2556                 type = RTN_MULTICAST;
2557         else if (ipv4_is_zeronet(fl4->daddr))
2558                 return ERR_PTR(-EINVAL);
2559
2560         if (dev_out->flags & IFF_LOOPBACK)
2561                 flags |= RTCF_LOCAL;
2562
2563         in_dev = __in_dev_get_rcu(dev_out);
2564         if (!in_dev)
2565                 return ERR_PTR(-EINVAL);
2566
2567         if (type == RTN_BROADCAST) {
2568                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2569                 fi = NULL;
2570         } else if (type == RTN_MULTICAST) {
2571                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2572                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2573                                      fl4->flowi4_proto))
2574                         flags &= ~RTCF_LOCAL;
2575                 /* If a multicast route does not exist, use the
2576                  * default one, but do not use a gateway in this case.
2577                  * Yes, it is a hack.
2578                  */
2579                 if (fi && res->prefixlen < 4)
2580                         fi = NULL;
2581         }
2582
2583         rth = rt_dst_alloc(dev_out,
2584                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2585                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2586         if (!rth)
2587                 return ERR_PTR(-ENOBUFS);
2588
2589         rth->dst.output = ip_output;
2590
2591         rth->rt_key_dst = orig_daddr;
2592         rth->rt_key_src = orig_saddr;
2593         rth->rt_genid = rt_genid(dev_net(dev_out));
2594         rth->rt_flags   = flags;
2595         rth->rt_type    = type;
2596         rth->rt_key_tos = orig_rtos;
2597         rth->rt_dst     = fl4->daddr;
2598         rth->rt_src     = fl4->saddr;
2599         rth->rt_route_iif = 0;
2600         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2601         rth->rt_oif     = orig_oif;
2602         rth->rt_mark    = fl4->flowi4_mark;
2603         rth->rt_gateway = fl4->daddr;
2604         rth->rt_spec_dst= fl4->saddr;
2605         rth->rt_peer_genid = 0;
2606         rth->peer = NULL;
2607         rth->fi = NULL;
2608
2609         RT_CACHE_STAT_INC(out_slow_tot);
2610
2611         if (flags & RTCF_LOCAL) {
2612                 rth->dst.input = ip_local_deliver;
2613                 rth->rt_spec_dst = fl4->daddr;
2614         }
2615         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2616                 rth->rt_spec_dst = fl4->saddr;
2617                 if (flags & RTCF_LOCAL &&
2618                     !(dev_out->flags & IFF_LOOPBACK)) {
2619                         rth->dst.output = ip_mc_output;
2620                         RT_CACHE_STAT_INC(out_slow_mc);
2621                 }
2622 #ifdef CONFIG_IP_MROUTE
2623                 if (type == RTN_MULTICAST) {
2624                         if (IN_DEV_MFORWARD(in_dev) &&
2625                             !ipv4_is_local_multicast(fl4->daddr)) {
2626                                 rth->dst.input = ip_mr_input;
2627                                 rth->dst.output = ip_mc_output;
2628                         }
2629                 }
2630 #endif
2631         }
2632
2633         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2634
2635         return rth;
2636 }
2637
2638 /*
2639  * Major route resolver routine.
2640  * called with rcu_read_lock();
2641  */
2642
2643 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2644 {
2645         struct net_device *dev_out = NULL;
2646         __u8 tos = RT_FL_TOS(fl4);
2647         unsigned int flags = 0;
2648         struct fib_result res;
2649         struct rtable *rth;
2650         __be32 orig_daddr;
2651         __be32 orig_saddr;
2652         int orig_oif;
2653
2654         res.fi          = NULL;
2655 #ifdef CONFIG_IP_MULTIPLE_TABLES
2656         res.r           = NULL;
2657 #endif
2658
2659         orig_daddr = fl4->daddr;
2660         orig_saddr = fl4->saddr;
2661         orig_oif = fl4->flowi4_oif;
2662
2663         fl4->flowi4_iif = net->loopback_dev->ifindex;
2664         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2665         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2666                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2667
2668         rcu_read_lock();
2669         if (fl4->saddr) {
2670                 rth = ERR_PTR(-EINVAL);
2671                 if (ipv4_is_multicast(fl4->saddr) ||
2672                     ipv4_is_lbcast(fl4->saddr) ||
2673                     ipv4_is_zeronet(fl4->saddr))
2674                         goto out;
2675
2676                 /* I removed the check for oif == dev_out->oif here.
2677                    It was wrong for two reasons:
2678                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2679                       is assigned to multiple interfaces.
2680                    2. Moreover, we are allowed to send packets with the saddr
2681                       of another iface. --ANK
2682                  */
2683
2684                 if (fl4->flowi4_oif == 0 &&
2685                     (ipv4_is_multicast(fl4->daddr) ||
2686                      ipv4_is_lbcast(fl4->daddr))) {
2687                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2688                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2689                         if (dev_out == NULL)
2690                                 goto out;
2691
2692                         /* Special hack: the user can direct multicasts
2693                            and limited broadcast via the necessary interface
2694                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2695                            This hack is not just for fun, it allows
2696                            vic, vat and friends to work.
2697                            They bind the socket to loopback, set the ttl to zero
2698                            and expect that it will work.
2699                            From the viewpoint of the routing cache they are broken,
2700                            because we are not allowed to build a multicast path
2701                            with a loopback source addr (the routing cache
2702                            cannot know that the ttl is zero, so the packet
2703                            will not leave this host and the route is valid).
2704                            Luckily, this hack is a good workaround.
2705                          */
2706
2707                         fl4->flowi4_oif = dev_out->ifindex;
2708                         goto make_route;
2709                 }
2710
2711                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2712                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2713                         if (!__ip_dev_find(net, fl4->saddr, false))
2714                                 goto out;
2715                 }
2716         }
2717
2718
2719         if (fl4->flowi4_oif) {
2720                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2721                 rth = ERR_PTR(-ENODEV);
2722                 if (dev_out == NULL)
2723                         goto out;
2724
2725                 /* RACE: Check return value of inet_select_addr instead. */
2726                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2727                         rth = ERR_PTR(-ENETUNREACH);
2728                         goto out;
2729                 }
2730                 if (ipv4_is_local_multicast(fl4->daddr) ||
2731                     ipv4_is_lbcast(fl4->daddr)) {
2732                         if (!fl4->saddr)
2733                                 fl4->saddr = inet_select_addr(dev_out, 0,
2734                                                               RT_SCOPE_LINK);
2735                         goto make_route;
2736                 }
2737                 if (!fl4->saddr) {
2738                         if (ipv4_is_multicast(fl4->daddr))
2739                                 fl4->saddr = inet_select_addr(dev_out, 0,
2740                                                               fl4->flowi4_scope);
2741                         else if (!fl4->daddr)
2742                                 fl4->saddr = inet_select_addr(dev_out, 0,
2743                                                               RT_SCOPE_HOST);
2744                 }
2745         }
2746
2747         if (!fl4->daddr) {
2748                 fl4->daddr = fl4->saddr;
2749                 if (!fl4->daddr)
2750                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2751                 dev_out = net->loopback_dev;
2752                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2753                 res.type = RTN_LOCAL;
2754                 flags |= RTCF_LOCAL;
2755                 goto make_route;
2756         }
2757
2758         if (fib_lookup(net, fl4, &res)) {
2759                 res.fi = NULL;
2760                 if (fl4->flowi4_oif) {
2761                         /* Apparently, the routing tables are wrong. Assume
2762                            that the destination is on link.
2763
2764                            WHY? DW.
2765                            Because we are allowed to send to an iface
2766                            even if it has NO routes and NO assigned
2767                            addresses. When oif is specified, the routing
2768                            tables are looked up for only one purpose:
2769                            to check whether the destination is gatewayed
2770                            rather than direct. Moreover, if MSG_DONTROUTE is
2771                            set, we send the packet, ignoring both the routing
2772                            tables and ifaddr state. --ANK
2773
2774
2775                            We could do this even when oif is unknown
2776                            (IPv6 likely does), but we do not.
2777                          */
2778
2779                         if (fl4->saddr == 0)
2780                                 fl4->saddr = inet_select_addr(dev_out, 0,
2781                                                               RT_SCOPE_LINK);
2782                         res.type = RTN_UNICAST;
2783                         goto make_route;
2784                 }
2785                 rth = ERR_PTR(-ENETUNREACH);
2786                 goto out;
2787         }
2788
2789         if (res.type == RTN_LOCAL) {
2790                 if (!fl4->saddr) {
2791                         if (res.fi->fib_prefsrc)
2792                                 fl4->saddr = res.fi->fib_prefsrc;
2793                         else
2794                                 fl4->saddr = fl4->daddr;
2795                 }
2796                 dev_out = net->loopback_dev;
2797                 fl4->flowi4_oif = dev_out->ifindex;
2798                 res.fi = NULL;
2799                 flags |= RTCF_LOCAL;
2800                 goto make_route;
2801         }
2802
2803 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2804         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2805                 fib_select_multipath(&res);
2806         else
2807 #endif
2808         if (!res.prefixlen &&
2809             res.table->tb_num_default > 1 &&
2810             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2811                 fib_select_default(&res);
2812
2813         if (!fl4->saddr)
2814                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2815
2816         dev_out = FIB_RES_DEV(res);
2817         fl4->flowi4_oif = dev_out->ifindex;
2818
2819
2820 make_route:
2821         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2822                                tos, dev_out, flags);
2823         if (!IS_ERR(rth)) {
2824                 unsigned int hash;
2825
2826                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2827                                rt_genid(dev_net(dev_out)));
2828                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2829         }
2830
2831 out:
2832         rcu_read_unlock();
2833         return rth;
2834 }
2835
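/* Output route lookup fast path: probe the route cache first and fall back
 * to ip_route_output_slow() on a miss or when caching is disabled.
 */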
2836 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2837 {
2838         struct rtable *rth;
2839         unsigned int hash;
2840
2841         if (!rt_caching(net))
2842                 goto slow_output;
2843
2844         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2845
2846         rcu_read_lock_bh();
2847         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2848                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2849                 if (rth->rt_key_dst == flp4->daddr &&
2850                     rth->rt_key_src == flp4->saddr &&
2851                     rt_is_output_route(rth) &&
2852                     rth->rt_oif == flp4->flowi4_oif &&
2853                     rth->rt_mark == flp4->flowi4_mark &&
2854                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2855                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2856                     net_eq(dev_net(rth->dst.dev), net) &&
2857                     !rt_is_expired(rth)) {
2858                         ipv4_validate_peer(rth);
2859                         dst_use(&rth->dst, jiffies);
2860                         RT_CACHE_STAT_INC(out_hit);
2861                         rcu_read_unlock_bh();
2862                         if (!flp4->saddr)
2863                                 flp4->saddr = rth->rt_src;
2864                         if (!flp4->daddr)
2865                                 flp4->daddr = rth->rt_dst;
2866                         return rth;
2867                 }
2868                 RT_CACHE_STAT_INC(out_hlist_search);
2869         }
2870         rcu_read_unlock_bh();
2871
2872 slow_output:
2873         return ip_route_output_slow(net, flp4);
2874 }
2875 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2876
2877 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2878 {
2879         return NULL;
2880 }
2881
2882 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2883 {
2884         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2885
2886         return mtu ? : dst->dev->mtu;
2887 }
2888
2889 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2890 {
2891 }
2892
2893 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2894                                           unsigned long old)
2895 {
2896         return NULL;
2897 }
2898
2899 static struct dst_ops ipv4_dst_blackhole_ops = {
2900         .family                 =       AF_INET,
2901         .protocol               =       cpu_to_be16(ETH_P_IP),
2902         .destroy                =       ipv4_dst_destroy,
2903         .check                  =       ipv4_blackhole_dst_check,
2904         .mtu                    =       ipv4_blackhole_mtu,
2905         .default_advmss         =       ipv4_default_advmss,
2906         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2907         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2908         .neigh_lookup           =       ipv4_neigh_lookup,
2909 };
2910
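/* Clone @dst_orig into an uncached "blackhole" route whose input and output
 * handlers simply discard packets; the reference on @dst_orig is released
 * before returning.
 */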
2911 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2912 {
2913         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2914         struct rtable *ort = (struct rtable *) dst_orig;
2915
2916         if (rt) {
2917                 struct dst_entry *new = &rt->dst;
2918
2919                 new->__use = 1;
2920                 new->input = dst_discard;
2921                 new->output = dst_discard;
2922                 dst_copy_metrics(new, &ort->dst);
2923
2924                 new->dev = ort->dst.dev;
2925                 if (new->dev)
2926                         dev_hold(new->dev);
2927
2928                 rt->rt_key_dst = ort->rt_key_dst;
2929                 rt->rt_key_src = ort->rt_key_src;
2930                 rt->rt_key_tos = ort->rt_key_tos;
2931                 rt->rt_route_iif = ort->rt_route_iif;
2932                 rt->rt_iif = ort->rt_iif;
2933                 rt->rt_oif = ort->rt_oif;
2934                 rt->rt_mark = ort->rt_mark;
2935
2936                 rt->rt_genid = rt_genid(net);
2937                 rt->rt_flags = ort->rt_flags;
2938                 rt->rt_type = ort->rt_type;
2939                 rt->rt_dst = ort->rt_dst;
2940                 rt->rt_src = ort->rt_src;
2941                 rt->rt_gateway = ort->rt_gateway;
2942                 rt->rt_spec_dst = ort->rt_spec_dst;
2943                 rt->peer = ort->peer;
2944                 if (rt->peer)
2945                         atomic_inc(&rt->peer->refcnt);
2946                 rt->fi = ort->fi;
2947                 if (rt->fi)
2948                         atomic_inc(&rt->fi->fib_clntref);
2949
2950                 dst_free(new);
2951         }
2952
2953         dst_release(dst_orig);
2954
2955         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2956 }
2957
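/*
 * Resolve an output route for @flp4 and, if a transport protocol is set,
 * pass the result through xfrm_lookup() so that a matching IPsec policy
 * can transform it.
 */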
2958 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2959                                     struct sock *sk)
2960 {
2961         struct rtable *rt = __ip_route_output_key(net, flp4);
2962
2963         if (IS_ERR(rt))
2964                 return rt;
2965
2966         if (flp4->flowi4_proto)
2967                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2968                                                    flowi4_to_flowi(flp4),
2969                                                    sk, 0);
2970
2971         return rt;
2972 }
2973 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2974
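/*
 * Build a routing netlink message describing the route attached to @skb:
 * destination, source, oif, gateway, mark, metrics and cache info.
 * Multicast input routes are resolved via ipmr_get_route().  Returns the
 * value of nlmsg_end() on success or -EMSGSIZE if the skb ran out of room.
 */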
2975 static int rt_fill_info(struct net *net,
2976                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2977                         int nowait, unsigned int flags)
2978 {
2979         struct rtable *rt = skb_rtable(skb);
2980         struct rtmsg *r;
2981         struct nlmsghdr *nlh;
2982         unsigned long expires = 0;
2983         const struct inet_peer *peer = rt->peer;
2984         u32 id = 0, ts = 0, tsage = 0, error;
2985
2986         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2987         if (nlh == NULL)
2988                 return -EMSGSIZE;
2989
2990         r = nlmsg_data(nlh);
2991         r->rtm_family    = AF_INET;
2992         r->rtm_dst_len  = 32;
2993         r->rtm_src_len  = 0;
2994         r->rtm_tos      = rt->rt_key_tos;
2995         r->rtm_table    = RT_TABLE_MAIN;
2996         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2997         r->rtm_type     = rt->rt_type;
2998         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2999         r->rtm_protocol = RTPROT_UNSPEC;
3000         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
3001         if (rt->rt_flags & RTCF_NOTIFY)
3002                 r->rtm_flags |= RTM_F_NOTIFY;
3003
3004         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
3005
3006         if (rt->rt_key_src) {
3007                 r->rtm_src_len = 32;
3008                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
3009         }
3010         if (rt->dst.dev)
3011                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
3012 #ifdef CONFIG_IP_ROUTE_CLASSID
3013         if (rt->dst.tclassid)
3014                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3015 #endif
3016         if (rt_is_input_route(rt))
3017                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3018         else if (rt->rt_src != rt->rt_key_src)
3019                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3020
3021         if (rt->rt_dst != rt->rt_gateway)
3022                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3023
3024         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3025                 goto nla_put_failure;
3026
3027         if (rt->rt_mark)
3028                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3029
3030         error = rt->dst.error;
3031         if (peer) {
3032                 inet_peer_refcheck(rt->peer);
3033                 if (peer->tcp_ts_stamp) {
3034                         ts = peer->tcp_ts;
3035                         tsage = get_seconds() - peer->tcp_ts_stamp;
3036                 }
3037                 expires = ACCESS_ONCE(peer->pmtu_expires);
3038                 if (expires) {
3039                         if (time_before(jiffies, expires))
3040                                 expires -= jiffies;
3041                         else
3042                                 expires = 0;
3043                 }
3044         }
3045
3046         if (rt_is_input_route(rt)) {
3047 #ifdef CONFIG_IP_MROUTE
3048                 __be32 dst = rt->rt_dst;
3049
3050                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3051                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3052                         int err = ipmr_get_route(net, skb,
3053                                                  rt->rt_src, rt->rt_dst,
3054                                                  r, nowait);
3055                         if (err <= 0) {
3056                                 if (!nowait) {
3057                                         if (err == 0)
3058                                                 return 0;
3059                                         goto nla_put_failure;
3060                                 } else {
3061                                         if (err == -EMSGSIZE)
3062                                                 goto nla_put_failure;
3063                                         error = err;
3064                                 }
3065                         }
3066                 } else
3067 #endif
3068                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3069         }
3070
3071         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3072                                expires, error) < 0)
3073                 goto nla_put_failure;
3074
3075         return nlmsg_end(skb, nlh);
3076
3077 nla_put_failure:
3078         nlmsg_cancel(skb, nlh);
3079         return -EMSGSIZE;
3080 }
3081
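/*
 * RTM_GETROUTE handler: parse the request, build a dummy skb and resolve
 * the route with ip_route_input() when an input interface is given or
 * ip_route_output_key() otherwise, then unicast the resulting RTM_NEWROUTE
 * message back to the requester.
 */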
3082 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3083 {
3084         struct net *net = sock_net(in_skb->sk);
3085         struct rtmsg *rtm;
3086         struct nlattr *tb[RTA_MAX+1];
3087         struct rtable *rt = NULL;
3088         __be32 dst = 0;
3089         __be32 src = 0;
3090         u32 iif;
3091         int err;
3092         int mark;
3093         struct sk_buff *skb;
3094
3095         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3096         if (err < 0)
3097                 goto errout;
3098
3099         rtm = nlmsg_data(nlh);
3100
3101         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3102         if (skb == NULL) {
3103                 err = -ENOBUFS;
3104                 goto errout;
3105         }
3106
3107         /* Reserve room for dummy headers; this skb can pass
3108          * through a good chunk of the routing engine.
3109          */
3110         skb_reset_mac_header(skb);
3111         skb_reset_network_header(skb);
3112
3113         /* Bugfix: give ip_route_input() enough of an IP header so it does not choke. */
3114         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3115         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3116
3117         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3118         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3119         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3120         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3121
3122         if (iif) {
3123                 struct net_device *dev;
3124
3125                 dev = __dev_get_by_index(net, iif);
3126                 if (dev == NULL) {
3127                         err = -ENODEV;
3128                         goto errout_free;
3129                 }
3130
3131                 skb->protocol   = htons(ETH_P_IP);
3132                 skb->dev        = dev;
3133                 skb->mark       = mark;
3134                 local_bh_disable();
3135                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3136                 local_bh_enable();
3137
3138                 rt = skb_rtable(skb);
3139                 if (err == 0 && rt->dst.error)
3140                         err = -rt->dst.error;
3141         } else {
3142                 struct flowi4 fl4 = {
3143                         .daddr = dst,
3144                         .saddr = src,
3145                         .flowi4_tos = rtm->rtm_tos,
3146                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3147                         .flowi4_mark = mark,
3148                 };
3149                 rt = ip_route_output_key(net, &fl4);
3150
3151                 err = 0;
3152                 if (IS_ERR(rt))
3153                         err = PTR_ERR(rt);
3154         }
3155
3156         if (err)
3157                 goto errout_free;
3158
3159         skb_dst_set(skb, &rt->dst);
3160         if (rtm->rtm_flags & RTM_F_NOTIFY)
3161                 rt->rt_flags |= RTCF_NOTIFY;
3162
3163         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3164                            RTM_NEWROUTE, 0, 0);
3165         if (err <= 0)
3166                 goto errout_free;
3167
3168         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3169 errout:
3170         return err;
3171
3172 errout_free:
3173         kfree_skb(skb);
3174         goto errout;
3175 }
3176
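/*
 * Netlink dump callback: walk every hash chain of the route cache and emit
 * one RTM_NEWROUTE message per live entry, resuming from the bucket and
 * index saved in cb->args[].
 */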
3177 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3178 {
3179         struct rtable *rt;
3180         int h, s_h;
3181         int idx, s_idx;
3182         struct net *net;
3183
3184         net = sock_net(skb->sk);
3185
3186         s_h = cb->args[0];
3187         if (s_h < 0)
3188                 s_h = 0;
3189         s_idx = idx = cb->args[1];
3190         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3191                 if (!rt_hash_table[h].chain)
3192                         continue;
3193                 rcu_read_lock_bh();
3194                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3195                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3196                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3197                                 continue;
3198                         if (rt_is_expired(rt))
3199                                 continue;
3200                         skb_dst_set_noref(skb, &rt->dst);
3201                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3202                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3203                                          1, NLM_F_MULTI) <= 0) {
3204                                 skb_dst_drop(skb);
3205                                 rcu_read_unlock_bh();
3206                                 goto done;
3207                         }
3208                         skb_dst_drop(skb);
3209                 }
3210                 rcu_read_unlock_bh();
3211         }
3212
3213 done:
3214         cb->args[0] = h;
3215         cb->args[1] = idx;
3216         return skb->len;
3217 }
3218
3219 void ip_rt_multicast_event(struct in_device *in_dev)
3220 {
3221         rt_cache_flush(dev_net(in_dev->dev), 0);
3222 }
3223
3224 #ifdef CONFIG_SYSCTL
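/*
 * Handler for /proc/sys/net/ipv4/route/flush: a write flushes the
 * per-namespace route cache, passing the written integer to
 * rt_cache_flush() as the delay; reads are rejected with -EINVAL.
 */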
3225 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3226                                         void __user *buffer,
3227                                         size_t *lenp, loff_t *ppos)
3228 {
3229         if (write) {
3230                 int flush_delay;
3231                 ctl_table ctl;
3232                 struct net *net;
3233
3234                 memcpy(&ctl, __ctl, sizeof(ctl));
3235                 ctl.data = &flush_delay;
3236                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3237
3238                 net = (struct net *)__ctl->extra1;
3239                 rt_cache_flush(net, flush_delay);
3240                 return 0;
3241         }
3242
3243         return -EINVAL;
3244 }
3245
3246 static ctl_table ipv4_route_table[] = {
3247         {
3248                 .procname       = "gc_thresh",
3249                 .data           = &ipv4_dst_ops.gc_thresh,
3250                 .maxlen         = sizeof(int),
3251                 .mode           = 0644,
3252                 .proc_handler   = proc_dointvec,
3253         },
3254         {
3255                 .procname       = "max_size",
3256                 .data           = &ip_rt_max_size,
3257                 .maxlen         = sizeof(int),
3258                 .mode           = 0644,
3259                 .proc_handler   = proc_dointvec,
3260         },
3261         {
3262                 /*  Deprecated. Use gc_min_interval_ms */
3263
3264                 .procname       = "gc_min_interval",
3265                 .data           = &ip_rt_gc_min_interval,
3266                 .maxlen         = sizeof(int),
3267                 .mode           = 0644,
3268                 .proc_handler   = proc_dointvec_jiffies,
3269         },
3270         {
3271                 .procname       = "gc_min_interval_ms",
3272                 .data           = &ip_rt_gc_min_interval,
3273                 .maxlen         = sizeof(int),
3274                 .mode           = 0644,
3275                 .proc_handler   = proc_dointvec_ms_jiffies,
3276         },
3277         {
3278                 .procname       = "gc_timeout",
3279                 .data           = &ip_rt_gc_timeout,
3280                 .maxlen         = sizeof(int),
3281                 .mode           = 0644,
3282                 .proc_handler   = proc_dointvec_jiffies,
3283         },
3284         {
3285                 .procname       = "gc_interval",
3286                 .data           = &ip_rt_gc_interval,
3287                 .maxlen         = sizeof(int),
3288                 .mode           = 0644,
3289                 .proc_handler   = proc_dointvec_jiffies,
3290         },
3291         {
3292                 .procname       = "redirect_load",
3293                 .data           = &ip_rt_redirect_load,
3294                 .maxlen         = sizeof(int),
3295                 .mode           = 0644,
3296                 .proc_handler   = proc_dointvec,
3297         },
3298         {
3299                 .procname       = "redirect_number",
3300                 .data           = &ip_rt_redirect_number,
3301                 .maxlen         = sizeof(int),
3302                 .mode           = 0644,
3303                 .proc_handler   = proc_dointvec,
3304         },
3305         {
3306                 .procname       = "redirect_silence",
3307                 .data           = &ip_rt_redirect_silence,
3308                 .maxlen         = sizeof(int),
3309                 .mode           = 0644,
3310                 .proc_handler   = proc_dointvec,
3311         },
3312         {
3313                 .procname       = "error_cost",
3314                 .data           = &ip_rt_error_cost,
3315                 .maxlen         = sizeof(int),
3316                 .mode           = 0644,
3317                 .proc_handler   = proc_dointvec,
3318         },
3319         {
3320                 .procname       = "error_burst",
3321                 .data           = &ip_rt_error_burst,
3322                 .maxlen         = sizeof(int),
3323                 .mode           = 0644,
3324                 .proc_handler   = proc_dointvec,
3325         },
3326         {
3327                 .procname       = "gc_elasticity",
3328                 .data           = &ip_rt_gc_elasticity,
3329                 .maxlen         = sizeof(int),
3330                 .mode           = 0644,
3331                 .proc_handler   = proc_dointvec,
3332         },
3333         {
3334                 .procname       = "mtu_expires",
3335                 .data           = &ip_rt_mtu_expires,
3336                 .maxlen         = sizeof(int),
3337                 .mode           = 0644,
3338                 .proc_handler   = proc_dointvec_jiffies,
3339         },
3340         {
3341                 .procname       = "min_pmtu",
3342                 .data           = &ip_rt_min_pmtu,
3343                 .maxlen         = sizeof(int),
3344                 .mode           = 0644,
3345                 .proc_handler   = proc_dointvec,
3346         },
3347         {
3348                 .procname       = "min_adv_mss",
3349                 .data           = &ip_rt_min_advmss,
3350                 .maxlen         = sizeof(int),
3351                 .mode           = 0644,
3352                 .proc_handler   = proc_dointvec,
3353         },
3354         { }
3355 };
3356
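/*
 * Skeleton tables registered early from ip_static_sysctl_init() (see the
 * end of this file) so that the net.ipv4.route and net.ipv4.neigh sysctl
 * directories exist independently of the rest of the ipv4 init order.
 */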
3357 static struct ctl_table empty[1];
3358
3359 static struct ctl_table ipv4_skeleton[] =
3360 {
3361         { .procname = "route",
3362           .mode = 0555, .child = ipv4_route_table},
3363         { .procname = "neigh",
3364           .mode = 0555, .child = empty},
3365         { }
3366 };
3367
3368 static __net_initdata struct ctl_path ipv4_path[] = {
3369         { .procname = "net", },
3370         { .procname = "ipv4", },
3371         { },
3372 };
3373
3374 static struct ctl_table ipv4_route_flush_table[] = {
3375         {
3376                 .procname       = "flush",
3377                 .maxlen         = sizeof(int),
3378                 .mode           = 0200,
3379                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3380         },
3381         { },
3382 };
3383
3384 static __net_initdata struct ctl_path ipv4_route_path[] = {
3385         { .procname = "net", },
3386         { .procname = "ipv4", },
3387         { .procname = "route", },
3388         { },
3389 };
3390
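/*
 * Per-namespace sysctl setup: namespaces other than init_net get their own
 * copy of ipv4_route_flush_table so that extra1 can point at the right
 * struct net before net.ipv4.route is registered.
 */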
3391 static __net_init int sysctl_route_net_init(struct net *net)
3392 {
3393         struct ctl_table *tbl;
3394
3395         tbl = ipv4_route_flush_table;
3396         if (!net_eq(net, &init_net)) {
3397                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3398                 if (tbl == NULL)
3399                         goto err_dup;
3400         }
3401         tbl[0].extra1 = net;
3402
3403         net->ipv4.route_hdr =
3404                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3405         if (net->ipv4.route_hdr == NULL)
3406                 goto err_reg;
3407         return 0;
3408
3409 err_reg:
3410         if (tbl != ipv4_route_flush_table)
3411                 kfree(tbl);
3412 err_dup:
3413         return -ENOMEM;
3414 }
3415
3416 static __net_exit void sysctl_route_net_exit(struct net *net)
3417 {
3418         struct ctl_table *tbl;
3419
3420         tbl = net->ipv4.route_hdr->ctl_table_arg;
3421         unregister_net_sysctl_table(net->ipv4.route_hdr);
3422         BUG_ON(tbl == ipv4_route_flush_table);
3423         kfree(tbl);
3424 }
3425
3426 static __net_initdata struct pernet_operations sysctl_route_ops = {
3427         .init = sysctl_route_net_init,
3428         .exit = sysctl_route_net_exit,
3429 };
3430 #endif
3431
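/*
 * Give each new namespace random initial values for the route cache
 * generation id and the device address generation id; cached entries whose
 * genid no longer matches are treated as expired (see rt_is_expired()).
 */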
3432 static __net_init int rt_genid_init(struct net *net)
3433 {
3434         get_random_bytes(&net->ipv4.rt_genid,
3435                          sizeof(net->ipv4.rt_genid));
3436         get_random_bytes(&net->ipv4.dev_addr_genid,
3437                          sizeof(net->ipv4.dev_addr_genid));
3438         return 0;
3439 }
3440
3441 static __net_initdata struct pernet_operations rt_genid_ops = {
3442         .init = rt_genid_init,
3443 };
3444
3445
3446 #ifdef CONFIG_IP_ROUTE_CLASSID
3447 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3448 #endif /* CONFIG_IP_ROUTE_CLASSID */
3449
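/* "rhash_entries=N" boot parameter: override the automatically sized
 * number of route cache hash buckets allocated in ip_rt_init().
 */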
3450 static __initdata unsigned long rhash_entries;
3451 static int __init set_rhash_entries(char *str)
3452 {
3453         if (!str)
3454                 return 0;
3455         rhash_entries = simple_strtoul(str, &str, 0);
3456         return 1;
3457 }
3458 __setup("rhash_entries=", set_rhash_entries);
3459
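/*
 * Boot-time initialisation: allocate the ip_idents array and the route
 * cache hash table, derive the GC thresholds from the table size, start
 * the periodic expiry work and register the proc, netlink and sysctl
 * hooks.
 */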
3460 int __init ip_rt_init(void)
3461 {
3462         int rc = 0;
3463
3464         ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3465         if (!ip_idents)
3466                 panic("IP: failed to allocate ip_idents\n");
3467
3468         get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3469
3470 #ifdef CONFIG_IP_ROUTE_CLASSID
3471         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3472         if (!ip_rt_acct)
3473                 panic("IP: failed to allocate ip_rt_acct\n");
3474 #endif
3475
3476         ipv4_dst_ops.kmem_cachep =
3477                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3478                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3479
3480         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3481
3482         if (dst_entries_init(&ipv4_dst_ops) < 0)
3483                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3484
3485         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3486                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3487
3488         rt_hash_table = (struct rt_hash_bucket *)
3489                 alloc_large_system_hash("IP route cache",
3490                                         sizeof(struct rt_hash_bucket),
3491                                         rhash_entries,
3492                                         (totalram_pages >= 128 * 1024) ?
3493                                         15 : 17,
3494                                         0,
3495                                         &rt_hash_log,
3496                                         &rt_hash_mask,
3497                                         rhash_entries ? 0 : 512 * 1024);
3498         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3499         rt_hash_lock_init();
3500
3501         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3502         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3503
3504         devinet_init();
3505         ip_fib_init();
3506
3507         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3508         expires_ljiffies = jiffies;
3509         schedule_delayed_work(&expires_work,
3510                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3511
3512         if (ip_rt_proc_init())
3513                 printk(KERN_ERR "Unable to create route proc files\n");
3514 #ifdef CONFIG_XFRM
3515         xfrm_init();
3516         xfrm4_init(ip_rt_max_size);
3517 #endif
3518         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3519
3520 #ifdef CONFIG_SYSCTL
3521         register_pernet_subsys(&sysctl_route_ops);
3522 #endif
3523         register_pernet_subsys(&rt_genid_ops);
3524         return rc;
3525 }
3526
3527 #ifdef CONFIG_SYSCTL
3528 /*
3529  * We really need to sanitize the damn ipv4 init order, then all
3530  * this nonsense will go away.
3531  */
3532 void __init ip_static_sysctl_init(void)
3533 {
3534         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3535 }
3536 #endif