/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>

#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/atmclip.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void __rt_garbage_collect(struct work_struct *w);
static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (rt->fi) {
			fib_info_put(rt->fi);
			rt->fi = NULL;
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

EXPORT_SYMBOL(ip_tos2prio);
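/*
 * Illustrative sketch, not part of the original file: the table above is
 * indexed with the four TOS bits, i.e. IPTOS_TOS(tos) >> 1 (the lookup
 * rt_tos2priority() in <net/route.h> performs).  A hypothetical helper,
 * assuming IPTOS_TOS from <linux/ip.h>:
 */
static inline char example_tos2prio(u8 tos)
{
	/* e.g. IPTOS_LOWDELAY (0x10) indexes entry 8, TC_PRIO_INTERACTIVE */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}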
/*
 * The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    possibility of being called from softirq path.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
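/*
 * Illustrative note, not part of the original file: because the table size
 * is a power of two, rt_hash_mask is (size - 1) and the jhash result can be
 * reduced with a mask instead of a modulo, e.g.:
 *
 *	bucket = jhash_3words(daddr, saddr, idx, genid) & rt_hash_mask;
 *
 * Folding the per-netns genid into the hash is also what lets
 * rt_cache_invalidate() retire all old entries at once: rt_is_expired()
 * below simply compares each entry's rt_genid against the current one.
 */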
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}
#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}
static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}
/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}
static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
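/*
 * Illustrative note, not part of the original file: ORing the XORed key
 * fields and testing the result against zero compares all keys with a
 * single branch; it is logically equivalent to
 *
 *	rt1->rt_key_dst == rt2->rt_key_dst &&
 *	rt1->rt_key_src == rt2->rt_key_src && ...
 *
 * but compiles to straight-line code on the hot lookup path.
 */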
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}
/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
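/*
 * Illustrative sketch, not part of the original file: with FRACT_BITS = 3,
 * per-chain lengths are accumulated in units of ONE = 8, and the final
 * shift converts back to whole entries.  A hypothetical helper mirroring
 * the computation done at the end of rt_check_expire():
 */
static inline unsigned long example_chain_limit(unsigned long sum,
						unsigned long sum2,
						unsigned long samples)
{
	unsigned long avg = sum / samples;	/* fixed-point average */
	unsigned long sd  = int_sqrt(sum2 / samples - avg * avg);

	return (avg + 4 * sd) >> FRACT_BITS;	/* AVG + 4*SD, in entries */
}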
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}
/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
	inetpeer_invalidate_tree(AF_INET);
}
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit the cache size.
 */
static void __do_rt_garbage_collect(int elasticity, int min_interval)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	static DEFINE_SPINLOCK(rt_gc_lock);
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	spin_lock_bh(&rt_gc_lock);

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	goto out;

work_done:
	expire += min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:
	spin_unlock_bh(&rt_gc_lock);
}
static void __rt_garbage_collect(struct work_struct *w)
{
	__do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
}

static int rt_garbage_collect(struct dst_ops *ops)
{
	if (!work_pending(&rt_gc_worker))
		schedule_work(&rt_gc_worker);

	if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
	    dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_dst_overflow);
		return 1;
	}
	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}
static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int		attempts = 1;

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (!in_softirq() && attempts-- > 0) {
				static DEFINE_SPINLOCK(lock);

				if (spin_trylock(&lock)) {
					__do_rt_garbage_collect(1, 0);
					spin_unlock(&lock);
				} else {
					spin_unlock_wait(&lock);
				}
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
#define IP_IDENTS_SZ 2048u
struct ip_ident_bucket {
	atomic_t	id;
	u32		stamp32;
};

static struct ip_ident_bucket *ip_idents __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(bucket->stamp32);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
		u64 x = random32();

		x *= (now - old);
		delta = (u32)(x >> 32);
	}

	return atomic_add_return(segs + delta, &bucket->id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
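/*
 * Illustrative sketch, not part of the original file: the multiply-shift
 * above draws delta uniformly from [0, now - old), i.e. it is the classic
 * modulo-free way of bounding a 32-bit random value to a range:
 */
static inline u32 example_bounded_rand(u32 range)
{
	/* maps random32() into [0, range) without a division */
	return (u32)(((u64)random32() * range) >> 32);
}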
void __ip_select_ident(struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	static bool hashrnd_initialized = false;
	u32 hash, id;

	if (unlikely(!hashrnd_initialized)) {
		hashrnd_initialized = true;
		get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
	}

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything,
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}
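/*
 * Illustrative example, not part of the original file: with the defaults
 * above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9), the k-th
 * redirect is gated on (HZ/50) << k jiffies having elapsed since the
 * previous one: 20ms, 40ms, 80ms, ... ~5.1s at HZ=1000.  After nine
 * redirects the host is assumed deaf and nothing more is sent until it has
 * been quiet for ip_rt_redirect_silence = (HZ/50) << 10, about 20s.
 */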
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
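/*
 * Illustrative example, not part of the original file: for a packet with
 * tot_len 1500 whose ICMP "frag needed" message carried no usable next-hop
 * MTU, guess_mtu(1500) returns the next lower plateau, 1492; a retry at
 * 1492 then yields 576, so PMTU discovery converges in a handful of steps.
 */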
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}
static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
   We do not cache the source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}
static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
2480 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2481 u8 tos, struct net_device *dev, bool noref)
2483 struct rtable * rth;
2485 int iif = dev->ifindex;
2493 if (!rt_caching(net))
2496 tos &= IPTOS_RT_MASK;
2497 hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2499 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2500 rth = rcu_dereference(rth->dst.rt_next)) {
2501 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2502 ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2503 (rth->rt_route_iif ^ iif) |
2504 (rth->rt_key_tos ^ tos)) == 0 &&
2505 rth->rt_mark == skb->mark &&
2506 net_eq(dev_net(rth->dst.dev), net) &&
2507 !rt_is_expired(rth)) {
2508 ipv4_validate_peer(rth);
2510 dst_use_noref(&rth->dst, jiffies);
2511 skb_dst_set_noref(skb, &rth->dst);
2513 dst_use(&rth->dst, jiffies);
2514 skb_dst_set(skb, &rth->dst);
2516 RT_CACHE_STAT_INC(in_hit);
2520 RT_CACHE_STAT_INC(in_hlist_search);
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world.  Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
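
/*
 * Illustrative sketch, not part of the original file: how a receive path
 * might resolve the input route for a freshly received skb.  The helper
 * name is hypothetical; real callers normally use the ip_route_input()
 * and ip_route_input_noref() wrappers from <net/route.h>, which only
 * differ in the noref mode they pass down.
 */
static inline int example_route_incoming(struct sk_buff *skb,
					 struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	/* On success skb_dst(skb) is set; a negative errno is returned
	 * on failure.
	 */
	return ip_route_input_common(skb, iph->daddr, iph->saddr,
				     iph->tos, dev, false);
}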
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If the multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);
	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid	= rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway	= fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;
		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}
	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
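
/*
 * Illustrative sketch, not part of the original file: resolving an
 * output route by key only.  The helper name is hypothetical.  Callers
 * that want xfrm policy applied should go through ip_route_output_key()
 * or ip_route_output_flow() rather than call __ip_route_output_key()
 * directly.
 */
static inline struct rtable *example_route_to(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,
	};

	/* Returns a struct rtable * or an ERR_PTR() on failure. */
	return __ip_route_output_key(net, &fl4);
}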
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
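
/*
 * Context note (an assumption drawn from the surrounding tree, not
 * stated in this file): ipv4_blackhole_route() is wired up as the IPv4
 * blackhole_route hook used by xfrm_lookup(), so a caller can be handed
 * a dst that discards everything (dst_discard input/output) while the
 * flow keys of the original route are preserved, e.g. while an IPsec SA
 * is still being negotiated.
 */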
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
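
/*
 * Illustrative sketch, not part of the original file: the typical
 * connect()-style lookup.  The helper name is hypothetical.  A non-zero
 * flowi4_proto makes ip_route_output_flow() pass the result through
 * xfrm_lookup(), so IPsec policy is honoured.
 */
static inline struct rtable *example_connect_route(struct net *net,
						   __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.saddr		= saddr,
		.flowi4_proto	= IPPROTO_UDP,
	};

	return ip_route_output_flow(net, &fl4, NULL);
}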
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
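
/*
 * Note on rt_fill_info() above: the NLA_PUT*() macros expand to a
 * "goto nla_put_failure" when the skb runs out of tailroom, which is
 * why the function funnels every failure into the nlmsg_cancel()
 * unwind and returns -EMSGSIZE.
 */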
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}
		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}
	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
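
/*
 * Usage note, not part of the original file: writing an integer to
 * /proc/sys/net/ipv4/route/flush (mode 0200, so write-only) invokes
 * ipv4_sysctl_rtcache_flush() above, which hands the written value to
 * rt_cache_flush(), e.g.:
 *
 *	sysctl -w net.ipv4.route.flush=1
 */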
static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
int __init ip_rt_init(void)
{
	int rc = 0;

	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif