net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #include <linux/module.h>
  66 #include <asm/uaccess.h>
  67 #include <asm/system.h>
  68 #include <linux/bitops.h>
  69 #include <linux/types.h>
  70 #include <linux/kernel.h>
  71 #include <linux/mm.h>
  72 #include <linux/bootmem.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/workqueue.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/jhash.h>
  91 #include <linux/rcupdate.h>
  92 #include <linux/times.h>
  93 #include <linux/slab.h>
  94 #include <linux/prefetch.h>
  95 #include <net/dst.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #endif
 112 #include <net/secure_seq.h>
 113
 114 #define RT_FL_TOS(oldflp4) \
 115         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 116
 117 #define IP_MAX_MTU      0xFFF0
 118
 119 #define RT_GC_TIMEOUT (300*HZ)
 120
 121 static int ip_rt_max_size;
 122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 123 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 124 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
 125 static int ip_rt_redirect_number __read_mostly  = 9;
 126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 128 static int ip_rt_error_cost __read_mostly       = HZ;
 129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 130 static int ip_rt_gc_elasticity __read_mostly    = 8;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134 static int rt_chain_length_max __read_mostly    = 20;
 135
 136 static struct delayed_work expires_work;
 137 static unsigned long expires_ljiffies;
 138
 139 /*
 140  *      Interface to generic destination cache.
 141  */
 142
 143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 144 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 145 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 146 static void              ipv4_dst_destroy(struct dst_entry *dst);
 147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 148 static void              ipv4_link_failure(struct sk_buff *skb);
 149 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 150 static int rt_garbage_collect(struct dst_ops *ops);
 151
 152 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 153                             int how)
 154 {
 155 }
 156
 157 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 158 {
 159         struct rtable *rt = (struct rtable *) dst;
 160         struct inet_peer *peer;
 161         u32 *p = NULL;
 162
 163         if (!rt->peer)
 164                 rt_bind_peer(rt, rt->rt_dst, 1);
 165
 166         peer = rt->peer;
 167         if (peer) {
 168                 u32 *old_p = __DST_METRICS_PTR(old);
 169                 unsigned long prev, new;
 170
 171                 p = peer->metrics;
 172                 if (inet_metrics_new(peer))
 173                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 174
 175                 new = (unsigned long) p;
 176                 prev = cmpxchg(&dst->_metrics, old, new);
 177
 178                 if (prev != old) {
 179                         p = __DST_METRICS_PTR(prev);
 180                         if (prev & DST_METRICS_READ_ONLY)
 181                                 p = NULL;
 182                 } else {
 183                         if (rt->fi) {
 184                                 fib_info_put(rt->fi);
 185                                 rt->fi = NULL;
 186                         }
 187                 }
 188         }
 189         return p;
 190 }
 191
 192 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 193
 194 static struct dst_ops ipv4_dst_ops = {
 195         .family =               AF_INET,
 196         .protocol =             cpu_to_be16(ETH_P_IP),
 197         .gc =                   rt_garbage_collect,
 198         .check =                ipv4_dst_check,
 199         .default_advmss =       ipv4_default_advmss,
 200         .mtu =                  ipv4_mtu,
 201         .cow_metrics =          ipv4_cow_metrics,
 202         .destroy =              ipv4_dst_destroy,
 203         .ifdown =               ipv4_dst_ifdown,
 204         .negative_advice =      ipv4_negative_advice,
 205         .link_failure =         ipv4_link_failure,
 206         .update_pmtu =          ip_rt_update_pmtu,
 207         .local_out =            __ip_local_out,
 208         .neigh_lookup =         ipv4_neigh_lookup,
 209 };
 210
 211 #define ECN_OR_COST(class)      TC_PRIO_##class
 212
 213 const __u8 ip_tos2prio[16] = {
 214         TC_PRIO_BESTEFFORT,
 215         ECN_OR_COST(BESTEFFORT),
 216         TC_PRIO_BESTEFFORT,
 217         ECN_OR_COST(BESTEFFORT),
 218         TC_PRIO_BULK,
 219         ECN_OR_COST(BULK),
 220         TC_PRIO_BULK,
 221         ECN_OR_COST(BULK),
 222         TC_PRIO_INTERACTIVE,
 223         ECN_OR_COST(INTERACTIVE),
 224         TC_PRIO_INTERACTIVE,
 225         ECN_OR_COST(INTERACTIVE),
 226         TC_PRIO_INTERACTIVE_BULK,
 227         ECN_OR_COST(INTERACTIVE_BULK),
 228         TC_PRIO_INTERACTIVE_BULK,
 229         ECN_OR_COST(INTERACTIVE_BULK)
 230 };
 231
 232
 233 /*
 234  * Route cache.
 235  */
 236
 237 /* The locking scheme is rather straight forward:
 238  *
 239  * 1) Read-Copy Update protects the buckets of the central route hash.
 240  * 2) Only writers remove entries, and they hold the lock
 241  *    as they look at rtable reference counts.
 242  * 3) Only readers acquire references to rtable entries,
 243  *    they do so with atomic increments and with the
 244  *    lock held.
 245  */
 246
 247 struct rt_hash_bucket {
 248         struct rtable __rcu     *chain;
 249 };
 250
 251 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 252         defined(CONFIG_PROVE_LOCKING)
 253 /*
 254  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 255  * The size of this table is a power of two and depends on the number of CPUS.
 256  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 257  */
 258 #ifdef CONFIG_LOCKDEP
 259 # define RT_HASH_LOCK_SZ        256
 260 #else
 261 # if NR_CPUS >= 32
 262 #  define RT_HASH_LOCK_SZ       4096
 263 # elif NR_CPUS >= 16
 264 #  define RT_HASH_LOCK_SZ       2048
 265 # elif NR_CPUS >= 8
 266 #  define RT_HASH_LOCK_SZ       1024
 267 # elif NR_CPUS >= 4
 268 #  define RT_HASH_LOCK_SZ       512
 269 # else
 270 #  define RT_HASH_LOCK_SZ       256
 271 # endif
 272 #endif
 273
 274 static spinlock_t       *rt_hash_locks;
 275 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 276
 277 static __init void rt_hash_lock_init(void)
 278 {
 279         int i;
 280
 281         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 282                         GFP_KERNEL);
 283         if (!rt_hash_locks)
 284                 panic("IP: failed to allocate rt_hash_locks\n");
 285
 286         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 287                 spin_lock_init(&rt_hash_locks[i]);
 288 }
 289 #else
 290 # define rt_hash_lock_addr(slot) NULL
 291
 292 static inline void rt_hash_lock_init(void)
 293 {
 294 }
 295 #endif
 296
 297 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
 298 static unsigned                 rt_hash_mask __read_mostly;
 299 static unsigned int             rt_hash_log  __read_mostly;
 300
 301 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 302 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 303
 304 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 305                                    int genid)
 306 {
 307         return jhash_3words((__force u32)daddr, (__force u32)saddr,
 308                             idx, genid)
 309                 & rt_hash_mask;
 310 }
 311
 312 static inline int rt_genid(struct net *net)
 313 {
 314         return atomic_read(&net->ipv4.rt_genid);
 315 }
 316
 317 #ifdef CONFIG_PROC_FS
 318 struct rt_cache_iter_state {
 319         struct seq_net_private p;
 320         int bucket;
 321         int genid;
 322 };
 323
 324 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 325 {
 326         struct rt_cache_iter_state *st = seq->private;
 327         struct rtable *r = NULL;
 328
 329         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 330                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 331                         continue;
 332                 rcu_read_lock_bh();
 333                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 334                 while (r) {
 335                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 336                             r->rt_genid == st->genid)
 337                                 return r;
 338                         r = rcu_dereference_bh(r->dst.rt_next);
 339                 }
 340                 rcu_read_unlock_bh();
 341         }
 342         return r;
 343 }
 344
 345 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 346                                           struct rtable *r)
 347 {
 348         struct rt_cache_iter_state *st = seq->private;
 349
 350         r = rcu_dereference_bh(r->dst.rt_next);
 351         while (!r) {
 352                 rcu_read_unlock_bh();
 353                 do {
 354                         if (--st->bucket < 0)
 355                                 return NULL;
 356                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 357                 rcu_read_lock_bh();
 358                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 359         }
 360         return r;
 361 }
 362
 363 static struct rtable *rt_cache_get_next(struct seq_file *seq,
 364                                         struct rtable *r)
 365 {
 366         struct rt_cache_iter_state *st = seq->private;
 367         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 368                 if (dev_net(r->dst.dev) != seq_file_net(seq))
 369                         continue;
 370                 if (r->rt_genid == st->genid)
 371                         break;
 372         }
 373         return r;
 374 }
 375
 376 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 377 {
 378         struct rtable *r = rt_cache_get_first(seq);
 379
 380         if (r)
 381                 while (pos && (r = rt_cache_get_next(seq, r)))
 382                         --pos;
 383         return pos ? NULL : r;
 384 }
 385
 386 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 387 {
 388         struct rt_cache_iter_state *st = seq->private;
 389         if (*pos)
 390                 return rt_cache_get_idx(seq, *pos - 1);
 391         st->genid = rt_genid(seq_file_net(seq));
 392         return SEQ_START_TOKEN;
 393 }
 394
 395 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 396 {
 397         struct rtable *r;
 398
 399         if (v == SEQ_START_TOKEN)
 400                 r = rt_cache_get_first(seq);
 401         else
 402                 r = rt_cache_get_next(seq, v);
 403         ++*pos;
 404         return r;
 405 }
 406
 407 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 408 {
 409         if (v && v != SEQ_START_TOKEN)
 410                 rcu_read_unlock_bh();
 411 }
 412
 413 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 414 {
 415         if (v == SEQ_START_TOKEN)
 416                 seq_printf(seq, "%-127s\n",
 417                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 418                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 419                            "HHUptod\tSpecDst");
 420         else {
 421                 struct rtable *r = v;
 422                 struct neighbour *n;
 423                 int len, HHUptod;
 424
 425                 rcu_read_lock();
 426                 n = dst_get_neighbour_noref(&r->dst);
 427                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 428                 rcu_read_unlock();
 429
 430                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 431                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 432                         r->dst.dev ? r->dst.dev->name : "*",
 433                         (__force u32)r->rt_dst,
 434                         (__force u32)r->rt_gateway,
 435                         r->rt_flags, atomic_read(&r->dst.__refcnt),
 436                         r->dst.__use, 0, (__force u32)r->rt_src,
 437                         dst_metric_advmss(&r->dst) + 40,
 438                         dst_metric(&r->dst, RTAX_WINDOW),
 439                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 440                               dst_metric(&r->dst, RTAX_RTTVAR)),
 441                         r->rt_key_tos,
 442                         -1,
 443                         HHUptod,
 444                         r->rt_spec_dst, &len);
 445
 446                 seq_printf(seq, "%*s\n", 127 - len, "");
 447         }
 448         return 0;
 449 }
 450
 451 static const struct seq_operations rt_cache_seq_ops = {
 452         .start  = rt_cache_seq_start,
 453         .next   = rt_cache_seq_next,
 454         .stop   = rt_cache_seq_stop,
 455         .show   = rt_cache_seq_show,
 456 };
 457
 458 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 459 {
 460         return seq_open_net(inode, file, &rt_cache_seq_ops,
 461                         sizeof(struct rt_cache_iter_state));
 462 }
 463
 464 static const struct file_operations rt_cache_seq_fops = {
 465         .owner   = THIS_MODULE,
 466         .open    = rt_cache_seq_open,
 467         .read    = seq_read,
 468         .llseek  = seq_lseek,
 469         .release = seq_release_net,
 470 };
 471
 472
 473 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 474 {
 475         int cpu;
 476
 477         if (*pos == 0)
 478                 return SEQ_START_TOKEN;
 479
 480         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 481                 if (!cpu_possible(cpu))
 482                         continue;
 483                 *pos = cpu+1;
 484                 return &per_cpu(rt_cache_stat, cpu);
 485         }
 486         return NULL;
 487 }
 488
 489 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 490 {
 491         int cpu;
 492
 493         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 494                 if (!cpu_possible(cpu))
 495                         continue;
 496                 *pos = cpu+1;
 497                 return &per_cpu(rt_cache_stat, cpu);
 498         }
 499         return NULL;
 500
 501 }
 502
 503 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 504 {
 505
 506 }
 507
 508 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 509 {
 510         struct rt_cache_stat *st = v;
 511
 512         if (v == SEQ_START_TOKEN) {
 513                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 514                 return 0;
 515         }
 516
 517         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 518                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 519                    dst_entries_get_slow(&ipv4_dst_ops),
 520                    st->in_hit,
 521                    st->in_slow_tot,
 522                    st->in_slow_mc,
 523                    st->in_no_route,
 524                    st->in_brd,
 525                    st->in_martian_dst,
 526                    st->in_martian_src,
 527
 528                    st->out_hit,
 529                    st->out_slow_tot,
 530                    st->out_slow_mc,
 531
 532                    st->gc_total,
 533                    st->gc_ignored,
 534                    st->gc_goal_miss,
 535                    st->gc_dst_overflow,
 536                    st->in_hlist_search,
 537                    st->out_hlist_search
 538                 );
 539         return 0;
 540 }
 541
 542 static const struct seq_operations rt_cpu_seq_ops = {
 543         .start  = rt_cpu_seq_start,
 544         .next   = rt_cpu_seq_next,
 545         .stop   = rt_cpu_seq_stop,
 546         .show   = rt_cpu_seq_show,
 547 };
 548
 549
 550 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 551 {
 552         return seq_open(file, &rt_cpu_seq_ops);
 553 }
 554
 555 static const struct file_operations rt_cpu_seq_fops = {
 556         .owner   = THIS_MODULE,
 557         .open    = rt_cpu_seq_open,
 558         .read    = seq_read,
 559         .llseek  = seq_lseek,
 560         .release = seq_release,
 561 };
 562
 563 #ifdef CONFIG_IP_ROUTE_CLASSID
 564 static int rt_acct_proc_show(struct seq_file *m, void *v)
 565 {
 566         struct ip_rt_acct *dst, *src;
 567         unsigned int i, j;
 568
 569         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 570         if (!dst)
 571                 return -ENOMEM;
 572
 573         for_each_possible_cpu(i) {
 574                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 575                 for (j = 0; j < 256; j++) {
 576                         dst[j].o_bytes   += src[j].o_bytes;
 577                         dst[j].o_packets += src[j].o_packets;
 578                         dst[j].i_bytes   += src[j].i_bytes;
 579                         dst[j].i_packets += src[j].i_packets;
 580                 }
 581         }
 582
 583         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 584         kfree(dst);
 585         return 0;
 586 }
 587
 588 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 589 {
 590         return single_open(file, rt_acct_proc_show, NULL);
 591 }
 592
 593 static const struct file_operations rt_acct_proc_fops = {
 594         .owner          = THIS_MODULE,
 595         .open           = rt_acct_proc_open,
 596         .read           = seq_read,
 597         .llseek         = seq_lseek,
 598         .release        = single_release,
 599 };
 600 #endif
 601
 602 static int __net_init ip_rt_do_proc_init(struct net *net)
 603 {
 604         struct proc_dir_entry *pde;
 605
 606         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 607                         &rt_cache_seq_fops);
 608         if (!pde)
 609                 goto err1;
 610
 611         pde = proc_create("rt_cache", S_IRUGO,
 612                           net->proc_net_stat, &rt_cpu_seq_fops);
 613         if (!pde)
 614                 goto err2;
 615
 616 #ifdef CONFIG_IP_ROUTE_CLASSID
 617         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 618         if (!pde)
 619                 goto err3;
 620 #endif
 621         return 0;
 622
 623 #ifdef CONFIG_IP_ROUTE_CLASSID
 624 err3:
 625         remove_proc_entry("rt_cache", net->proc_net_stat);
 626 #endif
 627 err2:
 628         remove_proc_entry("rt_cache", net->proc_net);
 629 err1:
 630         return -ENOMEM;
 631 }
 632
 633 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 634 {
 635         remove_proc_entry("rt_cache", net->proc_net_stat);
 636         remove_proc_entry("rt_cache", net->proc_net);
 637 #ifdef CONFIG_IP_ROUTE_CLASSID
 638         remove_proc_entry("rt_acct", net->proc_net);
 639 #endif
 640 }
 641
 642 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 643         .init = ip_rt_do_proc_init,
 644         .exit = ip_rt_do_proc_exit,
 645 };
 646
 647 static int __init ip_rt_proc_init(void)
 648 {
 649         return register_pernet_subsys(&ip_rt_proc_ops);
 650 }
 651
 652 #else
 653 static inline int ip_rt_proc_init(void)
 654 {
 655         return 0;
 656 }
 657 #endif /* CONFIG_PROC_FS */
 658
 659 static inline void rt_free(struct rtable *rt)
 660 {
 661         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 662 }
 663
 664 static inline void rt_drop(struct rtable *rt)
 665 {
 666         ip_rt_put(rt);
 667         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 668 }
 669
 670 static inline int rt_fast_clean(struct rtable *rth)
 671 {
 672         /* Kill broadcast/multicast entries very aggresively, if they
 673            collide in hash table with more useful entries */
 674         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 675                 rt_is_input_route(rth) && rth->dst.rt_next;
 676 }
 677
 678 static inline int rt_valuable(struct rtable *rth)
 679 {
 680         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 681                 (rth->peer && rth->peer->pmtu_expires);
 682 }
 683
 684 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 685 {
 686         unsigned long age;
 687         int ret = 0;
 688
 689         if (atomic_read(&rth->dst.__refcnt))
 690                 goto out;
 691
 692         age = jiffies - rth->dst.lastuse;
 693         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 694             (age <= tmo2 && rt_valuable(rth)))
 695                 goto out;
 696         ret = 1;
 697 out:    return ret;
 698 }
 699
 700 /* Bits of score are:
 701  * 31: very valuable
 702  * 30: not quite useless
 703  * 29..0: usage counter
 704  */
 705 static inline u32 rt_score(struct rtable *rt)
 706 {
 707         u32 score = jiffies - rt->dst.lastuse;
 708
 709         score = ~score & ~(3<<30);
 710
 711         if (rt_valuable(rt))
 712                 score |= (1<<31);
 713
 714         if (rt_is_output_route(rt) ||
 715             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 716                 score |= (1<<30);
 717
 718         return score;
 719 }
 720
 721 static inline bool rt_caching(const struct net *net)
 722 {
 723         return net->ipv4.current_rt_cache_rebuild_count <=
 724                 net->ipv4.sysctl_rt_cache_rebuild_count;
 725 }
 726
 727 static inline bool compare_hash_inputs(const struct rtable *rt1,
 728                                        const struct rtable *rt2)
 729 {
 730         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 731                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 732                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 733 }
 734
 735 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 736 {
 737         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 738                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 739                 (rt1->rt_mark ^ rt2->rt_mark) |
 740                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
 741                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
 742                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
 743 }
 744
 745 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 746 {
 747         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 748 }
 749
 750 static inline int rt_is_expired(struct rtable *rth)
 751 {
 752         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 753 }
 754
 755 /*
 756  * Perform a full scan of hash table and free all entries.
 757  * Can be called by a softirq or a process.
 758  * In the later case, we want to be reschedule if necessary
 759  */
 760 static void rt_do_flush(struct net *net, int process_context)
 761 {
 762         unsigned int i;
 763         struct rtable *rth, *next;
 764
 765         for (i = 0; i <= rt_hash_mask; i++) {
 766                 struct rtable __rcu **pprev;
 767                 struct rtable *list;
 768
 769                 if (process_context && need_resched())
 770                         cond_resched();
 771                 rth = rcu_access_pointer(rt_hash_table[i].chain);
 772                 if (!rth)
 773                         continue;
 774
 775                 spin_lock_bh(rt_hash_lock_addr(i));
 776
 777                 list = NULL;
 778                 pprev = &rt_hash_table[i].chain;
 779                 rth = rcu_dereference_protected(*pprev,
 780                         lockdep_is_held(rt_hash_lock_addr(i)));
 781
 782                 while (rth) {
 783                         next = rcu_dereference_protected(rth->dst.rt_next,
 784                                 lockdep_is_held(rt_hash_lock_addr(i)));
 785
 786                         if (!net ||
 787                             net_eq(dev_net(rth->dst.dev), net)) {
 788                                 rcu_assign_pointer(*pprev, next);
 789                                 rcu_assign_pointer(rth->dst.rt_next, list);
 790                                 list = rth;
 791                         } else {
 792                                 pprev = &rth->dst.rt_next;
 793                         }
 794                         rth = next;
 795                 }
 796
 797                 spin_unlock_bh(rt_hash_lock_addr(i));
 798
 799                 for (; list; list = next) {
 800                         next = rcu_dereference_protected(list->dst.rt_next, 1);
 801                         rt_free(list);
 802                 }
 803         }
 804 }
 805
 806 /*
 807  * While freeing expired entries, we compute average chain length
 808  * and standard deviation, using fixed-point arithmetic.
 809  * This to have an estimation of rt_chain_length_max
 810  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 811  * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 812  */
 813
 814 #define FRACT_BITS 3
 815 #define ONE (1UL << FRACT_BITS)
 816
 817 /*
 818  * Given a hash chain and an item in this hash chain,
 819  * find if a previous entry has the same hash_inputs
 820  * (but differs on tos, mark or oif)
 821  * Returns 0 if an alias is found.
 822  * Returns ONE if rth has no alias before itself.
 823  */
 824 static int has_noalias(const struct rtable *head, const struct rtable *rth)
 825 {
 826         const struct rtable *aux = head;
 827
 828         while (aux != rth) {
 829                 if (compare_hash_inputs(aux, rth))
 830                         return 0;
 831                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 832         }
 833         return ONE;
 834 }
 835
 836 static void rt_check_expire(void)
 837 {
 838         static unsigned int rover;
 839         unsigned int i = rover, goal;
 840         struct rtable *rth;
 841         struct rtable __rcu **rthp;
 842         unsigned long samples = 0;
 843         unsigned long sum = 0, sum2 = 0;
 844         unsigned long delta;
 845         u64 mult;
 846
 847         delta = jiffies - expires_ljiffies;
 848         expires_ljiffies = jiffies;
 849         mult = ((u64)delta) << rt_hash_log;
 850         if (ip_rt_gc_timeout > 1)
 851                 do_div(mult, ip_rt_gc_timeout);
 852         goal = (unsigned int)mult;
 853         if (goal > rt_hash_mask)
 854                 goal = rt_hash_mask + 1;
 855         for (; goal > 0; goal--) {
 856                 unsigned long tmo = ip_rt_gc_timeout;
 857                 unsigned long length;
 858
 859                 i = (i + 1) & rt_hash_mask;
 860                 rthp = &rt_hash_table[i].chain;
 861
 862                 if (need_resched())
 863                         cond_resched();
 864
 865                 samples++;
 866
 867                 if (rcu_dereference_raw(*rthp) == NULL)
 868                         continue;
 869                 length = 0;
 870                 spin_lock_bh(rt_hash_lock_addr(i));
 871                 while ((rth = rcu_dereference_protected(*rthp,
 872                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 873                         prefetch(rth->dst.rt_next);
 874                         if (rt_is_expired(rth)) {
 875                                 *rthp = rth->dst.rt_next;
 876                                 rt_free(rth);
 877                                 continue;
 878                         }
 879                         if (rth->dst.expires) {
 880                                 /* Entry is expired even if it is in use */
 881                                 if (time_before_eq(jiffies, rth->dst.expires)) {
 882 nofree:
 883                                         tmo >>= 1;
 884                                         rthp = &rth->dst.rt_next;
 885                                         /*
 886                                          * We only count entries on
 887                                          * a chain with equal hash inputs once
 888                                          * so that entries for different QOS
 889                                          * levels, and other non-hash input
 890                                          * attributes don't unfairly skew
 891                                          * the length computation
 892                                          */
 893                                         length += has_noalias(rt_hash_table[i].chain, rth);
 894                                         continue;
 895                                 }
 896                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 897                                 goto nofree;
 898
 899                         /* Cleanup aged off entries. */
 900                         *rthp = rth->dst.rt_next;
 901                         rt_free(rth);
 902                 }
 903                 spin_unlock_bh(rt_hash_lock_addr(i));
 904                 sum += length;
 905                 sum2 += length*length;
 906         }
 907         if (samples) {
 908                 unsigned long avg = sum / samples;
 909                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 910                 rt_chain_length_max = max_t(unsigned long,
 911                                         ip_rt_gc_elasticity,
 912                                         (avg + 4*sd) >> FRACT_BITS);
 913         }
 914         rover = i;
 915 }
 916
 917 /*
 918  * rt_worker_func() is run in process context.
 919  * we call rt_check_expire() to scan part of the hash table
 920  */
 921 static void rt_worker_func(struct work_struct *work)
 922 {
 923         rt_check_expire();
 924         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 925 }
 926
 927 /*
 928  * Perturbation of rt_genid by a small quantity [1..256]
 929  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 930  * many times (2^24) without giving recent rt_genid.
 931  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 932  */
 933 static void rt_cache_invalidate(struct net *net)
 934 {
 935         unsigned char shuffle;
 936
 937         get_random_bytes(&shuffle, sizeof(shuffle));
 938         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 939         inetpeer_invalidate_tree(AF_INET);
 940 }
 941
 942 /*
 943  * delay < 0  : invalidate cache (fast : entries will be deleted later)
 944  * delay >= 0 : invalidate & flush cache (can be long)
 945  */
 946 void rt_cache_flush(struct net *net, int delay)
 947 {
 948         rt_cache_invalidate(net);
 949         if (delay >= 0)
 950                 rt_do_flush(net, !in_softirq());
 951 }
 952
 953 /* Flush previous cache invalidated entries from the cache */
 954 void rt_cache_flush_batch(struct net *net)
 955 {
 956         rt_do_flush(net, !in_softirq());
 957 }
 958
 959 static void rt_emergency_hash_rebuild(struct net *net)
 960 {
 961         if (net_ratelimit())
 962                 printk(KERN_WARNING "Route hash chain too long!\n");
 963         rt_cache_invalidate(net);
 964 }
 965
 966 /*
 967    Short description of GC goals.
 968
 969    We want to build algorithm, which will keep routing cache
 970    at some equilibrium point, when number of aged off entries
 971    is kept approximately equal to newly generated ones.
 972
 973    Current expiration strength is variable "expire".
 974    We try to adjust it dynamically, so that if networking
 975    is idle expires is large enough to keep enough of warm entries,
 976    and when load increases it reduces to limit cache size.
 977  */
 978
 979 static int rt_garbage_collect(struct dst_ops *ops)
 980 {
 981         static unsigned long expire = RT_GC_TIMEOUT;
 982         static unsigned long last_gc;
 983         static int rover;
 984         static int equilibrium;
 985         struct rtable *rth;
 986         struct rtable __rcu **rthp;
 987         unsigned long now = jiffies;
 988         int goal;
 989         int entries = dst_entries_get_fast(&ipv4_dst_ops);
 990
 991         /*
 992          * Garbage collection is pretty expensive,
 993          * do not make it too frequently.
 994          */
 995
 996         RT_CACHE_STAT_INC(gc_total);
 997
 998         if (now - last_gc < ip_rt_gc_min_interval &&
 999             entries < ip_rt_max_size) {
1000                 RT_CACHE_STAT_INC(gc_ignored);
1001                 goto out;
1002         }
1003
1004         entries = dst_entries_get_slow(&ipv4_dst_ops);
1005         /* Calculate number of entries, which we want to expire now. */
1006         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1007         if (goal <= 0) {
1008                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1009                         equilibrium = ipv4_dst_ops.gc_thresh;
1010                 goal = entries - equilibrium;
1011                 if (goal > 0) {
1012                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1013                         goal = entries - equilibrium;
1014                 }
1015         } else {
1016                 /* We are in dangerous area. Try to reduce cache really
1017                  * aggressively.
1018                  */
1019                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1020                 equilibrium = entries - goal;
1021         }
1022
1023         if (now - last_gc >= ip_rt_gc_min_interval)
1024                 last_gc = now;
1025
1026         if (goal <= 0) {
1027                 equilibrium += goal;
1028                 goto work_done;
1029         }
1030
1031         do {
1032                 int i, k;
1033
1034                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1035                         unsigned long tmo = expire;
1036
1037                         k = (k + 1) & rt_hash_mask;
1038                         rthp = &rt_hash_table[k].chain;
1039                         spin_lock_bh(rt_hash_lock_addr(k));
1040                         while ((rth = rcu_dereference_protected(*rthp,
1041                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1042                                 if (!rt_is_expired(rth) &&
1043                                         !rt_may_expire(rth, tmo, expire)) {
1044                                         tmo >>= 1;
1045                                         rthp = &rth->dst.rt_next;
1046                                         continue;
1047                                 }
1048                                 *rthp = rth->dst.rt_next;
1049                                 rt_free(rth);
1050                                 goal--;
1051                         }
1052                         spin_unlock_bh(rt_hash_lock_addr(k));
1053                         if (goal <= 0)
1054                                 break;
1055                 }
1056                 rover = k;
1057
1058                 if (goal <= 0)
1059                         goto work_done;
1060
1061                 /* Goal is not achieved. We stop process if:
1062
1063                    - if expire reduced to zero. Otherwise, expire is halfed.
1064                    - if table is not full.
1065                    - if we are called from interrupt.
1066                    - jiffies check is just fallback/debug loop breaker.
1067                      We will not spin here for long time in any case.
1068                  */
1069
1070                 RT_CACHE_STAT_INC(gc_goal_miss);
1071
1072                 if (expire == 0)
1073                         break;
1074
1075                 expire >>= 1;
1076
1077                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1078                         goto out;
1079         } while (!in_softirq() && time_before_eq(jiffies, now));
1080
1081         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1082                 goto out;
1083         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1084                 goto out;
1085         if (net_ratelimit())
1086                 printk(KERN_WARNING "dst cache overflow\n");
1087         RT_CACHE_STAT_INC(gc_dst_overflow);
1088         return 1;
1089
1090 work_done:
1091         expire += ip_rt_gc_min_interval;
1092         if (expire > ip_rt_gc_timeout ||
1093             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095                 expire = ip_rt_gc_timeout;
1096 out:    return 0;
1097 }
1098
1099 /*
1100  * Returns number of entries in a hash chain that have different hash_inputs
1101  */
1102 static int slow_chain_length(const struct rtable *head)
1103 {
1104         int length = 0;
1105         const struct rtable *rth = head;
1106
1107         while (rth) {
1108                 length += has_noalias(head, rth);
1109                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110         }
1111         return length >> FRACT_BITS;
1112 }
1113
1114 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115 {
1116         static const __be32 inaddr_any = 0;
1117         struct net_device *dev = dst->dev;
1118         const __be32 *pkey = daddr;
1119         const struct rtable *rt;
1120         struct neighbour *n;
1121
1122         rt = (const struct rtable *) dst;
1123
1124         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125                 pkey = &inaddr_any;
1126         else if (rt->rt_gateway)
1127                 pkey = (const __be32 *) &rt->rt_gateway;
1128
1129         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130         if (n)
1131                 return n;
1132         return neigh_create(&arp_tbl, pkey, dev);
1133 }
1134
1135 static int rt_bind_neighbour(struct rtable *rt)
1136 {
1137         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138         if (IS_ERR(n))
1139                 return PTR_ERR(n);
1140         dst_set_neighbour(&rt->dst, n);
1141
1142         return 0;
1143 }
1144
1145 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1146                                      struct sk_buff *skb, int ifindex)
1147 {
1148         struct rtable   *rth, *cand;
1149         struct rtable __rcu **rthp, **candp;
1150         unsigned long   now;
1151         u32             min_score;
1152         int             chain_length;
1153         int attempts = !in_softirq();
1154
1155 restart:
1156         chain_length = 0;
1157         min_score = ~(u32)0;
1158         cand = NULL;
1159         candp = NULL;
1160         now = jiffies;
1161
1162         if (!rt_caching(dev_net(rt->dst.dev))) {
1163                 /*
1164                  * If we're not caching, just tell the caller we
1165                  * were successful and don't touch the route.  The
1166                  * caller hold the sole reference to the cache entry, and
1167                  * it will be released when the caller is done with it.
1168                  * If we drop it here, the callers have no way to resolve routes
1169                  * when we're not caching.  Instead, just point *rp at rt, so
1170                  * the caller gets a single use out of the route
1171                  * Note that we do rt_free on this new route entry, so that
1172                  * once its refcount hits zero, we are still able to reap it
1173                  * (Thanks Alexey)
1174                  * Note: To avoid expensive rcu stuff for this uncached dst,
1175                  * we set DST_NOCACHE so that dst_release() can free dst without
1176                  * waiting a grace period.
1177                  */
1178
1179                 rt->dst.flags |= DST_NOCACHE;
1180                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181                         int err = rt_bind_neighbour(rt);
1182                         if (err) {
1183                                 if (net_ratelimit())
1184                                         printk(KERN_WARNING
1185                                             "Neighbour table failure & not caching routes.\n");
1186                                 ip_rt_put(rt);
1187                                 return ERR_PTR(err);
1188                         }
1189                 }
1190
1191                 goto skip_hashing;
1192         }
1193
1194         rthp = &rt_hash_table[hash].chain;
1195
1196         spin_lock_bh(rt_hash_lock_addr(hash));
1197         while ((rth = rcu_dereference_protected(*rthp,
1198                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1199                 if (rt_is_expired(rth)) {
1200                         *rthp = rth->dst.rt_next;
1201                         rt_free(rth);
1202                         continue;
1203                 }
1204                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1205                         /* Put it first */
1206                         *rthp = rth->dst.rt_next;
1207                         /*
1208                          * Since lookup is lockfree, the deletion
1209                          * must be visible to another weakly ordered CPU before
1210                          * the insertion at the start of the hash chain.
1211                          */
1212                         rcu_assign_pointer(rth->dst.rt_next,
1213                                            rt_hash_table[hash].chain);
1214                         /*
1215                          * Since lookup is lockfree, the update writes
1216                          * must be ordered for consistency on SMP.
1217                          */
1218                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1219
1220                         dst_use(&rth->dst, now);
1221                         spin_unlock_bh(rt_hash_lock_addr(hash));
1222
1223                         rt_drop(rt);
1224                         if (skb)
1225                                 skb_dst_set(skb, &rth->dst);
1226                         return rth;
1227                 }
1228
1229                 if (!atomic_read(&rth->dst.__refcnt)) {
1230                         u32 score = rt_score(rth);
1231
1232                         if (score <= min_score) {
1233                                 cand = rth;
1234                                 candp = rthp;
1235                                 min_score = score;
1236                         }
1237                 }
1238
1239                 chain_length++;
1240
1241                 rthp = &rth->dst.rt_next;
1242         }
1243
1244         if (cand) {
1245                 /* ip_rt_gc_elasticity used to be average length of chain
1246                  * length, when exceeded gc becomes really aggressive.
1247                  *
1248                  * The second limit is less certain. At the moment it allows
1249                  * only 2 entries per bucket. We will see.
1250                  */
1251                 if (chain_length > ip_rt_gc_elasticity) {
1252                         *candp = cand->dst.rt_next;
1253                         rt_free(cand);
1254                 }
1255         } else {
1256                 if (chain_length > rt_chain_length_max &&
1257                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1258                         struct net *net = dev_net(rt->dst.dev);
1259                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1260                         if (!rt_caching(net)) {
1261                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1262                                         rt->dst.dev->name, num);
1263                         }
1264                         rt_emergency_hash_rebuild(net);
1265                         spin_unlock_bh(rt_hash_lock_addr(hash));
1266
1267                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1268                                         ifindex, rt_genid(net));
1269                         goto restart;
1270                 }
1271         }
1272
1273         /* Try to bind route to arp only if it is output
1274            route or unicast forwarding path.
1275          */
1276         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1277                 int err = rt_bind_neighbour(rt);
1278                 if (err) {
1279                         spin_unlock_bh(rt_hash_lock_addr(hash));
1280
1281                         if (err != -ENOBUFS) {
1282                                 rt_drop(rt);
1283                                 return ERR_PTR(err);
1284                         }
1285
1286                         /* Neighbour tables are full and nothing
1287                            can be released. Try to shrink route cache,
1288                            it is most likely it holds some neighbour records.
1289                          */
1290                         if (attempts-- > 0) {
1291                                 int saved_elasticity = ip_rt_gc_elasticity;
1292                                 int saved_int = ip_rt_gc_min_interval;
1293                                 ip_rt_gc_elasticity     = 1;
1294                                 ip_rt_gc_min_interval   = 0;
1295                                 rt_garbage_collect(&ipv4_dst_ops);
1296                                 ip_rt_gc_min_interval   = saved_int;
1297                                 ip_rt_gc_elasticity     = saved_elasticity;
1298                                 goto restart;
1299                         }
1300
1301                         if (net_ratelimit())
1302                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1303                         rt_drop(rt);
1304                         return ERR_PTR(-ENOBUFS);
1305                 }
1306         }
1307
1308         rt->dst.rt_next = rt_hash_table[hash].chain;
1309
1310         /*
1311          * Since lookup is lockfree, we must make sure
1312          * previous writes to rt are committed to memory
1313          * before making rt visible to other CPUS.
1314          */
1315         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1316
1317         spin_unlock_bh(rt_hash_lock_addr(hash));
1318
1319 skip_hashing:
1320         if (skb)
1321                 skb_dst_set(skb, &rt->dst);
1322         return rt;
1323 }
1324
1325 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1326
1327 static u32 rt_peer_genid(void)
1328 {
1329         return atomic_read(&__rt_peer_genid);
1330 }
1331
1332 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1333 {
1334         struct inet_peer *peer;
1335
1336         peer = inet_getpeer_v4(daddr, create);
1337
1338         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1339                 inet_putpeer(peer);
1340         else
1341                 rt->rt_peer_genid = rt_peer_genid();
1342 }
1343
1344 /*
1345  * Peer allocation may fail only in serious out-of-memory conditions.  However
1346  * we still can generate some output.
1347  * Random ID selection looks a bit dangerous because we have no chances to
1348  * select ID being unique in a reasonable period of time.
1349  * But broken packet identifier may be better than no packet at all.
1350  */
1351 static void ip_select_fb_ident(struct iphdr *iph)
1352 {
1353         static DEFINE_SPINLOCK(ip_fb_id_lock);
1354         static u32 ip_fallback_id;
1355         u32 salt;
1356
1357         spin_lock_bh(&ip_fb_id_lock);
1358         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1359         iph->id = htons(salt & 0xFFFF);
1360         ip_fallback_id = salt;
1361         spin_unlock_bh(&ip_fb_id_lock);
1362 }
1363
1364 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1365 {
1366         struct rtable *rt = (struct rtable *) dst;
1367
1368         if (rt && !(rt->dst.flags & DST_NOPEER)) {
1369                 if (rt->peer == NULL)
1370                         rt_bind_peer(rt, rt->rt_dst, 1);
1371
1372                 /* If peer is attached to destination, it is never detached,
1373                    so that we need not to grab a lock to dereference it.
1374                  */
1375                 if (rt->peer) {
1376                         iph->id = htons(inet_getid(rt->peer, more));
1377                         return;
1378                 }
1379         } else if (!rt)
1380                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1381                        __builtin_return_address(0));
1382
1383         ip_select_fb_ident(iph);
1384 }
1385 EXPORT_SYMBOL(__ip_select_ident);
1386
1387 static void rt_del(unsigned hash, struct rtable *rt)
1388 {
1389         struct rtable __rcu **rthp;
1390         struct rtable *aux;
1391
1392         rthp = &rt_hash_table[hash].chain;
1393         spin_lock_bh(rt_hash_lock_addr(hash));
1394         ip_rt_put(rt);
1395         while ((aux = rcu_dereference_protected(*rthp,
1396                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1397                 if (aux == rt || rt_is_expired(aux)) {
1398                         *rthp = aux->dst.rt_next;
1399                         rt_free(aux);
1400                         continue;
1401                 }
1402                 rthp = &aux->dst.rt_next;
1403         }
1404         spin_unlock_bh(rt_hash_lock_addr(hash));
1405 }
1406
1407 static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1408 {
1409         struct rtable *rt = (struct rtable *) dst;
1410         __be32 orig_gw = rt->rt_gateway;
1411         struct neighbour *n, *old_n;
1412
1413         dst_confirm(&rt->dst);
1414
1415         rt->rt_gateway = peer->redirect_learned.a4;
1416
1417         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1418         if (IS_ERR(n)) {
1419                 rt->rt_gateway = orig_gw;
1420                 return;
1421         }
1422         old_n = xchg(&rt->dst._neighbour, n);
1423         if (old_n)
1424                 neigh_release(old_n);
1425         if (!(n->nud_state & NUD_VALID)) {
1426                 neigh_event_send(n, NULL);
1427         } else {
1428                 rt->rt_flags |= RTCF_REDIRECTED;
1429                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1430         }
1431 }
1432
1433 /* called in rcu_read_lock() section */
1434 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1435                     __be32 saddr, struct net_device *dev)
1436 {
1437         int s, i;
1438         struct in_device *in_dev = __in_dev_get_rcu(dev);
1439         __be32 skeys[2] = { saddr, 0 };
1440         int    ikeys[2] = { dev->ifindex, 0 };
1441         struct inet_peer *peer;
1442         struct net *net;
1443
1444         if (!in_dev)
1445                 return;
1446
1447         net = dev_net(dev);
1448         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1449             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1450             ipv4_is_zeronet(new_gw))
1451                 goto reject_redirect;
1452
1453         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1454                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1455                         goto reject_redirect;
1456                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1457                         goto reject_redirect;
1458         } else {
1459                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1460                         goto reject_redirect;
1461         }
1462
1463         for (s = 0; s < 2; s++) {
1464                 for (i = 0; i < 2; i++) {
1465                         unsigned int hash;
1466                         struct rtable __rcu **rthp;
1467                         struct rtable *rt;
1468
1469                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1470
1471                         rthp = &rt_hash_table[hash].chain;
1472
1473                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1474                                 rthp = &rt->dst.rt_next;
1475
1476                                 if (rt->rt_key_dst != daddr ||
1477                                     rt->rt_key_src != skeys[s] ||
1478                                     rt->rt_oif != ikeys[i] ||
1479                                     rt_is_input_route(rt) ||
1480                                     rt_is_expired(rt) ||
1481                                     !net_eq(dev_net(rt->dst.dev), net) ||
1482                                     rt->dst.error ||
1483                                     rt->dst.dev != dev ||
1484                                     rt->rt_gateway != old_gw)
1485                                         continue;
1486
1487                                 if (!rt->peer)
1488                                         rt_bind_peer(rt, rt->rt_dst, 1);
1489
1490                                 peer = rt->peer;
1491                                 if (peer) {
1492                                         if (peer->redirect_learned.a4 != new_gw) {
1493                                                 peer->redirect_learned.a4 = new_gw;
1494                                                 atomic_inc(&__rt_peer_genid);
1495                                         }
1496                                         check_peer_redir(&rt->dst, peer);
1497                                 }
1498                         }
1499                 }
1500         }
1501         return;
1502
1503 reject_redirect:
1504 #ifdef CONFIG_IP_ROUTE_VERBOSE
1505         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1506                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1507                         "  Advised path = %pI4 -> %pI4\n",
1508                        &old_gw, dev->name, &new_gw,
1509                        &saddr, &daddr);
1510 #endif
1511         ;
1512 }
1513
1514 static bool peer_pmtu_expired(struct inet_peer *peer)
1515 {
1516         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1517
1518         return orig &&
1519                time_after_eq(jiffies, orig) &&
1520                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1521 }
1522
1523 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1524 {
1525         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1526
1527         return orig &&
1528                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1529 }
1530
1531 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1532 {
1533         struct rtable *rt = (struct rtable *)dst;
1534         struct dst_entry *ret = dst;
1535
1536         if (rt) {
1537                 if (dst->obsolete > 0) {
1538                         ip_rt_put(rt);
1539                         ret = NULL;
1540                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1541                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1542                                                 rt->rt_oif,
1543                                                 rt_genid(dev_net(dst->dev)));
1544                         rt_del(hash, rt);
1545                         ret = NULL;
1546                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1547                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1548                 }
1549         }
1550         return ret;
1551 }
1552
1553 /*
1554  * Algorithm:
1555  *      1. The first ip_rt_redirect_number redirects are sent
1556  *         with exponential backoff, then we stop sending them at all,
1557  *         assuming that the host ignores our redirects.
1558  *      2. If we did not see packets requiring redirects
1559  *         during ip_rt_redirect_silence, we assume that the host
1560  *         forgot redirected route and start to send redirects again.
1561  *
1562  * This algorithm is much cheaper and more intelligent than dumb load limiting
1563  * in icmp.c.
1564  *
1565  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1566  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1567  */
1568
1569 void ip_rt_send_redirect(struct sk_buff *skb)
1570 {
1571         struct rtable *rt = skb_rtable(skb);
1572         struct in_device *in_dev;
1573         struct inet_peer *peer;
1574         int log_martians;
1575
1576         rcu_read_lock();
1577         in_dev = __in_dev_get_rcu(rt->dst.dev);
1578         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1579                 rcu_read_unlock();
1580                 return;
1581         }
1582         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1583         rcu_read_unlock();
1584
1585         if (!rt->peer)
1586                 rt_bind_peer(rt, rt->rt_dst, 1);
1587         peer = rt->peer;
1588         if (!peer) {
1589                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1590                 return;
1591         }
1592
1593         /* No redirected packets during ip_rt_redirect_silence;
1594          * reset the algorithm.
1595          */
1596         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1597                 peer->rate_tokens = 0;
1598
1599         /* Too many ignored redirects; do not send anything
1600          * set dst.rate_last to the last seen redirected packet.
1601          */
1602         if (peer->rate_tokens >= ip_rt_redirect_number) {
1603                 peer->rate_last = jiffies;
1604                 return;
1605         }
1606
1607         /* Check for load limit; set rate_last to the latest sent
1608          * redirect.
1609          */
1610         if (peer->rate_tokens == 0 ||
1611             time_after(jiffies,
1612                        (peer->rate_last +
1613                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1614                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1615                 peer->rate_last = jiffies;
1616                 ++peer->rate_tokens;
1617 #ifdef CONFIG_IP_ROUTE_VERBOSE
1618                 if (log_martians &&
1619                     peer->rate_tokens == ip_rt_redirect_number &&
1620                     net_ratelimit())
1621                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1622                                &ip_hdr(skb)->saddr, rt->rt_iif,
1623                                 &rt->rt_dst, &rt->rt_gateway);
1624 #endif
1625         }
1626 }
1627
1628 static int ip_error(struct sk_buff *skb)
1629 {
1630         struct rtable *rt = skb_rtable(skb);
1631         struct inet_peer *peer;
1632         unsigned long now;
1633         bool send;
1634         int code;
1635
1636         switch (rt->dst.error) {
1637         case EINVAL:
1638         default:
1639                 goto out;
1640         case EHOSTUNREACH:
1641                 code = ICMP_HOST_UNREACH;
1642                 break;
1643         case ENETUNREACH:
1644                 code = ICMP_NET_UNREACH;
1645                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1646                                 IPSTATS_MIB_INNOROUTES);
1647                 break;
1648         case EACCES:
1649                 code = ICMP_PKT_FILTERED;
1650                 break;
1651         }
1652
1653         if (!rt->peer)
1654                 rt_bind_peer(rt, rt->rt_dst, 1);
1655         peer = rt->peer;
1656
1657         send = true;
1658         if (peer) {
1659                 now = jiffies;
1660                 peer->rate_tokens += now - peer->rate_last;
1661                 if (peer->rate_tokens > ip_rt_error_burst)
1662                         peer->rate_tokens = ip_rt_error_burst;
1663                 peer->rate_last = now;
1664                 if (peer->rate_tokens >= ip_rt_error_cost)
1665                         peer->rate_tokens -= ip_rt_error_cost;
1666                 else
1667                         send = false;
1668         }
1669         if (send)
1670                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1671
1672 out:    kfree_skb(skb);
1673         return 0;
1674 }
1675
1676 /*
1677  *      The last two values are not from the RFC but
1678  *      are needed for AMPRnet AX.25 paths.
1679  */
1680
1681 static const unsigned short mtu_plateau[] =
1682 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1683
1684 static inline unsigned short guess_mtu(unsigned short old_mtu)
1685 {
1686         int i;
1687
1688         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1689                 if (old_mtu > mtu_plateau[i])
1690                         return mtu_plateau[i];
1691         return 68;
1692 }
1693
1694 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1695                                  unsigned short new_mtu,
1696                                  struct net_device *dev)
1697 {
1698         unsigned short old_mtu = ntohs(iph->tot_len);
1699         unsigned short est_mtu = 0;
1700         struct inet_peer *peer;
1701
1702         peer = inet_getpeer_v4(iph->daddr, 1);
1703         if (peer) {
1704                 unsigned short mtu = new_mtu;
1705
1706                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1707                         /* BSD 4.2 derived systems incorrectly adjust
1708                          * tot_len by the IP header length, and report
1709                          * a zero MTU in the ICMP message.
1710                          */
1711                         if (mtu == 0 &&
1712                             old_mtu >= 68 + (iph->ihl << 2))
1713                                 old_mtu -= iph->ihl << 2;
1714                         mtu = guess_mtu(old_mtu);
1715                 }
1716
1717                 if (mtu < ip_rt_min_pmtu)
1718                         mtu = ip_rt_min_pmtu;
1719                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1720                         unsigned long pmtu_expires;
1721
1722                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1723                         if (!pmtu_expires)
1724                                 pmtu_expires = 1UL;
1725
1726                         est_mtu = mtu;
1727                         peer->pmtu_learned = mtu;
1728                         peer->pmtu_expires = pmtu_expires;
1729                         atomic_inc(&__rt_peer_genid);
1730                 }
1731
1732                 inet_putpeer(peer);
1733         }
1734         return est_mtu ? : new_mtu;
1735 }
1736
1737 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1738 {
1739         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1740
1741         if (!expires)
1742                 return;
1743         if (time_before(jiffies, expires)) {
1744                 u32 orig_dst_mtu = dst_mtu(dst);
1745                 if (peer->pmtu_learned < orig_dst_mtu) {
1746                         if (!peer->pmtu_orig)
1747                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1748                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1749                 }
1750         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1751                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1752 }
1753
1754 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1755 {
1756         struct rtable *rt = (struct rtable *) dst;
1757         struct inet_peer *peer;
1758
1759         dst_confirm(dst);
1760
1761         if (!rt->peer)
1762                 rt_bind_peer(rt, rt->rt_dst, 1);
1763         peer = rt->peer;
1764         if (peer) {
1765                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1766
1767                 if (mtu < ip_rt_min_pmtu)
1768                         mtu = ip_rt_min_pmtu;
1769                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1770
1771                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1772                         if (!pmtu_expires)
1773                                 pmtu_expires = 1UL;
1774
1775                         peer->pmtu_learned = mtu;
1776                         peer->pmtu_expires = pmtu_expires;
1777
1778                         atomic_inc(&__rt_peer_genid);
1779                         rt->rt_peer_genid = rt_peer_genid();
1780                 }
1781                 check_peer_pmtu(dst, peer);
1782         }
1783 }
1784
1785
1786 static void ipv4_validate_peer(struct rtable *rt)
1787 {
1788         if (rt->rt_peer_genid != rt_peer_genid()) {
1789                 struct inet_peer *peer;
1790
1791                 if (!rt->peer)
1792                         rt_bind_peer(rt, rt->rt_dst, 0);
1793
1794                 peer = rt->peer;
1795                 if (peer) {
1796                         check_peer_pmtu(&rt->dst, peer);
1797
1798                         if (peer->redirect_learned.a4 &&
1799                             peer->redirect_learned.a4 != rt->rt_gateway)
1800                                 check_peer_redir(&rt->dst, peer);
1801                 }
1802
1803                 rt->rt_peer_genid = rt_peer_genid();
1804         }
1805 }
1806
1807 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1808 {
1809         struct rtable *rt = (struct rtable *) dst;
1810
1811         if (rt_is_expired(rt))
1812                 return NULL;
1813         ipv4_validate_peer(rt);
1814         return dst;
1815 }
1816
1817 static void ipv4_dst_destroy(struct dst_entry *dst)
1818 {
1819         struct rtable *rt = (struct rtable *) dst;
1820         struct inet_peer *peer = rt->peer;
1821
1822         if (rt->fi) {
1823                 fib_info_put(rt->fi);
1824                 rt->fi = NULL;
1825         }
1826         if (peer) {
1827                 rt->peer = NULL;
1828                 inet_putpeer(peer);
1829         }
1830 }
1831
1832
1833 static void ipv4_link_failure(struct sk_buff *skb)
1834 {
1835         struct rtable *rt;
1836
1837         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1838
1839         rt = skb_rtable(skb);
1840         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1841                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1842 }
1843
1844 static int ip_rt_bug(struct sk_buff *skb)
1845 {
1846         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1847                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1848                 skb->dev ? skb->dev->name : "?");
1849         kfree_skb(skb);
1850         WARN_ON(1);
1851         return 0;
1852 }
1853
1854 /*
1855    We do not cache source address of outgoing interface,
1856    because it is used only by IP RR, TS and SRR options,
1857    so that it out of fast path.
1858
1859    BTW remember: "addr" is allowed to be not aligned
1860    in IP options!
1861  */
1862
1863 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1864 {
1865         __be32 src;
1866
1867         if (rt_is_output_route(rt))
1868                 src = ip_hdr(skb)->saddr;
1869         else {
1870                 struct fib_result res;
1871                 struct flowi4 fl4;
1872                 struct iphdr *iph;
1873
1874                 iph = ip_hdr(skb);
1875
1876                 memset(&fl4, 0, sizeof(fl4));
1877                 fl4.daddr = iph->daddr;
1878                 fl4.saddr = iph->saddr;
1879                 fl4.flowi4_tos = RT_TOS(iph->tos);
1880                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1881                 fl4.flowi4_iif = skb->dev->ifindex;
1882                 fl4.flowi4_mark = skb->mark;
1883
1884                 rcu_read_lock();
1885                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1886                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1887                 else
1888                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1889                                         RT_SCOPE_UNIVERSE);
1890                 rcu_read_unlock();
1891         }
1892         memcpy(addr, &src, 4);
1893 }
1894
1895 #ifdef CONFIG_IP_ROUTE_CLASSID
1896 static void set_class_tag(struct rtable *rt, u32 tag)
1897 {
1898         if (!(rt->dst.tclassid & 0xFFFF))
1899                 rt->dst.tclassid |= tag & 0xFFFF;
1900         if (!(rt->dst.tclassid & 0xFFFF0000))
1901                 rt->dst.tclassid |= tag & 0xFFFF0000;
1902 }
1903 #endif
1904
1905 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1906 {
1907         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1908
1909         if (advmss == 0) {
1910                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1911                                ip_rt_min_advmss);
1912                 if (advmss > 65535 - 40)
1913                         advmss = 65535 - 40;
1914         }
1915         return advmss;
1916 }
1917
1918 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1919 {
1920         const struct rtable *rt = (const struct rtable *) dst;
1921         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1922
1923         if (mtu && rt_is_output_route(rt))
1924                 return mtu;
1925
1926         mtu = dst->dev->mtu;
1927
1928         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1929
1930                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1931                         mtu = 576;
1932         }
1933
1934         if (mtu > IP_MAX_MTU)
1935                 mtu = IP_MAX_MTU;
1936
1937         return mtu;
1938 }
1939
1940 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1941                             struct fib_info *fi)
1942 {
1943         struct inet_peer *peer;
1944         int create = 0;
1945
1946         /* If a peer entry exists for this destination, we must hook
1947          * it up in order to get at cached metrics.
1948          */
1949         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1950                 create = 1;
1951
1952         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1953         if (peer) {
1954                 rt->rt_peer_genid = rt_peer_genid();
1955                 if (inet_metrics_new(peer))
1956                         memcpy(peer->metrics, fi->fib_metrics,
1957                                sizeof(u32) * RTAX_MAX);
1958                 dst_init_metrics(&rt->dst, peer->metrics, false);
1959
1960                 check_peer_pmtu(&rt->dst, peer);
1961
1962                 if (peer->redirect_learned.a4 &&
1963                     peer->redirect_learned.a4 != rt->rt_gateway) {
1964                         rt->rt_gateway = peer->redirect_learned.a4;
1965                         rt->rt_flags |= RTCF_REDIRECTED;
1966                 }
1967         } else {
1968                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1969                         rt->fi = fi;
1970                         atomic_inc(&fi->fib_clntref);
1971                 }
1972                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1973         }
1974 }
1975
1976 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1977                            const struct fib_result *res,
1978                            struct fib_info *fi, u16 type, u32 itag)
1979 {
1980         struct dst_entry *dst = &rt->dst;
1981
1982         if (fi) {
1983                 if (FIB_RES_GW(*res) &&
1984                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1985                         rt->rt_gateway = FIB_RES_GW(*res);
1986                 rt_init_metrics(rt, fl4, fi);
1987 #ifdef CONFIG_IP_ROUTE_CLASSID
1988                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1989 #endif
1990         }
1991
1992         if (dst_mtu(dst) > IP_MAX_MTU)
1993                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1994         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1995                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1996
1997 #ifdef CONFIG_IP_ROUTE_CLASSID
1998 #ifdef CONFIG_IP_MULTIPLE_TABLES
1999         set_class_tag(rt, fib_rules_tclass(res));
2000 #endif
2001         set_class_tag(rt, itag);
2002 #endif
2003 }
2004
2005 static struct rtable *rt_dst_alloc(struct net_device *dev,
2006                                    bool nopolicy, bool noxfrm)
2007 {
2008         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2009                          DST_HOST |
2010                          (nopolicy ? DST_NOPOLICY : 0) |
2011                          (noxfrm ? DST_NOXFRM : 0));
2012 }
2013
2014 /* called in rcu_read_lock() section */
2015 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2016                                 u8 tos, struct net_device *dev, int our)
2017 {
2018         unsigned int hash;
2019         struct rtable *rth;
2020         __be32 spec_dst;
2021         struct in_device *in_dev = __in_dev_get_rcu(dev);
2022         u32 itag = 0;
2023         int err;
2024
2025         /* Primary sanity checks. */
2026
2027         if (in_dev == NULL)
2028                 return -EINVAL;
2029
2030         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2031             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2032                 goto e_inval;
2033
2034         if (ipv4_is_zeronet(saddr)) {
2035                 if (!ipv4_is_local_multicast(daddr))
2036                         goto e_inval;
2037                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2038         } else {
2039                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2040                                           &itag);
2041                 if (err < 0)
2042                         goto e_err;
2043         }
2044         rth = rt_dst_alloc(init_net.loopback_dev,
2045                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2046         if (!rth)
2047                 goto e_nobufs;
2048
2049 #ifdef CONFIG_IP_ROUTE_CLASSID
2050         rth->dst.tclassid = itag;
2051 #endif
2052         rth->dst.output = ip_rt_bug;
2053
2054         rth->rt_key_dst = daddr;
2055         rth->rt_key_src = saddr;
2056         rth->rt_genid   = rt_genid(dev_net(dev));
2057         rth->rt_flags   = RTCF_MULTICAST;
2058         rth->rt_type    = RTN_MULTICAST;
2059         rth->rt_key_tos = tos;
2060         rth->rt_dst     = daddr;
2061         rth->rt_src     = saddr;
2062         rth->rt_route_iif = dev->ifindex;
2063         rth->rt_iif     = dev->ifindex;
2064         rth->rt_oif     = 0;
2065         rth->rt_mark    = skb->mark;
2066         rth->rt_gateway = daddr;
2067         rth->rt_spec_dst= spec_dst;
2068         rth->rt_peer_genid = 0;
2069         rth->peer = NULL;
2070         rth->fi = NULL;
2071         if (our) {
2072                 rth->dst.input= ip_local_deliver;
2073                 rth->rt_flags |= RTCF_LOCAL;
2074         }
2075
2076 #ifdef CONFIG_IP_MROUTE
2077         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2078                 rth->dst.input = ip_mr_input;
2079 #endif
2080         RT_CACHE_STAT_INC(in_slow_mc);
2081
2082         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2083         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2084         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2085
2086 e_nobufs:
2087         return -ENOBUFS;
2088 e_inval:
2089         return -EINVAL;
2090 e_err:
2091         return err;
2092 }
2093
2094
2095 static void ip_handle_martian_source(struct net_device *dev,
2096                                      struct in_device *in_dev,
2097                                      struct sk_buff *skb,
2098                                      __be32 daddr,
2099                                      __be32 saddr)
2100 {
2101         RT_CACHE_STAT_INC(in_martian_src);
2102 #ifdef CONFIG_IP_ROUTE_VERBOSE
2103         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2104                 /*
2105                  *      RFC1812 recommendation, if source is martian,
2106                  *      the only hint is MAC header.
2107                  */
2108                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2109                         &daddr, &saddr, dev->name);
2110                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2111                         int i;
2112                         const unsigned char *p = skb_mac_header(skb);
2113                         printk(KERN_WARNING "ll header: ");
2114                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2115                                 printk("%02x", *p);
2116                                 if (i < (dev->hard_header_len - 1))
2117                                         printk(":");
2118                         }
2119                         printk("\n");
2120                 }
2121         }
2122 #endif
2123 }
2124
2125 /* called in rcu_read_lock() section */
2126 static int __mkroute_input(struct sk_buff *skb,
2127                            const struct fib_result *res,
2128                            struct in_device *in_dev,
2129                            __be32 daddr, __be32 saddr, u32 tos,
2130                            struct rtable **result)
2131 {
2132         struct rtable *rth;
2133         int err;
2134         struct in_device *out_dev;
2135         unsigned int flags = 0;
2136         __be32 spec_dst;
2137         u32 itag;
2138
2139         /* get a working reference to the output device */
2140         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2141         if (out_dev == NULL) {
2142                 if (net_ratelimit())
2143                         printk(KERN_CRIT "Bug in ip_route_input" \
2144                                "_slow(). Please, report\n");
2145                 return -EINVAL;
2146         }
2147
2148
2149         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2150                                   in_dev->dev, &spec_dst, &itag);
2151         if (err < 0) {
2152                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2153                                          saddr);
2154
2155                 goto cleanup;
2156         }
2157
2158         if (err)
2159                 flags |= RTCF_DIRECTSRC;
2160
2161         if (out_dev == in_dev && err &&
2162             (IN_DEV_SHARED_MEDIA(out_dev) ||
2163              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2164                 flags |= RTCF_DOREDIRECT;
2165
2166         if (skb->protocol != htons(ETH_P_IP)) {
2167                 /* Not IP (i.e. ARP). Do not create route, if it is
2168                  * invalid for proxy arp. DNAT routes are always valid.
2169                  *
2170                  * Proxy arp feature have been extended to allow, ARP
2171                  * replies back to the same interface, to support
2172                  * Private VLAN switch technologies. See arp.c.
2173                  */
2174                 if (out_dev == in_dev &&
2175                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2176                         err = -EINVAL;
2177                         goto cleanup;
2178                 }
2179         }
2180
2181         rth = rt_dst_alloc(out_dev->dev,
2182                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2183                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2184         if (!rth) {
2185                 err = -ENOBUFS;
2186                 goto cleanup;
2187         }
2188
2189         rth->rt_key_dst = daddr;
2190         rth->rt_key_src = saddr;
2191         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2192         rth->rt_flags = flags;
2193         rth->rt_type = res->type;
2194         rth->rt_key_tos = tos;
2195         rth->rt_dst     = daddr;
2196         rth->rt_src     = saddr;
2197         rth->rt_route_iif = in_dev->dev->ifindex;
2198         rth->rt_iif     = in_dev->dev->ifindex;
2199         rth->rt_oif     = 0;
2200         rth->rt_mark    = skb->mark;
2201         rth->rt_gateway = daddr;
2202         rth->rt_spec_dst= spec_dst;
2203         rth->rt_peer_genid = 0;
2204         rth->peer = NULL;
2205         rth->fi = NULL;
2206
2207         rth->dst.input = ip_forward;
2208         rth->dst.output = ip_output;
2209
2210         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2211
2212         *result = rth;
2213         err = 0;
2214  cleanup:
2215         return err;
2216 }
2217
2218 static int ip_mkroute_input(struct sk_buff *skb,
2219                             struct fib_result *res,
2220                             const struct flowi4 *fl4,
2221                             struct in_device *in_dev,
2222                             __be32 daddr, __be32 saddr, u32 tos)
2223 {
2224         struct rtable* rth = NULL;
2225         int err;
2226         unsigned hash;
2227
2228 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2229         if (res->fi && res->fi->fib_nhs > 1)
2230                 fib_select_multipath(res);
2231 #endif
2232
2233         /* create a routing cache entry */
2234         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2235         if (err)
2236                 return err;
2237
2238         /* put it into the cache */
2239         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2240                        rt_genid(dev_net(rth->dst.dev)));
2241         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2242         if (IS_ERR(rth))
2243                 return PTR_ERR(rth);
2244         return 0;
2245 }
2246
2247 /*
2248  *      NOTE. We drop all the packets that has local source
2249  *      addresses, because every properly looped back packet
2250  *      must have correct destination already attached by output routine.
2251  *
2252  *      Such approach solves two big problems:
2253  *      1. Not simplex devices are handled properly.
2254  *      2. IP spoofing attempts are filtered with 100% of guarantee.
2255  *      called with rcu_read_lock()
2256  */
2257
2258 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2259                                u8 tos, struct net_device *dev)
2260 {
2261         struct fib_result res;
2262         struct in_device *in_dev = __in_dev_get_rcu(dev);
2263         struct flowi4   fl4;
2264         unsigned        flags = 0;
2265         u32             itag = 0;
2266         struct rtable * rth;
2267         unsigned        hash;
2268         __be32          spec_dst;
2269         int             err = -EINVAL;
2270         struct net    * net = dev_net(dev);
2271
2272         /* IP on this device is disabled. */
2273
2274         if (!in_dev)
2275                 goto out;
2276
2277         /* Check for the most weird martians, which can be not detected
2278            by fib_lookup.
2279          */
2280
2281         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2282             ipv4_is_loopback(saddr))
2283                 goto martian_source;
2284
2285         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2286                 goto brd_input;
2287
2288         /* Accept zero addresses only to limited broadcast;
2289          * I even do not know to fix it or not. Waiting for complains :-)
2290          */
2291         if (ipv4_is_zeronet(saddr))
2292                 goto martian_source;
2293
2294         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2295                 goto martian_destination;
2296
2297         /*
2298          *      Now we are ready to route packet.
2299          */
2300         fl4.flowi4_oif = 0;
2301         fl4.flowi4_iif = dev->ifindex;
2302         fl4.flowi4_mark = skb->mark;
2303         fl4.flowi4_tos = tos;
2304         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2305         fl4.daddr = daddr;
2306         fl4.saddr = saddr;
2307         err = fib_lookup(net, &fl4, &res);
2308         if (err != 0) {
2309                 if (!IN_DEV_FORWARD(in_dev))
2310                         goto e_hostunreach;
2311                 goto no_route;
2312         }
2313
2314         RT_CACHE_STAT_INC(in_slow_tot);
2315
2316         if (res.type == RTN_BROADCAST)
2317                 goto brd_input;
2318
2319         if (res.type == RTN_LOCAL) {
2320                 err = fib_validate_source(skb, saddr, daddr, tos,
2321                                           net->loopback_dev->ifindex,
2322                                           dev, &spec_dst, &itag);
2323                 if (err < 0)
2324                         goto martian_source_keep_err;
2325                 if (err)
2326                         flags |= RTCF_DIRECTSRC;
2327                 spec_dst = daddr;
2328                 goto local_input;
2329         }
2330
2331         if (!IN_DEV_FORWARD(in_dev))
2332                 goto e_hostunreach;
2333         if (res.type != RTN_UNICAST)
2334                 goto martian_destination;
2335
2336         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2337 out:    return err;
2338
2339 brd_input:
2340         if (skb->protocol != htons(ETH_P_IP))
2341                 goto e_inval;
2342
2343         if (ipv4_is_zeronet(saddr))
2344                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2345         else {
2346                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2347                                           &itag);
2348                 if (err < 0)
2349                         goto martian_source_keep_err;
2350                 if (err)
2351                         flags |= RTCF_DIRECTSRC;
2352         }
2353         flags |= RTCF_BROADCAST;
2354         res.type = RTN_BROADCAST;
2355         RT_CACHE_STAT_INC(in_brd);
2356
2357 local_input:
2358         rth = rt_dst_alloc(net->loopback_dev,
2359                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2360         if (!rth)
2361                 goto e_nobufs;
2362
2363         rth->dst.input= ip_local_deliver;
2364         rth->dst.output= ip_rt_bug;
2365 #ifdef CONFIG_IP_ROUTE_CLASSID
2366         rth->dst.tclassid = itag;
2367 #endif
2368
2369         rth->rt_key_dst = daddr;
2370         rth->rt_key_src = saddr;
2371         rth->rt_genid = rt_genid(net);
2372         rth->rt_flags   = flags|RTCF_LOCAL;
2373         rth->rt_type    = res.type;
2374         rth->rt_key_tos = tos;
2375         rth->rt_dst     = daddr;
2376         rth->rt_src     = saddr;
2377 #ifdef CONFIG_IP_ROUTE_CLASSID
2378         rth->dst.tclassid = itag;
2379 #endif
2380         rth->rt_route_iif = dev->ifindex;
2381         rth->rt_iif     = dev->ifindex;
2382         rth->rt_oif     = 0;
2383         rth->rt_mark    = skb->mark;
2384         rth->rt_gateway = daddr;
2385         rth->rt_spec_dst= spec_dst;
2386         rth->rt_peer_genid = 0;
2387         rth->peer = NULL;
2388         rth->fi = NULL;
2389         if (res.type == RTN_UNREACHABLE) {
2390                 rth->dst.input= ip_error;
2391                 rth->dst.error= -err;
2392                 rth->rt_flags   &= ~RTCF_LOCAL;
2393         }
2394         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2395         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2396         err = 0;
2397         if (IS_ERR(rth))
2398                 err = PTR_ERR(rth);
2399         goto out;
2400
2401 no_route:
2402         RT_CACHE_STAT_INC(in_no_route);
2403         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2404         res.type = RTN_UNREACHABLE;
2405         if (err == -ESRCH)
2406                 err = -ENETUNREACH;
2407         goto local_input;
2408
2409         /*
2410          *      Do not cache martian addresses: they should be logged (RFC1812)
2411          */
2412 martian_destination:
2413         RT_CACHE_STAT_INC(in_martian_dst);
2414 #ifdef CONFIG_IP_ROUTE_VERBOSE
2415         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2416                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2417                         &daddr, &saddr, dev->name);
2418 #endif
2419
2420 e_hostunreach:
2421         err = -EHOSTUNREACH;
2422         goto out;
2423
2424 e_inval:
2425         err = -EINVAL;
2426         goto out;
2427
2428 e_nobufs:
2429         err = -ENOBUFS;
2430         goto out;
2431
2432 martian_source:
2433         err = -EINVAL;
2434 martian_source_keep_err:
2435         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2436         goto out;
2437 }
2438
2439 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2440                            u8 tos, struct net_device *dev, bool noref)
2441 {
2442         struct rtable * rth;
2443         unsigned        hash;
2444         int iif = dev->ifindex;
2445         struct net *net;
2446         int res;
2447
2448         net = dev_net(dev);
2449
2450         rcu_read_lock();
2451
2452         if (!rt_caching(net))
2453                 goto skip_cache;
2454
2455         tos &= IPTOS_RT_MASK;
2456         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2457
2458         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2459              rth = rcu_dereference(rth->dst.rt_next)) {
2460                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2461                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2462                      (rth->rt_route_iif ^ iif) |
2463                      (rth->rt_key_tos ^ tos)) == 0 &&
2464                     rth->rt_mark == skb->mark &&
2465                     net_eq(dev_net(rth->dst.dev), net) &&
2466                     !rt_is_expired(rth)) {
2467                         ipv4_validate_peer(rth);
2468                         if (noref) {
2469                                 dst_use_noref(&rth->dst, jiffies);
2470                                 skb_dst_set_noref(skb, &rth->dst);
2471                         } else {
2472                                 dst_use(&rth->dst, jiffies);
2473                                 skb_dst_set(skb, &rth->dst);
2474                         }
2475                         RT_CACHE_STAT_INC(in_hit);
2476                         rcu_read_unlock();
2477                         return 0;
2478                 }
2479                 RT_CACHE_STAT_INC(in_hlist_search);
2480         }
2481
2482 skip_cache:
2483         /* Multicast recognition logic is moved from route cache to here.
2484            The problem was that too many Ethernet cards have broken/missing
2485            hardware multicast filters :-( As result the host on multicasting
2486            network acquires a lot of useless route cache entries, sort of
2487            SDR messages from all the world. Now we try to get rid of them.
2488            Really, provided software IP multicast filter is organized
2489            reasonably (at least, hashed), it does not result in a slowdown
2490            comparing with route cache reject entries.
2491            Note, that multicast routers are not affected, because
2492            route cache entry is created eventually.
2493          */
2494         if (ipv4_is_multicast(daddr)) {
2495                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2496
2497                 if (in_dev) {
2498                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2499                                                   ip_hdr(skb)->protocol);
2500                         if (our
2501 #ifdef CONFIG_IP_MROUTE
2502                                 ||
2503                             (!ipv4_is_local_multicast(daddr) &&
2504                              IN_DEV_MFORWARD(in_dev))
2505 #endif
2506                            ) {
2507                                 int res = ip_route_input_mc(skb, daddr, saddr,
2508                                                             tos, dev, our);
2509                                 rcu_read_unlock();
2510                                 return res;
2511                         }
2512                 }
2513                 rcu_read_unlock();
2514                 return -EINVAL;
2515         }
2516         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2517         rcu_read_unlock();
2518         return res;
2519 }
2520 EXPORT_SYMBOL(ip_route_input_common);
2521
2522 /* called with rcu_read_lock() */
2523 static struct rtable *__mkroute_output(const struct fib_result *res,
2524                                        const struct flowi4 *fl4,
2525                                        __be32 orig_daddr, __be32 orig_saddr,
2526                                        int orig_oif, __u8 orig_rtos,
2527                                        struct net_device *dev_out,
2528                                        unsigned int flags)
2529 {
2530         struct fib_info *fi = res->fi;
2531         struct in_device *in_dev;
2532         u16 type = res->type;
2533         struct rtable *rth;
2534
2535         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2536                 return ERR_PTR(-EINVAL);
2537
2538         if (ipv4_is_lbcast(fl4->daddr))
2539                 type = RTN_BROADCAST;
2540         else if (ipv4_is_multicast(fl4->daddr))
2541                 type = RTN_MULTICAST;
2542         else if (ipv4_is_zeronet(fl4->daddr))
2543                 return ERR_PTR(-EINVAL);
2544
2545         if (dev_out->flags & IFF_LOOPBACK)
2546                 flags |= RTCF_LOCAL;
2547
2548         in_dev = __in_dev_get_rcu(dev_out);
2549         if (!in_dev)
2550                 return ERR_PTR(-EINVAL);
2551
2552         if (type == RTN_BROADCAST) {
2553                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2554                 fi = NULL;
2555         } else if (type == RTN_MULTICAST) {
2556                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2557                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2558                                      fl4->flowi4_proto))
2559                         flags &= ~RTCF_LOCAL;
2560                 /* If multicast route do not exist use
2561                  * default one, but do not gateway in this case.
2562                  * Yes, it is hack.
2563                  */
2564                 if (fi && res->prefixlen < 4)
2565                         fi = NULL;
2566         }
2567
2568         rth = rt_dst_alloc(dev_out,
2569                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2570                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2571         if (!rth)
2572                 return ERR_PTR(-ENOBUFS);
2573
2574         rth->dst.output = ip_output;
2575
2576         rth->rt_key_dst = orig_daddr;
2577         rth->rt_key_src = orig_saddr;
2578         rth->rt_genid = rt_genid(dev_net(dev_out));
2579         rth->rt_flags   = flags;
2580         rth->rt_type    = type;
2581         rth->rt_key_tos = orig_rtos;
2582         rth->rt_dst     = fl4->daddr;
2583         rth->rt_src     = fl4->saddr;
2584         rth->rt_route_iif = 0;
2585         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2586         rth->rt_oif     = orig_oif;
2587         rth->rt_mark    = fl4->flowi4_mark;
2588         rth->rt_gateway = fl4->daddr;
2589         rth->rt_spec_dst= fl4->saddr;
2590         rth->rt_peer_genid = 0;
2591         rth->peer = NULL;
2592         rth->fi = NULL;
2593
2594         RT_CACHE_STAT_INC(out_slow_tot);
2595
2596         if (flags & RTCF_LOCAL) {
2597                 rth->dst.input = ip_local_deliver;
2598                 rth->rt_spec_dst = fl4->daddr;
2599         }
2600         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2601                 rth->rt_spec_dst = fl4->saddr;
2602                 if (flags & RTCF_LOCAL &&
2603                     !(dev_out->flags & IFF_LOOPBACK)) {
2604                         rth->dst.output = ip_mc_output;
2605                         RT_CACHE_STAT_INC(out_slow_mc);
2606                 }
2607 #ifdef CONFIG_IP_MROUTE
2608                 if (type == RTN_MULTICAST) {
2609                         if (IN_DEV_MFORWARD(in_dev) &&
2610                             !ipv4_is_local_multicast(fl4->daddr)) {
2611                                 rth->dst.input = ip_mr_input;
2612                                 rth->dst.output = ip_mc_output;
2613                         }
2614                 }
2615 #endif
2616         }
2617
2618         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2619
2620         return rth;
2621 }
2622
2623 /*
2624  * Major route resolver routine.
2625  * called with rcu_read_lock();
2626  */
2627
2628 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2629 {
2630         struct net_device *dev_out = NULL;
2631         __u8 tos = RT_FL_TOS(fl4);
2632         unsigned int flags = 0;
2633         struct fib_result res;
2634         struct rtable *rth;
2635         __be32 orig_daddr;
2636         __be32 orig_saddr;
2637         int orig_oif;
2638
2639         res.fi          = NULL;
2640 #ifdef CONFIG_IP_MULTIPLE_TABLES
2641         res.r           = NULL;
2642 #endif
2643
2644         orig_daddr = fl4->daddr;
2645         orig_saddr = fl4->saddr;
2646         orig_oif = fl4->flowi4_oif;
2647
2648         fl4->flowi4_iif = net->loopback_dev->ifindex;
2649         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2650         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2651                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2652
2653         rcu_read_lock();
2654         if (fl4->saddr) {
2655                 rth = ERR_PTR(-EINVAL);
2656                 if (ipv4_is_multicast(fl4->saddr) ||
2657                     ipv4_is_lbcast(fl4->saddr) ||
2658                     ipv4_is_zeronet(fl4->saddr))
2659                         goto out;
2660
2661                 /* I removed check for oif == dev_out->oif here.
2662                    It was wrong for two reasons:
2663                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2664                       is assigned to multiple interfaces.
2665                    2. Moreover, we are allowed to send packets with saddr
2666                       of another iface. --ANK
2667                  */
2668
2669                 if (fl4->flowi4_oif == 0 &&
2670                     (ipv4_is_multicast(fl4->daddr) ||
2671                      ipv4_is_lbcast(fl4->daddr))) {
2672                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2673                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2674                         if (dev_out == NULL)
2675                                 goto out;
2676
2677                         /* Special hack: user can direct multicasts
2678                            and limited broadcast via necessary interface
2679                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2680                            This hack is not just for fun, it allows
2681                            vic,vat and friends to work.
2682                            They bind socket to loopback, set ttl to zero
2683                            and expect that it will work.
2684                            From the viewpoint of routing cache they are broken,
2685                            because we are not allowed to build multicast path
2686                            with loopback source addr (look, routing cache
2687                            cannot know, that ttl is zero, so that packet
2688                            will not leave this host and route is valid).
2689                            Luckily, this hack is good workaround.
2690                          */
2691
2692                         fl4->flowi4_oif = dev_out->ifindex;
2693                         goto make_route;
2694                 }
2695
2696                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2697                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2698                         if (!__ip_dev_find(net, fl4->saddr, false))
2699                                 goto out;
2700                 }
2701         }
2702
2703
2704         if (fl4->flowi4_oif) {
2705                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2706                 rth = ERR_PTR(-ENODEV);
2707                 if (dev_out == NULL)
2708                         goto out;
2709
2710                 /* RACE: Check return value of inet_select_addr instead. */
2711                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2712                         rth = ERR_PTR(-ENETUNREACH);
2713                         goto out;
2714                 }
2715                 if (ipv4_is_local_multicast(fl4->daddr) ||
2716                     ipv4_is_lbcast(fl4->daddr)) {
2717                         if (!fl4->saddr)
2718                                 fl4->saddr = inet_select_addr(dev_out, 0,
2719                                                               RT_SCOPE_LINK);
2720                         goto make_route;
2721                 }
2722                 if (fl4->saddr) {
2723                         if (ipv4_is_multicast(fl4->daddr))
2724                                 fl4->saddr = inet_select_addr(dev_out, 0,
2725                                                               fl4->flowi4_scope);
2726                         else if (!fl4->daddr)
2727                                 fl4->saddr = inet_select_addr(dev_out, 0,
2728                                                               RT_SCOPE_HOST);
2729                 }
2730         }
2731
2732         if (!fl4->daddr) {
2733                 fl4->daddr = fl4->saddr;
2734                 if (!fl4->daddr)
2735                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2736                 dev_out = net->loopback_dev;
2737                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2738                 res.type = RTN_LOCAL;
2739                 flags |= RTCF_LOCAL;
2740                 goto make_route;
2741         }
2742
2743         if (fib_lookup(net, fl4, &res)) {
2744                 res.fi = NULL;
2745                 if (fl4->flowi4_oif) {
2746                         /* Apparently, routing tables are wrong. Assume,
2747                            that the destination is on link.
2748
2749                            WHY? DW.
2750                            Because we are allowed to send to iface
2751                            even if it has NO routes and NO assigned
2752                            addresses. When oif is specified, routing
2753                            tables are looked up with only one purpose:
2754                            to catch if destination is gatewayed, rather than
2755                            direct. Moreover, if MSG_DONTROUTE is set,
2756                            we send packet, ignoring both routing tables
2757                            and ifaddr state. --ANK
2758
2759
2760                            We could make it even if oif is unknown,
2761                            likely IPv6, but we do not.
2762                          */
2763
2764                         if (fl4->saddr == 0)
2765                                 fl4->saddr = inet_select_addr(dev_out, 0,
2766                                                               RT_SCOPE_LINK);
2767                         res.type = RTN_UNICAST;
2768                         goto make_route;
2769                 }
2770                 rth = ERR_PTR(-ENETUNREACH);
2771                 goto out;
2772         }
2773
2774         if (res.type == RTN_LOCAL) {
2775                 if (!fl4->saddr) {
2776                         if (res.fi->fib_prefsrc)
2777                                 fl4->saddr = res.fi->fib_prefsrc;
2778                         else
2779                                 fl4->saddr = fl4->daddr;
2780                 }
2781                 dev_out = net->loopback_dev;
2782                 fl4->flowi4_oif = dev_out->ifindex;
2783                 res.fi = NULL;
2784                 flags |= RTCF_LOCAL;
2785                 goto make_route;
2786         }
2787
2788 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2789         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2790                 fib_select_multipath(&res);
2791         else
2792 #endif
2793         if (!res.prefixlen &&
2794             res.table->tb_num_default > 1 &&
2795             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2796                 fib_select_default(&res);
2797
2798         if (!fl4->saddr)
2799                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2800
2801         dev_out = FIB_RES_DEV(res);
2802         fl4->flowi4_oif = dev_out->ifindex;
2803
2804
2805 make_route:
2806         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2807                                tos, dev_out, flags);
2808         if (!IS_ERR(rth)) {
2809                 unsigned int hash;
2810
2811                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2812                                rt_genid(dev_net(dev_out)));
2813                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2814         }
2815
2816 out:
2817         rcu_read_unlock();
2818         return rth;
2819 }
2820
2821 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2822 {
2823         struct rtable *rth;
2824         unsigned int hash;
2825
2826         if (!rt_caching(net))
2827                 goto slow_output;
2828
2829         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2830
2831         rcu_read_lock_bh();
2832         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2833                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2834                 if (rth->rt_key_dst == flp4->daddr &&
2835                     rth->rt_key_src == flp4->saddr &&
2836                     rt_is_output_route(rth) &&
2837                     rth->rt_oif == flp4->flowi4_oif &&
2838                     rth->rt_mark == flp4->flowi4_mark &&
2839                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2840                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2841                     net_eq(dev_net(rth->dst.dev), net) &&
2842                     !rt_is_expired(rth)) {
2843                         ipv4_validate_peer(rth);
2844                         dst_use(&rth->dst, jiffies);
2845                         RT_CACHE_STAT_INC(out_hit);
2846                         rcu_read_unlock_bh();
2847                         if (!flp4->saddr)
2848                                 flp4->saddr = rth->rt_src;
2849                         if (!flp4->daddr)
2850                                 flp4->daddr = rth->rt_dst;
2851                         return rth;
2852                 }
2853                 RT_CACHE_STAT_INC(out_hlist_search);
2854         }
2855         rcu_read_unlock_bh();
2856
2857 slow_output:
2858         return ip_route_output_slow(net, flp4);
2859 }
2860 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2861
2862 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2863 {
2864         return NULL;
2865 }
2866
2867 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2868 {
2869         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2870
2871         return mtu ? : dst->dev->mtu;
2872 }
2873
2874 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2875 {
2876 }
2877
2878 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2879                                           unsigned long old)
2880 {
2881         return NULL;
2882 }
2883
2884 static struct dst_ops ipv4_dst_blackhole_ops = {
2885         .family                 =       AF_INET,
2886         .protocol               =       cpu_to_be16(ETH_P_IP),
2887         .destroy                =       ipv4_dst_destroy,
2888         .check                  =       ipv4_blackhole_dst_check,
2889         .mtu                    =       ipv4_blackhole_mtu,
2890         .default_advmss         =       ipv4_default_advmss,
2891         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2892         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2893         .neigh_lookup           =       ipv4_neigh_lookup,
2894 };
2895
2896 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2897 {
2898         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2899         struct rtable *ort = (struct rtable *) dst_orig;
2900
2901         if (rt) {
2902                 struct dst_entry *new = &rt->dst;
2903
2904                 new->__use = 1;
2905                 new->input = dst_discard;
2906                 new->output = dst_discard;
2907                 dst_copy_metrics(new, &ort->dst);
2908
2909                 new->dev = ort->dst.dev;
2910                 if (new->dev)
2911                         dev_hold(new->dev);
2912
2913                 rt->rt_key_dst = ort->rt_key_dst;
2914                 rt->rt_key_src = ort->rt_key_src;
2915                 rt->rt_key_tos = ort->rt_key_tos;
2916                 rt->rt_route_iif = ort->rt_route_iif;
2917                 rt->rt_iif = ort->rt_iif;
2918                 rt->rt_oif = ort->rt_oif;
2919                 rt->rt_mark = ort->rt_mark;
2920
2921                 rt->rt_genid = rt_genid(net);
2922                 rt->rt_flags = ort->rt_flags;
2923                 rt->rt_type = ort->rt_type;
2924                 rt->rt_dst = ort->rt_dst;
2925                 rt->rt_src = ort->rt_src;
2926                 rt->rt_gateway = ort->rt_gateway;
2927                 rt->rt_spec_dst = ort->rt_spec_dst;
2928                 rt->peer = ort->peer;
2929                 if (rt->peer)
2930                         atomic_inc(&rt->peer->refcnt);
2931                 rt->fi = ort->fi;
2932                 if (rt->fi)
2933                         atomic_inc(&rt->fi->fib_clntref);
2934
2935                 dst_free(new);
2936         }
2937
2938         dst_release(dst_orig);
2939
2940         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2941 }
2942
2943 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2944                                     struct sock *sk)
2945 {
2946         struct rtable *rt = __ip_route_output_key(net, flp4);
2947
2948         if (IS_ERR(rt))
2949                 return rt;
2950
2951         if (flp4->flowi4_proto)
2952                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2953                                                    flowi4_to_flowi(flp4),
2954                                                    sk, 0);
2955
2956         return rt;
2957 }
2958 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2959
2960 static int rt_fill_info(struct net *net,
2961                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2962                         int nowait, unsigned int flags)
2963 {
2964         struct rtable *rt = skb_rtable(skb);
2965         struct rtmsg *r;
2966         struct nlmsghdr *nlh;
2967         unsigned long expires = 0;
2968         const struct inet_peer *peer = rt->peer;
2969         u32 id = 0, ts = 0, tsage = 0, error;
2970
2971         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2972         if (nlh == NULL)
2973                 return -EMSGSIZE;
2974
2975         r = nlmsg_data(nlh);
2976         r->rtm_family    = AF_INET;
2977         r->rtm_dst_len  = 32;
2978         r->rtm_src_len  = 0;
2979         r->rtm_tos      = rt->rt_key_tos;
2980         r->rtm_table    = RT_TABLE_MAIN;
2981         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2982         r->rtm_type     = rt->rt_type;
2983         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2984         r->rtm_protocol = RTPROT_UNSPEC;
2985         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2986         if (rt->rt_flags & RTCF_NOTIFY)
2987                 r->rtm_flags |= RTM_F_NOTIFY;
2988
2989         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2990
2991         if (rt->rt_key_src) {
2992                 r->rtm_src_len = 32;
2993                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2994         }
2995         if (rt->dst.dev)
2996                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2997 #ifdef CONFIG_IP_ROUTE_CLASSID
2998         if (rt->dst.tclassid)
2999                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
3000 #endif
3001         if (rt_is_input_route(rt))
3002                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
3003         else if (rt->rt_src != rt->rt_key_src)
3004                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
3005
3006         if (rt->rt_dst != rt->rt_gateway)
3007                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
3008
3009         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3010                 goto nla_put_failure;
3011
3012         if (rt->rt_mark)
3013                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
3014
3015         error = rt->dst.error;
3016         if (peer) {
3017                 inet_peer_refcheck(rt->peer);
3018                 id = atomic_read(&peer->ip_id_count) & 0xffff;
3019                 if (peer->tcp_ts_stamp) {
3020                         ts = peer->tcp_ts;
3021                         tsage = get_seconds() - peer->tcp_ts_stamp;
3022                 }
3023                 expires = ACCESS_ONCE(peer->pmtu_expires);
3024                 if (expires) {
3025                         if (time_before(jiffies, expires))
3026                                 expires -= jiffies;
3027                         else
3028                                 expires = 0;
3029                 }
3030         }
3031
3032         if (rt_is_input_route(rt)) {
3033 #ifdef CONFIG_IP_MROUTE
3034                 __be32 dst = rt->rt_dst;
3035
3036                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3037                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3038                         int err = ipmr_get_route(net, skb,
3039                                                  rt->rt_src, rt->rt_dst,
3040                                                  r, nowait);
3041                         if (err <= 0) {
3042                                 if (!nowait) {
3043                                         if (err == 0)
3044                                                 return 0;
3045                                         goto nla_put_failure;
3046                                 } else {
3047                                         if (err == -EMSGSIZE)
3048                                                 goto nla_put_failure;
3049                                         error = err;
3050                                 }
3051                         }
3052                 } else
3053 #endif
3054                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
3055         }
3056
3057         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3058                                expires, error) < 0)
3059                 goto nla_put_failure;
3060
3061         return nlmsg_end(skb, nlh);
3062
3063 nla_put_failure:
3064         nlmsg_cancel(skb, nlh);
3065         return -EMSGSIZE;
3066 }
3067
3068 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
3069 {
3070         struct net *net = sock_net(in_skb->sk);
3071         struct rtmsg *rtm;
3072         struct nlattr *tb[RTA_MAX+1];
3073         struct rtable *rt = NULL;
3074         __be32 dst = 0;
3075         __be32 src = 0;
3076         u32 iif;
3077         int err;
3078         int mark;
3079         struct sk_buff *skb;
3080
3081         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3082         if (err < 0)
3083                 goto errout;
3084
3085         rtm = nlmsg_data(nlh);
3086
3087         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3088         if (skb == NULL) {
3089                 err = -ENOBUFS;
3090                 goto errout;
3091         }
3092
3093         /* Reserve room for dummy headers, this skb can pass
3094            through good chunk of routing engine.
3095          */
3096         skb_reset_mac_header(skb);
3097         skb_reset_network_header(skb);
3098
3099         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3100         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3101         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3102
3103         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3104         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3105         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3106         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3107
3108         if (iif) {
3109                 struct net_device *dev;
3110
3111                 dev = __dev_get_by_index(net, iif);
3112                 if (dev == NULL) {
3113                         err = -ENODEV;
3114                         goto errout_free;
3115                 }
3116
3117                 skb->protocol   = htons(ETH_P_IP);
3118                 skb->dev        = dev;
3119                 skb->mark       = mark;
3120                 local_bh_disable();
3121                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3122                 local_bh_enable();
3123
3124                 rt = skb_rtable(skb);
3125                 if (err == 0 && rt->dst.error)
3126                         err = -rt->dst.error;
3127         } else {
3128                 struct flowi4 fl4 = {
3129                         .daddr = dst,
3130                         .saddr = src,
3131                         .flowi4_tos = rtm->rtm_tos,
3132                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3133                         .flowi4_mark = mark,
3134                 };
3135                 rt = ip_route_output_key(net, &fl4);
3136
3137                 err = 0;
3138                 if (IS_ERR(rt))
3139                         err = PTR_ERR(rt);
3140         }
3141
3142         if (err)
3143                 goto errout_free;
3144
3145         skb_dst_set(skb, &rt->dst);
3146         if (rtm->rtm_flags & RTM_F_NOTIFY)
3147                 rt->rt_flags |= RTCF_NOTIFY;
3148
3149         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3150                            RTM_NEWROUTE, 0, 0);
3151         if (err <= 0)
3152                 goto errout_free;
3153
3154         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3155 errout:
3156         return err;
3157
3158 errout_free:
3159         kfree_skb(skb);
3160         goto errout;
3161 }
3162
3163 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3164 {
3165         struct rtable *rt;
3166         int h, s_h;
3167         int idx, s_idx;
3168         struct net *net;
3169
3170         net = sock_net(skb->sk);
3171
3172         s_h = cb->args[0];
3173         if (s_h < 0)
3174                 s_h = 0;
3175         s_idx = idx = cb->args[1];
3176         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3177                 if (!rt_hash_table[h].chain)
3178                         continue;
3179                 rcu_read_lock_bh();
3180                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3181                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3182                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3183                                 continue;
3184                         if (rt_is_expired(rt))
3185                                 continue;
3186                         skb_dst_set_noref(skb, &rt->dst);
3187                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3188                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3189                                          1, NLM_F_MULTI) <= 0) {
3190                                 skb_dst_drop(skb);
3191                                 rcu_read_unlock_bh();
3192                                 goto done;
3193                         }
3194                         skb_dst_drop(skb);
3195                 }
3196                 rcu_read_unlock_bh();
3197         }
3198
3199 done:
3200         cb->args[0] = h;
3201         cb->args[1] = idx;
3202         return skb->len;
3203 }
3204
3205 void ip_rt_multicast_event(struct in_device *in_dev)
3206 {
3207         rt_cache_flush(dev_net(in_dev->dev), 0);
3208 }
3209
3210 #ifdef CONFIG_SYSCTL
3211 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3212                                         void __user *buffer,
3213                                         size_t *lenp, loff_t *ppos)
3214 {
3215         if (write) {
3216                 int flush_delay;
3217                 ctl_table ctl;
3218                 struct net *net;
3219
3220                 memcpy(&ctl, __ctl, sizeof(ctl));
3221                 ctl.data = &flush_delay;
3222                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3223
3224                 net = (struct net *)__ctl->extra1;
3225                 rt_cache_flush(net, flush_delay);
3226                 return 0;
3227         }
3228
3229         return -EINVAL;
3230 }
3231
3232 static ctl_table ipv4_route_table[] = {
3233         {
3234                 .procname       = "gc_thresh",
3235                 .data           = &ipv4_dst_ops.gc_thresh,
3236                 .maxlen         = sizeof(int),
3237                 .mode           = 0644,
3238                 .proc_handler   = proc_dointvec,
3239         },
3240         {
3241                 .procname       = "max_size",
3242                 .data           = &ip_rt_max_size,
3243                 .maxlen         = sizeof(int),
3244                 .mode           = 0644,
3245                 .proc_handler   = proc_dointvec,
3246         },
3247         {
3248                 /*  Deprecated. Use gc_min_interval_ms */
3249
3250                 .procname       = "gc_min_interval",
3251                 .data           = &ip_rt_gc_min_interval,
3252                 .maxlen         = sizeof(int),
3253                 .mode           = 0644,
3254                 .proc_handler   = proc_dointvec_jiffies,
3255         },
3256         {
3257                 .procname       = "gc_min_interval_ms",
3258                 .data           = &ip_rt_gc_min_interval,
3259                 .maxlen         = sizeof(int),
3260                 .mode           = 0644,
3261                 .proc_handler   = proc_dointvec_ms_jiffies,
3262         },
3263         {
3264                 .procname       = "gc_timeout",
3265                 .data           = &ip_rt_gc_timeout,
3266                 .maxlen         = sizeof(int),
3267                 .mode           = 0644,
3268                 .proc_handler   = proc_dointvec_jiffies,
3269         },
3270         {
3271                 .procname       = "gc_interval",
3272                 .data           = &ip_rt_gc_interval,
3273                 .maxlen         = sizeof(int),
3274                 .mode           = 0644,
3275                 .proc_handler   = proc_dointvec_jiffies,
3276         },
3277         {
3278                 .procname       = "redirect_load",
3279                 .data           = &ip_rt_redirect_load,
3280                 .maxlen         = sizeof(int),
3281                 .mode           = 0644,
3282                 .proc_handler   = proc_dointvec,
3283         },
3284         {
3285                 .procname       = "redirect_number",
3286                 .data           = &ip_rt_redirect_number,
3287                 .maxlen         = sizeof(int),
3288                 .mode           = 0644,
3289                 .proc_handler   = proc_dointvec,
3290         },
3291         {
3292                 .procname       = "redirect_silence",
3293                 .data           = &ip_rt_redirect_silence,
3294                 .maxlen         = sizeof(int),
3295                 .mode           = 0644,
3296                 .proc_handler   = proc_dointvec,
3297         },
3298         {
3299                 .procname       = "error_cost",
3300                 .data           = &ip_rt_error_cost,
3301                 .maxlen         = sizeof(int),
3302                 .mode           = 0644,
3303                 .proc_handler   = proc_dointvec,
3304         },
3305         {
3306                 .procname       = "error_burst",
3307                 .data           = &ip_rt_error_burst,
3308                 .maxlen         = sizeof(int),
3309                 .mode           = 0644,
3310                 .proc_handler   = proc_dointvec,
3311         },
3312         {
3313                 .procname       = "gc_elasticity",
3314                 .data           = &ip_rt_gc_elasticity,
3315                 .maxlen         = sizeof(int),
3316                 .mode           = 0644,
3317                 .proc_handler   = proc_dointvec,
3318         },
3319         {
3320                 .procname       = "mtu_expires",
3321                 .data           = &ip_rt_mtu_expires,
3322                 .maxlen         = sizeof(int),
3323                 .mode           = 0644,
3324                 .proc_handler   = proc_dointvec_jiffies,
3325         },
3326         {
3327                 .procname       = "min_pmtu",
3328                 .data           = &ip_rt_min_pmtu,
3329                 .maxlen         = sizeof(int),
3330                 .mode           = 0644,
3331                 .proc_handler   = proc_dointvec,
3332         },
3333         {
3334                 .procname       = "min_adv_mss",
3335                 .data           = &ip_rt_min_advmss,
3336                 .maxlen         = sizeof(int),
3337                 .mode           = 0644,
3338                 .proc_handler   = proc_dointvec,
3339         },
3340         { }
3341 };
3342
3343 static struct ctl_table empty[1];
3344
3345 static struct ctl_table ipv4_skeleton[] =
3346 {
3347         { .procname = "route",
3348           .mode = 0555, .child = ipv4_route_table},
3349         { .procname = "neigh",
3350           .mode = 0555, .child = empty},
3351         { }
3352 };
3353
3354 static __net_initdata struct ctl_path ipv4_path[] = {
3355         { .procname = "net", },
3356         { .procname = "ipv4", },
3357         { },
3358 };
3359
3360 static struct ctl_table ipv4_route_flush_table[] = {
3361         {
3362                 .procname       = "flush",
3363                 .maxlen         = sizeof(int),
3364                 .mode           = 0200,
3365                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3366         },
3367         { },
3368 };
3369
3370 static __net_initdata struct ctl_path ipv4_route_path[] = {
3371         { .procname = "net", },
3372         { .procname = "ipv4", },
3373         { .procname = "route", },
3374         { },
3375 };
3376
3377 static __net_init int sysctl_route_net_init(struct net *net)
3378 {
3379         struct ctl_table *tbl;
3380
3381         tbl = ipv4_route_flush_table;
3382         if (!net_eq(net, &init_net)) {
3383                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3384                 if (tbl == NULL)
3385                         goto err_dup;
3386         }
3387         tbl[0].extra1 = net;
3388
3389         net->ipv4.route_hdr =
3390                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3391         if (net->ipv4.route_hdr == NULL)
3392                 goto err_reg;
3393         return 0;
3394
3395 err_reg:
3396         if (tbl != ipv4_route_flush_table)
3397                 kfree(tbl);
3398 err_dup:
3399         return -ENOMEM;
3400 }
3401
3402 static __net_exit void sysctl_route_net_exit(struct net *net)
3403 {
3404         struct ctl_table *tbl;
3405
3406         tbl = net->ipv4.route_hdr->ctl_table_arg;
3407         unregister_net_sysctl_table(net->ipv4.route_hdr);
3408         BUG_ON(tbl == ipv4_route_flush_table);
3409         kfree(tbl);
3410 }
3411
3412 static __net_initdata struct pernet_operations sysctl_route_ops = {
3413         .init = sysctl_route_net_init,
3414         .exit = sysctl_route_net_exit,
3415 };
3416 #endif
3417
3418 static __net_init int rt_genid_init(struct net *net)
3419 {
3420         get_random_bytes(&net->ipv4.rt_genid,
3421                          sizeof(net->ipv4.rt_genid));
3422         get_random_bytes(&net->ipv4.dev_addr_genid,
3423                          sizeof(net->ipv4.dev_addr_genid));
3424         return 0;
3425 }
3426
3427 static __net_initdata struct pernet_operations rt_genid_ops = {
3428         .init = rt_genid_init,
3429 };
3430
3431
3432 #ifdef CONFIG_IP_ROUTE_CLASSID
3433 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3434 #endif /* CONFIG_IP_ROUTE_CLASSID */
3435
3436 static __initdata unsigned long rhash_entries;
3437 static int __init set_rhash_entries(char *str)
3438 {
3439         if (!str)
3440                 return 0;
3441         rhash_entries = simple_strtoul(str, &str, 0);
3442         return 1;
3443 }
3444 __setup("rhash_entries=", set_rhash_entries);
3445
3446 int __init ip_rt_init(void)
3447 {
3448         int rc = 0;
3449
3450 #ifdef CONFIG_IP_ROUTE_CLASSID
3451         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3452         if (!ip_rt_acct)
3453                 panic("IP: failed to allocate ip_rt_acct\n");
3454 #endif
3455
3456         ipv4_dst_ops.kmem_cachep =
3457                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3458                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3459
3460         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3461
3462         if (dst_entries_init(&ipv4_dst_ops) < 0)
3463                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3464
3465         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3466                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3467
3468         rt_hash_table = (struct rt_hash_bucket *)
3469                 alloc_large_system_hash("IP route cache",
3470                                         sizeof(struct rt_hash_bucket),
3471                                         rhash_entries,
3472                                         (totalram_pages >= 128 * 1024) ?
3473                                         15 : 17,
3474                                         0,
3475                                         &rt_hash_log,
3476                                         &rt_hash_mask,
3477                                         rhash_entries ? 0 : 512 * 1024);
3478         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3479         rt_hash_lock_init();
3480
3481         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3482         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3483
3484         devinet_init();
3485         ip_fib_init();
3486
3487         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3488         expires_ljiffies = jiffies;
3489         schedule_delayed_work(&expires_work,
3490                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3491
3492         if (ip_rt_proc_init())
3493                 printk(KERN_ERR "Unable to create route proc files\n");
3494 #ifdef CONFIG_XFRM
3495         xfrm_init();
3496         xfrm4_init(ip_rt_max_size);
3497 #endif
3498         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3499
3500 #ifdef CONFIG_SYSCTL
3501         register_pernet_subsys(&sysctl_route_ops);
3502 #endif
3503         register_pernet_subsys(&rt_genid_ops);
3504         return rc;
3505 }
3506
3507 #ifdef CONFIG_SYSCTL
3508 /*
3509  * We really need to sanitize the damn ipv4 init order, then all
3510  * this nonsense will go away.
3511  */
3512 void __init ip_static_sysctl_init(void)
3513 {
3514         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3515 }
3516 #endif