ipv4: fix lockdep splat in rt_cache_seq_show
net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111 #include <net/atmclip.h>
112 #include <net/secure_seq.h>
113
114 #define RT_FL_TOS(oldflp4) \
115     ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
116
117 #define IP_MAX_MTU      0xFFF0
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
123 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
124 static int ip_rt_redirect_number __read_mostly  = 9;
125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost __read_mostly       = HZ;
128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
129 static int ip_rt_gc_elasticity __read_mostly    = 8;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
133 static int rt_chain_length_max __read_mostly    = 20;
134 static int redirect_genid;
135
136 /*
137  *      Interface to generic destination cache.
138  */
139
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
143 static void              ipv4_dst_destroy(struct dst_entry *dst);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void              ipv4_link_failure(struct sk_buff *skb);
146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147 static int rt_garbage_collect(struct dst_ops *ops);
148
149 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
150                             int how)
151 {
152 }
153
154 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
155 {
156         struct rtable *rt = (struct rtable *) dst;
157         struct inet_peer *peer;
158         u32 *p = NULL;
159
160         if (!rt->peer)
161                 rt_bind_peer(rt, rt->rt_dst, 1);
162
163         peer = rt->peer;
164         if (peer) {
165                 u32 *old_p = __DST_METRICS_PTR(old);
166                 unsigned long prev, new;
167
168                 p = peer->metrics;
169                 if (inet_metrics_new(peer))
170                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
171
172                 new = (unsigned long) p;
173                 prev = cmpxchg(&dst->_metrics, old, new);
174
175                 if (prev != old) {
176                         p = __DST_METRICS_PTR(prev);
177                         if (prev & DST_METRICS_READ_ONLY)
178                                 p = NULL;
179                 } else {
180                         if (rt->fi) {
181                                 fib_info_put(rt->fi);
182                                 rt->fi = NULL;
183                         }
184                 }
185         }
186         return p;
187 }
188
189 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
190
191 static struct dst_ops ipv4_dst_ops = {
192         .family =               AF_INET,
193         .protocol =             cpu_to_be16(ETH_P_IP),
194         .gc =                   rt_garbage_collect,
195         .check =                ipv4_dst_check,
196         .default_advmss =       ipv4_default_advmss,
197         .mtu =                  ipv4_mtu,
198         .cow_metrics =          ipv4_cow_metrics,
199         .destroy =              ipv4_dst_destroy,
200         .ifdown =               ipv4_dst_ifdown,
201         .negative_advice =      ipv4_negative_advice,
202         .link_failure =         ipv4_link_failure,
203         .update_pmtu =          ip_rt_update_pmtu,
204         .local_out =            __ip_local_out,
205         .neigh_lookup =         ipv4_neigh_lookup,
206 };
207
208 #define ECN_OR_COST(class)      TC_PRIO_##class
209
210 const __u8 ip_tos2prio[16] = {
211         TC_PRIO_BESTEFFORT,
212         ECN_OR_COST(BESTEFFORT),
213         TC_PRIO_BESTEFFORT,
214         ECN_OR_COST(BESTEFFORT),
215         TC_PRIO_BULK,
216         ECN_OR_COST(BULK),
217         TC_PRIO_BULK,
218         ECN_OR_COST(BULK),
219         TC_PRIO_INTERACTIVE,
220         ECN_OR_COST(INTERACTIVE),
221         TC_PRIO_INTERACTIVE,
222         ECN_OR_COST(INTERACTIVE),
223         TC_PRIO_INTERACTIVE_BULK,
224         ECN_OR_COST(INTERACTIVE_BULK),
225         TC_PRIO_INTERACTIVE_BULK,
226         ECN_OR_COST(INTERACTIVE_BULK)
227 };
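/*
 * For illustration (assuming the lookup helper rt_tos2priority() in
 * include/net/route.h, which indexes this table with IPTOS_TOS(tos) >> 1):
 *
 *	tos = 0x10 (IPTOS_LOWDELAY)   -> index (0x10 & 0x1e) >> 1 = 8
 *	                              -> ip_tos2prio[8] == TC_PRIO_INTERACTIVE
 *	tos = 0x08 (IPTOS_THROUGHPUT) -> index 4 -> TC_PRIO_BULK
 *
 * The odd entries (ECN_OR_COST) are selected when bit 0x02 of the TOS byte
 * is set; with ECN_OR_COST(class) defined as TC_PRIO_##class above they map
 * to the same priority as their even neighbours.
 */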
228
229
230 /*
231  * Route cache.
232  */
233
234 /* The locking scheme is rather straightforward:
235  *
236  * 1) Read-Copy Update protects the buckets of the central route hash.
237  * 2) Only writers remove entries, and they hold the lock
238  *    as they look at rtable reference counts.
239  * 3) Only readers acquire references to rtable entries,
240  *    they do so with atomic increments and with
241  *    BH protection.
242  */
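/*
 * A minimal sketch of what that scheme looks like in practice (the real
 * code is in rt_cache_get_first(), rt_intern_hash() and rt_do_flush()
 * below):
 *
 *	rcu_read_lock_bh();
 *	for (r = rcu_dereference_bh(rt_hash_table[h].chain); r;
 *	     r = rcu_dereference_bh(r->dst.rt_next))
 *		;				(lockless reader walk)
 *	rcu_read_unlock_bh();
 *
 *	spin_lock_bh(rt_hash_lock_addr(h));
 *	...					(writer unlinks an entry with
 *						 rcu_assign_pointer(*pprev, next))
 *	spin_unlock_bh(rt_hash_lock_addr(h));
 *	rt_free(rth);				(free deferred via call_rcu_bh())
 */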
243
244 struct rt_hash_bucket {
245         struct rtable __rcu     *chain;
246 };
247
248 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
249         defined(CONFIG_PROVE_LOCKING)
250 /*
251  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
252  * The size of this table is a power of two and depends on the number of CPUs.
253  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
254  */
255 #ifdef CONFIG_LOCKDEP
256 # define RT_HASH_LOCK_SZ        256
257 #else
258 # if NR_CPUS >= 32
259 #  define RT_HASH_LOCK_SZ       4096
260 # elif NR_CPUS >= 16
261 #  define RT_HASH_LOCK_SZ       2048
262 # elif NR_CPUS >= 8
263 #  define RT_HASH_LOCK_SZ       1024
264 # elif NR_CPUS >= 4
265 #  define RT_HASH_LOCK_SZ       512
266 # else
267 #  define RT_HASH_LOCK_SZ       256
268 # endif
269 #endif
270
271 static spinlock_t       *rt_hash_locks;
272 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
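/*
 * For illustration: the bucket index is simply folded into the lock table,
 * so with RT_HASH_LOCK_SZ == 256 the buckets 5, 261 and 517 all map to
 * rt_hash_locks[5] and therefore share one spinlock.
 */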
273
274 static __init void rt_hash_lock_init(void)
275 {
276         int i;
277
278         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
279                         GFP_KERNEL);
280         if (!rt_hash_locks)
281                 panic("IP: failed to allocate rt_hash_locks\n");
282
283         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
284                 spin_lock_init(&rt_hash_locks[i]);
285 }
286 #else
287 # define rt_hash_lock_addr(slot) NULL
288
289 static inline void rt_hash_lock_init(void)
290 {
291 }
292 #endif
293
294 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
295 static unsigned                 rt_hash_mask __read_mostly;
296 static unsigned int             rt_hash_log  __read_mostly;
297
298 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
299 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
300
301 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
302                                    int genid)
303 {
304         return jhash_3words((__force u32)daddr, (__force u32)saddr,
305                             idx, genid)
306                 & rt_hash_mask;
307 }
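/*
 * Typical use, as in ip_rt_redirect() and ipv4_negative_advice() below:
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(net));
 *
 * Mixing the per-namespace generation id into the hash means that after an
 * invalidation new lookups key into fresh positions, while stale entries
 * are additionally filtered out by rt_is_expired().
 */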
308
309 static inline int rt_genid(struct net *net)
310 {
311         return atomic_read(&net->ipv4.rt_genid);
312 }
313
314 #ifdef CONFIG_PROC_FS
315 struct rt_cache_iter_state {
316         struct seq_net_private p;
317         int bucket;
318         int genid;
319 };
320
321 static struct rtable *rt_cache_get_first(struct seq_file *seq)
322 {
323         struct rt_cache_iter_state *st = seq->private;
324         struct rtable *r = NULL;
325
326         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
327                 if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
328                         continue;
329                 rcu_read_lock_bh();
330                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
331                 while (r) {
332                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
333                             r->rt_genid == st->genid)
334                                 return r;
335                         r = rcu_dereference_bh(r->dst.rt_next);
336                 }
337                 rcu_read_unlock_bh();
338         }
339         return r;
340 }
341
342 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
343                                           struct rtable *r)
344 {
345         struct rt_cache_iter_state *st = seq->private;
346
347         r = rcu_dereference_bh(r->dst.rt_next);
348         while (!r) {
349                 rcu_read_unlock_bh();
350                 do {
351                         if (--st->bucket < 0)
352                                 return NULL;
353                 } while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
354                 rcu_read_lock_bh();
355                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
356         }
357         return r;
358 }
359
360 static struct rtable *rt_cache_get_next(struct seq_file *seq,
361                                         struct rtable *r)
362 {
363         struct rt_cache_iter_state *st = seq->private;
364         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
365                 if (dev_net(r->dst.dev) != seq_file_net(seq))
366                         continue;
367                 if (r->rt_genid == st->genid)
368                         break;
369         }
370         return r;
371 }
372
373 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
374 {
375         struct rtable *r = rt_cache_get_first(seq);
376
377         if (r)
378                 while (pos && (r = rt_cache_get_next(seq, r)))
379                         --pos;
380         return pos ? NULL : r;
381 }
382
383 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
384 {
385         struct rt_cache_iter_state *st = seq->private;
386         if (*pos)
387                 return rt_cache_get_idx(seq, *pos - 1);
388         st->genid = rt_genid(seq_file_net(seq));
389         return SEQ_START_TOKEN;
390 }
391
392 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
393 {
394         struct rtable *r;
395
396         if (v == SEQ_START_TOKEN)
397                 r = rt_cache_get_first(seq);
398         else
399                 r = rt_cache_get_next(seq, v);
400         ++*pos;
401         return r;
402 }
403
404 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
405 {
406         if (v && v != SEQ_START_TOKEN)
407                 rcu_read_unlock_bh();
408 }
409
410 static int rt_cache_seq_show(struct seq_file *seq, void *v)
411 {
412         if (v == SEQ_START_TOKEN)
413                 seq_printf(seq, "%-127s\n",
414                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
415                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
416                            "HHUptod\tSpecDst");
417         else {
418                 struct rtable *r = v;
419                 struct neighbour *n;
420                 int len, HHUptod;
421
422                 rcu_read_lock();
423                 n = dst_get_neighbour(&r->dst);
424                 HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
425                 rcu_read_unlock();
426
427                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
428                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
429                         r->dst.dev ? r->dst.dev->name : "*",
430                         (__force u32)r->rt_dst,
431                         (__force u32)r->rt_gateway,
432                         r->rt_flags, atomic_read(&r->dst.__refcnt),
433                         r->dst.__use, 0, (__force u32)r->rt_src,
434                         dst_metric_advmss(&r->dst) + 40,
435                         dst_metric(&r->dst, RTAX_WINDOW),
436                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
437                               dst_metric(&r->dst, RTAX_RTTVAR)),
438                         r->rt_key_tos,
439                         -1,
440                         HHUptod,
441                         r->rt_spec_dst, &len);
442
443                 seq_printf(seq, "%*s\n", 127 - len, "");
444         }
445         return 0;
446 }
447
448 static const struct seq_operations rt_cache_seq_ops = {
449         .start  = rt_cache_seq_start,
450         .next   = rt_cache_seq_next,
451         .stop   = rt_cache_seq_stop,
452         .show   = rt_cache_seq_show,
453 };
454
455 static int rt_cache_seq_open(struct inode *inode, struct file *file)
456 {
457         return seq_open_net(inode, file, &rt_cache_seq_ops,
458                         sizeof(struct rt_cache_iter_state));
459 }
460
461 static const struct file_operations rt_cache_seq_fops = {
462         .owner   = THIS_MODULE,
463         .open    = rt_cache_seq_open,
464         .read    = seq_read,
465         .llseek  = seq_lseek,
466         .release = seq_release_net,
467 };
468
469
470 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
471 {
472         int cpu;
473
474         if (*pos == 0)
475                 return SEQ_START_TOKEN;
476
477         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
478                 if (!cpu_possible(cpu))
479                         continue;
480                 *pos = cpu+1;
481                 return &per_cpu(rt_cache_stat, cpu);
482         }
483         return NULL;
484 }
485
486 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
487 {
488         int cpu;
489
490         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
491                 if (!cpu_possible(cpu))
492                         continue;
493                 *pos = cpu+1;
494                 return &per_cpu(rt_cache_stat, cpu);
495         }
496         return NULL;
497
498 }
499
500 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
501 {
502
503 }
504
505 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
506 {
507         struct rt_cache_stat *st = v;
508
509         if (v == SEQ_START_TOKEN) {
510                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
511                 return 0;
512         }
513
514         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
515                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
516                    dst_entries_get_slow(&ipv4_dst_ops),
517                    st->in_hit,
518                    st->in_slow_tot,
519                    st->in_slow_mc,
520                    st->in_no_route,
521                    st->in_brd,
522                    st->in_martian_dst,
523                    st->in_martian_src,
524
525                    st->out_hit,
526                    st->out_slow_tot,
527                    st->out_slow_mc,
528
529                    st->gc_total,
530                    st->gc_ignored,
531                    st->gc_goal_miss,
532                    st->gc_dst_overflow,
533                    st->in_hlist_search,
534                    st->out_hlist_search
535                 );
536         return 0;
537 }
538
539 static const struct seq_operations rt_cpu_seq_ops = {
540         .start  = rt_cpu_seq_start,
541         .next   = rt_cpu_seq_next,
542         .stop   = rt_cpu_seq_stop,
543         .show   = rt_cpu_seq_show,
544 };
545
546
547 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
548 {
549         return seq_open(file, &rt_cpu_seq_ops);
550 }
551
552 static const struct file_operations rt_cpu_seq_fops = {
553         .owner   = THIS_MODULE,
554         .open    = rt_cpu_seq_open,
555         .read    = seq_read,
556         .llseek  = seq_lseek,
557         .release = seq_release,
558 };
559
560 #ifdef CONFIG_IP_ROUTE_CLASSID
561 static int rt_acct_proc_show(struct seq_file *m, void *v)
562 {
563         struct ip_rt_acct *dst, *src;
564         unsigned int i, j;
565
566         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
567         if (!dst)
568                 return -ENOMEM;
569
570         for_each_possible_cpu(i) {
571                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
572                 for (j = 0; j < 256; j++) {
573                         dst[j].o_bytes   += src[j].o_bytes;
574                         dst[j].o_packets += src[j].o_packets;
575                         dst[j].i_bytes   += src[j].i_bytes;
576                         dst[j].i_packets += src[j].i_packets;
577                 }
578         }
579
580         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
581         kfree(dst);
582         return 0;
583 }
584
585 static int rt_acct_proc_open(struct inode *inode, struct file *file)
586 {
587         return single_open(file, rt_acct_proc_show, NULL);
588 }
589
590 static const struct file_operations rt_acct_proc_fops = {
591         .owner          = THIS_MODULE,
592         .open           = rt_acct_proc_open,
593         .read           = seq_read,
594         .llseek         = seq_lseek,
595         .release        = single_release,
596 };
597 #endif
598
599 static int __net_init ip_rt_do_proc_init(struct net *net)
600 {
601         struct proc_dir_entry *pde;
602
603         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
604                         &rt_cache_seq_fops);
605         if (!pde)
606                 goto err1;
607
608         pde = proc_create("rt_cache", S_IRUGO,
609                           net->proc_net_stat, &rt_cpu_seq_fops);
610         if (!pde)
611                 goto err2;
612
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
615         if (!pde)
616                 goto err3;
617 #endif
618         return 0;
619
620 #ifdef CONFIG_IP_ROUTE_CLASSID
621 err3:
622         remove_proc_entry("rt_cache", net->proc_net_stat);
623 #endif
624 err2:
625         remove_proc_entry("rt_cache", net->proc_net);
626 err1:
627         return -ENOMEM;
628 }
629
630 static void __net_exit ip_rt_do_proc_exit(struct net *net)
631 {
632         remove_proc_entry("rt_cache", net->proc_net_stat);
633         remove_proc_entry("rt_cache", net->proc_net);
634 #ifdef CONFIG_IP_ROUTE_CLASSID
635         remove_proc_entry("rt_acct", net->proc_net);
636 #endif
637 }
638
639 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
640         .init = ip_rt_do_proc_init,
641         .exit = ip_rt_do_proc_exit,
642 };
643
644 static int __init ip_rt_proc_init(void)
645 {
646         return register_pernet_subsys(&ip_rt_proc_ops);
647 }
648
649 #else
650 static inline int ip_rt_proc_init(void)
651 {
652         return 0;
653 }
654 #endif /* CONFIG_PROC_FS */
655
656 static inline void rt_free(struct rtable *rt)
657 {
658         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
659 }
660
661 static inline void rt_drop(struct rtable *rt)
662 {
663         ip_rt_put(rt);
664         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
665 }
666
667 static inline int rt_fast_clean(struct rtable *rth)
668 {
669         /* Kill broadcast/multicast entries very aggressively, if they
670            collide in hash table with more useful entries */
671         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
672                 rt_is_input_route(rth) && rth->dst.rt_next;
673 }
674
675 static inline int rt_valuable(struct rtable *rth)
676 {
677         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
678                 (rth->peer && rth->peer->pmtu_expires);
679 }
680
681 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
682 {
683         unsigned long age;
684         int ret = 0;
685
686         if (atomic_read(&rth->dst.__refcnt))
687                 goto out;
688
689         age = jiffies - rth->dst.lastuse;
690         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
691             (age <= tmo2 && rt_valuable(rth)))
692                 goto out;
693         ret = 1;
694 out:    return ret;
695 }
696
697 /* Bits of score are:
698  * 31: very valuable
699  * 30: not quite useless
700  * 29..0: usage counter
701  */
702 static inline u32 rt_score(struct rtable *rt)
703 {
704         u32 score = jiffies - rt->dst.lastuse;
705
706         score = ~score & ~(3<<30);
707
708         if (rt_valuable(rt))
709                 score |= (1<<31);
710
711         if (rt_is_output_route(rt) ||
712             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
713                 score |= (1<<30);
714
715         return score;
716 }
717
718 static inline bool rt_caching(const struct net *net)
719 {
720         return net->ipv4.current_rt_cache_rebuild_count <=
721                 net->ipv4.sysctl_rt_cache_rebuild_count;
722 }
723
724 static inline bool compare_hash_inputs(const struct rtable *rt1,
725                                        const struct rtable *rt2)
726 {
727         return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
728                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
729                 (rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
730 }
731
732 static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
733 {
734         return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
735                 ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
736                 (rt1->rt_mark ^ rt2->rt_mark) |
737                 (rt1->rt_key_tos ^ rt2->rt_key_tos) |
738                 (rt1->rt_route_iif ^ rt2->rt_route_iif) |
739                 (rt1->rt_oif ^ rt2->rt_oif)) == 0;
740 }
741
742 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
743 {
744         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
745 }
746
747 static inline int rt_is_expired(struct rtable *rth)
748 {
749         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
750 }
751
752 /*
753  * Perform a full scan of hash table and free all entries.
754  * Can be called by a softirq or a process.
755  * In the latter case, we want to reschedule if necessary.
756  */
757 static void rt_do_flush(struct net *net, int process_context)
758 {
759         unsigned int i;
760         struct rtable *rth, *next;
761
762         for (i = 0; i <= rt_hash_mask; i++) {
763                 struct rtable __rcu **pprev;
764                 struct rtable *list;
765
766                 if (process_context && need_resched())
767                         cond_resched();
768                 rth = rcu_access_pointer(rt_hash_table[i].chain);
769                 if (!rth)
770                         continue;
771
772                 spin_lock_bh(rt_hash_lock_addr(i));
773
774                 list = NULL;
775                 pprev = &rt_hash_table[i].chain;
776                 rth = rcu_dereference_protected(*pprev,
777                         lockdep_is_held(rt_hash_lock_addr(i)));
778
779                 while (rth) {
780                         next = rcu_dereference_protected(rth->dst.rt_next,
781                                 lockdep_is_held(rt_hash_lock_addr(i)));
782
783                         if (!net ||
784                             net_eq(dev_net(rth->dst.dev), net)) {
785                                 rcu_assign_pointer(*pprev, next);
786                                 rcu_assign_pointer(rth->dst.rt_next, list);
787                                 list = rth;
788                         } else {
789                                 pprev = &rth->dst.rt_next;
790                         }
791                         rth = next;
792                 }
793
794                 spin_unlock_bh(rt_hash_lock_addr(i));
795
796                 for (; list; list = next) {
797                         next = rcu_dereference_protected(list->dst.rt_next, 1);
798                         rt_free(list);
799                 }
800         }
801 }
802
803 /*
804  * While freeing expired entries, we compute average chain length
805  * and standard deviation, using fixed-point arithmetic.
806  * This is to have an estimation of rt_chain_length_max:
807  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
808  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
809  */
810
811 #define FRACT_BITS 3
812 #define ONE (1UL << FRACT_BITS)
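/*
 * For illustration: with FRACT_BITS == 3, ONE == 8, so a single chain entry
 * counts as 8 in the fixed-point domain; a stored value of 20 represents
 * 20 / 8 = 2.5 entries, and slow_chain_length() below converts back to whole
 * entries with ">> FRACT_BITS".
 */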
813
814 /*
815  * Given a hash chain and an item in this hash chain,
816  * find if a previous entry has the same hash_inputs
817  * (but differs on tos, mark or oif)
818  * Returns 0 if an alias is found.
819  * Returns ONE if rth has no alias before itself.
820  */
821 static int has_noalias(const struct rtable *head, const struct rtable *rth)
822 {
823         const struct rtable *aux = head;
824
825         while (aux != rth) {
826                 if (compare_hash_inputs(aux, rth))
827                         return 0;
828                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
829         }
830         return ONE;
831 }
832
833 /*
834  * Perturbation of rt_genid by a small quantity [1..256]
835  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
836  * many times (2^24) without giving a recent rt_genid.
837  * Jenkins hash is strong enough that little changes of rt_genid are OK.
838  */
839 static void rt_cache_invalidate(struct net *net)
840 {
841         unsigned char shuffle;
842
843         get_random_bytes(&shuffle, sizeof(shuffle));
844         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
845         redirect_genid++;
846 }
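/*
 * For illustration: "shuffle" is an unsigned char, so each invalidation
 * advances rt_genid by a value in [1..256]; roughly 2^24 invalidations are
 * needed before the 32-bit counter can come back near a recently used value.
 * Entries created under an older genid are then caught by rt_is_expired().
 */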
847
848 /*
849  * delay < 0  : invalidate cache (fast : entries will be deleted later)
850  * delay >= 0 : invalidate & flush cache (can be long)
851  */
852 void rt_cache_flush(struct net *net, int delay)
853 {
854         rt_cache_invalidate(net);
855         if (delay >= 0)
856                 rt_do_flush(net, !in_softirq());
857 }
858
859 /* Flush previous cache invalidated entries from the cache */
860 void rt_cache_flush_batch(struct net *net)
861 {
862         rt_do_flush(net, !in_softirq());
863 }
864
865 static void rt_emergency_hash_rebuild(struct net *net)
866 {
867         if (net_ratelimit())
868                 printk(KERN_WARNING "Route hash chain too long!\n");
869         rt_cache_invalidate(net);
870 }
871
872 /*
873    Short description of GC goals.
874
875    We want to build an algorithm which keeps the routing cache
876    at some equilibrium point, where the number of aged-off entries
877    is kept approximately equal to the newly generated ones.
878
879    The current expiration strength is the variable "expire".
880    We try to adjust it dynamically, so that when the network
881    is idle expire is large enough to keep enough warm entries,
882    and when load increases it shrinks to limit the cache size.
883  */
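/*
 * A worked example of the goal computation in rt_garbage_collect() below:
 * goal = entries - (ip_rt_gc_elasticity << rt_hash_log), i.e. GC only has
 * positive work to do once the cache holds more than ip_rt_gc_elasticity
 * (default 8) entries per hash bucket on average.  With, say, rt_hash_log
 * == 17 (128K buckets) that threshold is 8 << 17, about one million entries.
 */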
884
885 static int rt_garbage_collect(struct dst_ops *ops)
886 {
887         static unsigned long expire = RT_GC_TIMEOUT;
888         static unsigned long last_gc;
889         static int rover;
890         static int equilibrium;
891         struct rtable *rth;
892         struct rtable __rcu **rthp;
893         unsigned long now = jiffies;
894         int goal;
895         int entries = dst_entries_get_fast(&ipv4_dst_ops);
896
897         /*
898          * Garbage collection is pretty expensive,
899          * so do not run it too frequently.
900          */
901
902         RT_CACHE_STAT_INC(gc_total);
903
904         if (now - last_gc < ip_rt_gc_min_interval &&
905             entries < ip_rt_max_size) {
906                 RT_CACHE_STAT_INC(gc_ignored);
907                 goto out;
908         }
909
910         entries = dst_entries_get_slow(&ipv4_dst_ops);
911         /* Calculate number of entries, which we want to expire now. */
912         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
913         if (goal <= 0) {
914                 if (equilibrium < ipv4_dst_ops.gc_thresh)
915                         equilibrium = ipv4_dst_ops.gc_thresh;
916                 goal = entries - equilibrium;
917                 if (goal > 0) {
918                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
919                         goal = entries - equilibrium;
920                 }
921         } else {
922                 /* We are in a dangerous area. Try to reduce the cache really
923                  * aggressively.
924                  */
925                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
926                 equilibrium = entries - goal;
927         }
928
929         if (now - last_gc >= ip_rt_gc_min_interval)
930                 last_gc = now;
931
932         if (goal <= 0) {
933                 equilibrium += goal;
934                 goto work_done;
935         }
936
937         do {
938                 int i, k;
939
940                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
941                         unsigned long tmo = expire;
942
943                         k = (k + 1) & rt_hash_mask;
944                         rthp = &rt_hash_table[k].chain;
945                         spin_lock_bh(rt_hash_lock_addr(k));
946                         while ((rth = rcu_dereference_protected(*rthp,
947                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
948                                 if (!rt_is_expired(rth) &&
949                                         !rt_may_expire(rth, tmo, expire)) {
950                                         tmo >>= 1;
951                                         rthp = &rth->dst.rt_next;
952                                         continue;
953                                 }
954                                 *rthp = rth->dst.rt_next;
955                                 rt_free(rth);
956                                 goal--;
957                         }
958                         spin_unlock_bh(rt_hash_lock_addr(k));
959                         if (goal <= 0)
960                                 break;
961                 }
962                 rover = k;
963
964                 if (goal <= 0)
965                         goto work_done;
966
967                 /* Goal is not achieved. We stop the process if:
968
969                    - expire is reduced to zero; otherwise, expire is halved.
970                    - the table is not full.
971                    - we are called from interrupt.
972                    - the jiffies check is just a fallback/debug loop breaker.
973                      We will not spin here for a long time in any case.
974                  */
975
976                 RT_CACHE_STAT_INC(gc_goal_miss);
977
978                 if (expire == 0)
979                         break;
980
981                 expire >>= 1;
982
983                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
984                         goto out;
985         } while (!in_softirq() && time_before_eq(jiffies, now));
986
987         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
988                 goto out;
989         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
990                 goto out;
991         if (net_ratelimit())
992                 printk(KERN_WARNING "dst cache overflow\n");
993         RT_CACHE_STAT_INC(gc_dst_overflow);
994         return 1;
995
996 work_done:
997         expire += ip_rt_gc_min_interval;
998         if (expire > ip_rt_gc_timeout ||
999             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1000             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1001                 expire = ip_rt_gc_timeout;
1002 out:    return 0;
1003 }
1004
1005 /*
1006  * Returns number of entries in a hash chain that have different hash_inputs
1007  */
1008 static int slow_chain_length(const struct rtable *head)
1009 {
1010         int length = 0;
1011         const struct rtable *rth = head;
1012
1013         while (rth) {
1014                 length += has_noalias(head, rth);
1015                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1016         }
1017         return length >> FRACT_BITS;
1018 }
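/*
 * For illustration: each element contributes ONE (8) if no earlier element
 * in the chain shares its hash inputs, otherwise 0.  A chain of ten entries
 * of which only three have distinct (dst, src, iif) keys accumulates
 * 3 * ONE == 24, and the final ">> FRACT_BITS" reports a slow length of 3.
 */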
1019
1020 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1021 {
1022         struct neigh_table *tbl = &arp_tbl;
1023         static const __be32 inaddr_any = 0;
1024         struct net_device *dev = dst->dev;
1025         const __be32 *pkey = daddr;
1026         struct neighbour *n;
1027
1028 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1029         if (dev->type == ARPHRD_ATM)
1030                 tbl = clip_tbl_hook;
1031 #endif
1032         if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1033                 pkey = &inaddr_any;
1034
1035         n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1036         if (n)
1037                 return n;
1038         return neigh_create(tbl, pkey, dev);
1039 }
1040
1041 static int rt_bind_neighbour(struct rtable *rt)
1042 {
1043         struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1044         if (IS_ERR(n))
1045                 return PTR_ERR(n);
1046         dst_set_neighbour(&rt->dst, n);
1047
1048         return 0;
1049 }
1050
1051 static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1052                                      struct sk_buff *skb, int ifindex)
1053 {
1054         struct rtable   *rth, *cand;
1055         struct rtable __rcu **rthp, **candp;
1056         unsigned long   now;
1057         u32             min_score;
1058         int             chain_length;
1059         int attempts = !in_softirq();
1060
1061 restart:
1062         chain_length = 0;
1063         min_score = ~(u32)0;
1064         cand = NULL;
1065         candp = NULL;
1066         now = jiffies;
1067
1068         if (!rt_caching(dev_net(rt->dst.dev))) {
1069                 /*
1070                  * If we're not caching, just tell the caller we
1071                  * were successful and don't touch the route.  The
1072                  * caller hold the sole reference to the cache entry, and
1073                  * it will be released when the caller is done with it.
1074                  * If we drop it here, the callers have no way to resolve routes
1075                  * when we're not caching.  Instead, just point *rp at rt, so
1076                  * the caller gets a single use out of the route
1077                  * Note that we do rt_free on this new route entry, so that
1078                  * once its refcount hits zero, we are still able to reap it
1079                  * (Thanks Alexey)
1080                  * Note: To avoid expensive rcu stuff for this uncached dst,
1081                  * we set DST_NOCACHE so that dst_release() can free dst without
1082                  * waiting a grace period.
1083                  */
1084
1085                 rt->dst.flags |= DST_NOCACHE;
1086                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1087                         int err = rt_bind_neighbour(rt);
1088                         if (err) {
1089                                 if (net_ratelimit())
1090                                         printk(KERN_WARNING
1091                                             "Neighbour table failure & not caching routes.\n");
1092                                 ip_rt_put(rt);
1093                                 return ERR_PTR(err);
1094                         }
1095                 }
1096
1097                 goto skip_hashing;
1098         }
1099
1100         rthp = &rt_hash_table[hash].chain;
1101
1102         spin_lock_bh(rt_hash_lock_addr(hash));
1103         while ((rth = rcu_dereference_protected(*rthp,
1104                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1105                 if (rt_is_expired(rth)) {
1106                         *rthp = rth->dst.rt_next;
1107                         rt_free(rth);
1108                         continue;
1109                 }
1110                 if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1111                         /* Put it first */
1112                         *rthp = rth->dst.rt_next;
1113                         /*
1114                          * Since lookup is lockfree, the deletion
1115                          * must be visible to another weakly ordered CPU before
1116                          * the insertion at the start of the hash chain.
1117                          */
1118                         rcu_assign_pointer(rth->dst.rt_next,
1119                                            rt_hash_table[hash].chain);
1120                         /*
1121                          * Since lookup is lockfree, the update writes
1122                          * must be ordered for consistency on SMP.
1123                          */
1124                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1125
1126                         dst_use(&rth->dst, now);
1127                         spin_unlock_bh(rt_hash_lock_addr(hash));
1128
1129                         rt_drop(rt);
1130                         if (skb)
1131                                 skb_dst_set(skb, &rth->dst);
1132                         return rth;
1133                 }
1134
1135                 if (!atomic_read(&rth->dst.__refcnt)) {
1136                         u32 score = rt_score(rth);
1137
1138                         if (score <= min_score) {
1139                                 cand = rth;
1140                                 candp = rthp;
1141                                 min_score = score;
1142                         }
1143                 }
1144
1145                 chain_length++;
1146
1147                 rthp = &rth->dst.rt_next;
1148         }
1149
1150         if (cand) {
1151                 /* ip_rt_gc_elasticity used to be the average chain length;
1152                  * when exceeded, gc becomes really aggressive.
1153                  *
1154                  * The second limit is less certain. At the moment it allows
1155                  * only 2 entries per bucket. We will see.
1156                  */
1157                 if (chain_length > ip_rt_gc_elasticity) {
1158                         *candp = cand->dst.rt_next;
1159                         rt_free(cand);
1160                 }
1161         } else {
1162                 if (chain_length > rt_chain_length_max &&
1163                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1164                         struct net *net = dev_net(rt->dst.dev);
1165                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1166                         if (!rt_caching(net)) {
1167                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1168                                         rt->dst.dev->name, num);
1169                         }
1170                         rt_emergency_hash_rebuild(net);
1171                         spin_unlock_bh(rt_hash_lock_addr(hash));
1172
1173                         hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1174                                         ifindex, rt_genid(net));
1175                         goto restart;
1176                 }
1177         }
1178
1179         /* Try to bind the route to arp only if it is an output
1180            route or a unicast forwarding path.
1181          */
1182         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1183                 int err = rt_bind_neighbour(rt);
1184                 if (err) {
1185                         spin_unlock_bh(rt_hash_lock_addr(hash));
1186
1187                         if (err != -ENOBUFS) {
1188                                 rt_drop(rt);
1189                                 return ERR_PTR(err);
1190                         }
1191
1192                         /* Neighbour tables are full and nothing
1193                            can be released. Try to shrink the route cache;
1194                            it most likely holds some neighbour records.
1195                          */
1196                         if (attempts-- > 0) {
1197                                 int saved_elasticity = ip_rt_gc_elasticity;
1198                                 int saved_int = ip_rt_gc_min_interval;
1199                                 ip_rt_gc_elasticity     = 1;
1200                                 ip_rt_gc_min_interval   = 0;
1201                                 rt_garbage_collect(&ipv4_dst_ops);
1202                                 ip_rt_gc_min_interval   = saved_int;
1203                                 ip_rt_gc_elasticity     = saved_elasticity;
1204                                 goto restart;
1205                         }
1206
1207                         if (net_ratelimit())
1208                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1209                         rt_drop(rt);
1210                         return ERR_PTR(-ENOBUFS);
1211                 }
1212         }
1213
1214         rt->dst.rt_next = rt_hash_table[hash].chain;
1215
1216         /*
1217          * Since lookup is lockfree, we must make sure
1218          * previous writes to rt are committed to memory
1219          * before making rt visible to other CPUS.
1220          */
1221         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1222
1223         spin_unlock_bh(rt_hash_lock_addr(hash));
1224
1225 skip_hashing:
1226         if (skb)
1227                 skb_dst_set(skb, &rt->dst);
1228         return rt;
1229 }
1230
1231 static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1232
1233 static u32 rt_peer_genid(void)
1234 {
1235         return atomic_read(&__rt_peer_genid);
1236 }
1237
1238 void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1239 {
1240         struct inet_peer *peer;
1241
1242         peer = inet_getpeer_v4(daddr, create);
1243
1244         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1245                 inet_putpeer(peer);
1246         else
1247                 rt->rt_peer_genid = rt_peer_genid();
1248 }
1249
1250 /*
1251  * Peer allocation may fail only in serious out-of-memory conditions.  However
1252  * we can still generate some output.
1253  * Random ID selection looks a bit dangerous because we have no chance of
1254  * selecting an ID that is unique within a reasonable period of time.
1255  * But broken packet identifier may be better than no packet at all.
1256  */
1257 static void ip_select_fb_ident(struct iphdr *iph)
1258 {
1259         static DEFINE_SPINLOCK(ip_fb_id_lock);
1260         static u32 ip_fallback_id;
1261         u32 salt;
1262
1263         spin_lock_bh(&ip_fb_id_lock);
1264         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1265         iph->id = htons(salt & 0xFFFF);
1266         ip_fallback_id = salt;
1267         spin_unlock_bh(&ip_fb_id_lock);
1268 }
1269
1270 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1271 {
1272         struct rtable *rt = (struct rtable *) dst;
1273
1274         if (rt) {
1275                 if (rt->peer == NULL)
1276                         rt_bind_peer(rt, rt->rt_dst, 1);
1277
1278                 /* If peer is attached to destination, it is never detached,
1279                    so we need not grab a lock to dereference it.
1280                  */
1281                 if (rt->peer) {
1282                         iph->id = htons(inet_getid(rt->peer, more));
1283                         return;
1284                 }
1285         } else
1286                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1287                        __builtin_return_address(0));
1288
1289         ip_select_fb_ident(iph);
1290 }
1291 EXPORT_SYMBOL(__ip_select_ident);
1292
1293 static void rt_del(unsigned hash, struct rtable *rt)
1294 {
1295         struct rtable __rcu **rthp;
1296         struct rtable *aux;
1297
1298         rthp = &rt_hash_table[hash].chain;
1299         spin_lock_bh(rt_hash_lock_addr(hash));
1300         ip_rt_put(rt);
1301         while ((aux = rcu_dereference_protected(*rthp,
1302                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1303                 if (aux == rt || rt_is_expired(aux)) {
1304                         *rthp = aux->dst.rt_next;
1305                         rt_free(aux);
1306                         continue;
1307                 }
1308                 rthp = &aux->dst.rt_next;
1309         }
1310         spin_unlock_bh(rt_hash_lock_addr(hash));
1311 }
1312
1313 static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1314 {
1315         struct rtable *rt = (struct rtable *) dst;
1316         __be32 orig_gw = rt->rt_gateway;
1317         struct neighbour *n, *old_n;
1318
1319         dst_confirm(&rt->dst);
1320
1321         rt->rt_gateway = peer->redirect_learned.a4;
1322
1323         n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1324         if (IS_ERR(n))
1325                 return PTR_ERR(n);
1326         old_n = xchg(&rt->dst._neighbour, n);
1327         if (old_n)
1328                 neigh_release(old_n);
1329         if (!n || !(n->nud_state & NUD_VALID)) {
1330                 if (n)
1331                         neigh_event_send(n, NULL);
1332                 rt->rt_gateway = orig_gw;
1333                 return -EAGAIN;
1334         } else {
1335                 rt->rt_flags |= RTCF_REDIRECTED;
1336                 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1337         }
1338         return 0;
1339 }
1340
1341 /* called in rcu_read_lock() section */
1342 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1343                     __be32 saddr, struct net_device *dev)
1344 {
1345         int s, i;
1346         struct in_device *in_dev = __in_dev_get_rcu(dev);
1347         __be32 skeys[2] = { saddr, 0 };
1348         int    ikeys[2] = { dev->ifindex, 0 };
1349         struct inet_peer *peer;
1350         struct net *net;
1351
1352         if (!in_dev)
1353                 return;
1354
1355         net = dev_net(dev);
1356         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1357             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1358             ipv4_is_zeronet(new_gw))
1359                 goto reject_redirect;
1360
1361         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1362                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1363                         goto reject_redirect;
1364                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1365                         goto reject_redirect;
1366         } else {
1367                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1368                         goto reject_redirect;
1369         }
1370
1371         for (s = 0; s < 2; s++) {
1372                 for (i = 0; i < 2; i++) {
1373                         unsigned int hash;
1374                         struct rtable __rcu **rthp;
1375                         struct rtable *rt;
1376
1377                         hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1378
1379                         rthp = &rt_hash_table[hash].chain;
1380
1381                         while ((rt = rcu_dereference(*rthp)) != NULL) {
1382                                 rthp = &rt->dst.rt_next;
1383
1384                                 if (rt->rt_key_dst != daddr ||
1385                                     rt->rt_key_src != skeys[s] ||
1386                                     rt->rt_oif != ikeys[i] ||
1387                                     rt_is_input_route(rt) ||
1388                                     rt_is_expired(rt) ||
1389                                     !net_eq(dev_net(rt->dst.dev), net) ||
1390                                     rt->dst.error ||
1391                                     rt->dst.dev != dev ||
1392                                     rt->rt_gateway != old_gw)
1393                                         continue;
1394
1395                                 if (!rt->peer)
1396                                         rt_bind_peer(rt, rt->rt_dst, 1);
1397
1398                                 peer = rt->peer;
1399                                 if (peer) {
1400                                         if (peer->redirect_learned.a4 != new_gw ||
1401                                             peer->redirect_genid != redirect_genid) {
1402                                                 peer->redirect_learned.a4 = new_gw;
1403                                                 peer->redirect_genid = redirect_genid;
1404                                                 atomic_inc(&__rt_peer_genid);
1405                                         }
1406                                         check_peer_redir(&rt->dst, peer);
1407                                 }
1408                         }
1409                 }
1410         }
1411         return;
1412
1413 reject_redirect:
1414 #ifdef CONFIG_IP_ROUTE_VERBOSE
1415         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1416                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1417                         "  Advised path = %pI4 -> %pI4\n",
1418                        &old_gw, dev->name, &new_gw,
1419                        &saddr, &daddr);
1420 #endif
1421         ;
1422 }
1423
1424 static bool peer_pmtu_expired(struct inet_peer *peer)
1425 {
1426         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1427
1428         return orig &&
1429                time_after_eq(jiffies, orig) &&
1430                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1431 }
1432
1433 static bool peer_pmtu_cleaned(struct inet_peer *peer)
1434 {
1435         unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1436
1437         return orig &&
1438                cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1439 }
1440
1441 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1442 {
1443         struct rtable *rt = (struct rtable *)dst;
1444         struct dst_entry *ret = dst;
1445
1446         if (rt) {
1447                 if (dst->obsolete > 0) {
1448                         ip_rt_put(rt);
1449                         ret = NULL;
1450                 } else if (rt->rt_flags & RTCF_REDIRECTED) {
1451                         unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1452                                                 rt->rt_oif,
1453                                                 rt_genid(dev_net(dst->dev)));
1454                         rt_del(hash, rt);
1455                         ret = NULL;
1456                 } else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1457                         dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1458                 }
1459         }
1460         return ret;
1461 }
1462
1463 /*
1464  * Algorithm:
1465  *      1. The first ip_rt_redirect_number redirects are sent
1466  *         with exponential backoff, then we stop sending them at all,
1467  *         assuming that the host ignores our redirects.
1468  *      2. If we did not see packets requiring redirects
1469  *         during ip_rt_redirect_silence, we assume that the host
1470  *         forgot redirected route and start to send redirects again.
1471  *
1472  * This algorithm is much cheaper and more intelligent than dumb load limiting
1473  * in icmp.c.
1474  *
1475  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1476  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1477  */
1478
1479 void ip_rt_send_redirect(struct sk_buff *skb)
1480 {
1481         struct rtable *rt = skb_rtable(skb);
1482         struct in_device *in_dev;
1483         struct inet_peer *peer;
1484         int log_martians;
1485
1486         rcu_read_lock();
1487         in_dev = __in_dev_get_rcu(rt->dst.dev);
1488         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1489                 rcu_read_unlock();
1490                 return;
1491         }
1492         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1493         rcu_read_unlock();
1494
1495         if (!rt->peer)
1496                 rt_bind_peer(rt, rt->rt_dst, 1);
1497         peer = rt->peer;
1498         if (!peer) {
1499                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1500                 return;
1501         }
1502
1503         /* No redirected packets during ip_rt_redirect_silence;
1504          * reset the algorithm.
1505          */
1506         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1507                 peer->rate_tokens = 0;
1508
1509         /* Too many ignored redirects; do not send anything,
1510          * just set rate_last to the time of the last seen redirected packet.
1511          */
1512         if (peer->rate_tokens >= ip_rt_redirect_number) {
1513                 peer->rate_last = jiffies;
1514                 return;
1515         }
1516
1517         /* Check for load limit; set rate_last to the latest sent
1518          * redirect.
1519          */
1520         if (peer->rate_tokens == 0 ||
1521             time_after(jiffies,
1522                        (peer->rate_last +
1523                         (ip_rt_redirect_load << peer->rate_tokens)))) {
1524                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1525                 peer->rate_last = jiffies;
1526                 ++peer->rate_tokens;
1527 #ifdef CONFIG_IP_ROUTE_VERBOSE
1528                 if (log_martians &&
1529                     peer->rate_tokens == ip_rt_redirect_number &&
1530                     net_ratelimit())
1531                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1532                                &ip_hdr(skb)->saddr, rt->rt_iif,
1533                                 &rt->rt_dst, &rt->rt_gateway);
1534 #endif
1535         }
1536 }
1537
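/*
 * Input error handler: translate the route's error code into an ICMP
 * destination unreachable message, rate limited per peer with a simple
 * token bucket (tokens accrue one per jiffy up to ip_rt_error_burst,
 * each message costs ip_rt_error_cost).
 */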
1538 static int ip_error(struct sk_buff *skb)
1539 {
1540         struct rtable *rt = skb_rtable(skb);
1541         struct inet_peer *peer;
1542         unsigned long now;
1543         bool send;
1544         int code;
1545
1546         switch (rt->dst.error) {
1547         case EINVAL:
1548         default:
1549                 goto out;
1550         case EHOSTUNREACH:
1551                 code = ICMP_HOST_UNREACH;
1552                 break;
1553         case ENETUNREACH:
1554                 code = ICMP_NET_UNREACH;
1555                 IP_INC_STATS_BH(dev_net(rt->dst.dev),
1556                                 IPSTATS_MIB_INNOROUTES);
1557                 break;
1558         case EACCES:
1559                 code = ICMP_PKT_FILTERED;
1560                 break;
1561         }
1562
1563         if (!rt->peer)
1564                 rt_bind_peer(rt, rt->rt_dst, 1);
1565         peer = rt->peer;
1566
1567         send = true;
1568         if (peer) {
1569                 now = jiffies;
1570                 peer->rate_tokens += now - peer->rate_last;
1571                 if (peer->rate_tokens > ip_rt_error_burst)
1572                         peer->rate_tokens = ip_rt_error_burst;
1573                 peer->rate_last = now;
1574                 if (peer->rate_tokens >= ip_rt_error_cost)
1575                         peer->rate_tokens -= ip_rt_error_cost;
1576                 else
1577                         send = false;
1578         }
1579         if (send)
1580                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1581
1582 out:    kfree_skb(skb);
1583         return 0;
1584 }
1585
1586 /*
1587  *      The last two values are not from the RFC but
1588  *      are needed for AMPRnet AX.25 paths.
1589  */
1590
1591 static const unsigned short mtu_plateau[] =
1592 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1593
1594 static inline unsigned short guess_mtu(unsigned short old_mtu)
1595 {
1596         int i;
1597
1598         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1599                 if (old_mtu > mtu_plateau[i])
1600                         return mtu_plateau[i];
1601         return 68;
1602 }
1603
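/*
 * Handle an ICMP "fragmentation needed" report: sanity check the advertised
 * MTU (falling back to guess_mtu() for broken BSD 4.2 style senders), clamp
 * it to ip_rt_min_pmtu and record it on the destination's inet_peer.
 */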
1604 unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1605                                  unsigned short new_mtu,
1606                                  struct net_device *dev)
1607 {
1608         unsigned short old_mtu = ntohs(iph->tot_len);
1609         unsigned short est_mtu = 0;
1610         struct inet_peer *peer;
1611
1612         peer = inet_getpeer_v4(iph->daddr, 1);
1613         if (peer) {
1614                 unsigned short mtu = new_mtu;
1615
1616                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1617                         /* BSD 4.2 derived systems incorrectly adjust
1618                          * tot_len by the IP header length, and report
1619                          * a zero MTU in the ICMP message.
1620                          */
1621                         if (mtu == 0 &&
1622                             old_mtu >= 68 + (iph->ihl << 2))
1623                                 old_mtu -= iph->ihl << 2;
1624                         mtu = guess_mtu(old_mtu);
1625                 }
1626
1627                 if (mtu < ip_rt_min_pmtu)
1628                         mtu = ip_rt_min_pmtu;
1629                 if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1630                         unsigned long pmtu_expires;
1631
1632                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1633                         if (!pmtu_expires)
1634                                 pmtu_expires = 1UL;
1635
1636                         est_mtu = mtu;
1637                         peer->pmtu_learned = mtu;
1638                         peer->pmtu_expires = pmtu_expires;
1639                         atomic_inc(&__rt_peer_genid);
1640                 }
1641
1642                 inet_putpeer(peer);
1643         }
1644         return est_mtu ? : new_mtu;
1645 }
1646
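/*
 * Apply the peer's learned PMTU to this dst while the entry is still valid,
 * remembering the original MTU; once it expires, restore that original MTU.
 */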
1647 static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1648 {
1649         unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1650
1651         if (!expires)
1652                 return;
1653         if (time_before(jiffies, expires)) {
1654                 u32 orig_dst_mtu = dst_mtu(dst);
1655                 if (peer->pmtu_learned < orig_dst_mtu) {
1656                         if (!peer->pmtu_orig)
1657                                 peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1658                         dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1659                 }
1660         } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1661                 dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1662 }
1663
1664 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1665 {
1666         struct rtable *rt = (struct rtable *) dst;
1667         struct inet_peer *peer;
1668
1669         dst_confirm(dst);
1670
1671         if (!rt->peer)
1672                 rt_bind_peer(rt, rt->rt_dst, 1);
1673         peer = rt->peer;
1674         if (peer) {
1675                 unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1676
1677                 if (mtu < ip_rt_min_pmtu)
1678                         mtu = ip_rt_min_pmtu;
1679                 if (!pmtu_expires || mtu < peer->pmtu_learned) {
1680
1681                         pmtu_expires = jiffies + ip_rt_mtu_expires;
1682                         if (!pmtu_expires)
1683                                 pmtu_expires = 1UL;
1684
1685                         peer->pmtu_learned = mtu;
1686                         peer->pmtu_expires = pmtu_expires;
1687
1688                         atomic_inc(&__rt_peer_genid);
1689                         rt->rt_peer_genid = rt_peer_genid();
1690                 }
1691                 check_peer_pmtu(dst, peer);
1692         }
1693 }
1694
1695
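/*
 * dst "check" handler: a cached route is invalidated when it is marked
 * expired, or when the peer's learned redirect cannot be applied to it;
 * otherwise its PMTU state is simply refreshed from the inet_peer.
 */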
1696 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1697 {
1698         struct rtable *rt = (struct rtable *) dst;
1699
1700         if (rt_is_expired(rt))
1701                 return NULL;
1702         if (rt->rt_peer_genid != rt_peer_genid()) {
1703                 struct inet_peer *peer;
1704
1705                 if (!rt->peer)
1706                         rt_bind_peer(rt, rt->rt_dst, 0);
1707
1708                 peer = rt->peer;
1709                 if (peer) {
1710                         check_peer_pmtu(dst, peer);
1711
1712                         if (peer->redirect_genid != redirect_genid)
1713                                 peer->redirect_learned.a4 = 0;
1714                         if (peer->redirect_learned.a4 &&
1715                             peer->redirect_learned.a4 != rt->rt_gateway) {
1716                                 if (check_peer_redir(dst, peer))
1717                                         return NULL;
1718                         }
1719                 }
1720
1721                 rt->rt_peer_genid = rt_peer_genid();
1722         }
1723         return dst;
1724 }
1725
1726 static void ipv4_dst_destroy(struct dst_entry *dst)
1727 {
1728         struct rtable *rt = (struct rtable *) dst;
1729         struct inet_peer *peer = rt->peer;
1730
1731         if (rt->fi) {
1732                 fib_info_put(rt->fi);
1733                 rt->fi = NULL;
1734         }
1735         if (peer) {
1736                 rt->peer = NULL;
1737                 inet_putpeer(peer);
1738         }
1739 }
1740
1741
1742 static void ipv4_link_failure(struct sk_buff *skb)
1743 {
1744         struct rtable *rt;
1745
1746         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1747
1748         rt = skb_rtable(skb);
1749         if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1750                 dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1751 }
1752
1753 static int ip_rt_bug(struct sk_buff *skb)
1754 {
1755         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1756                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1757                 skb->dev ? skb->dev->name : "?");
1758         kfree_skb(skb);
1759         WARN_ON(1);
1760         return 0;
1761 }
1762
1763 /*
1764    We do not cache the source address of the outgoing interface,
1765    because it is used only by the IP RR, TS and SRR options,
1766    so it is out of the fast path.
1767
1768    BTW remember: "addr" is allowed to be unaligned
1769    in IP options!
1770  */
1771
1772 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1773 {
1774         __be32 src;
1775
1776         if (rt_is_output_route(rt))
1777                 src = ip_hdr(skb)->saddr;
1778         else {
1779                 struct fib_result res;
1780                 struct flowi4 fl4;
1781                 struct iphdr *iph;
1782
1783                 iph = ip_hdr(skb);
1784
1785                 memset(&fl4, 0, sizeof(fl4));
1786                 fl4.daddr = iph->daddr;
1787                 fl4.saddr = iph->saddr;
1788                 fl4.flowi4_tos = RT_TOS(iph->tos);
1789                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1790                 fl4.flowi4_iif = skb->dev->ifindex;
1791                 fl4.flowi4_mark = skb->mark;
1792
1793                 rcu_read_lock();
1794                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1795                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1796                 else
1797                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1798                                         RT_SCOPE_UNIVERSE);
1799                 rcu_read_unlock();
1800         }
1801         memcpy(addr, &src, 4);
1802 }
1803
1804 #ifdef CONFIG_IP_ROUTE_CLASSID
1805 static void set_class_tag(struct rtable *rt, u32 tag)
1806 {
1807         if (!(rt->dst.tclassid & 0xFFFF))
1808                 rt->dst.tclassid |= tag & 0xFFFF;
1809         if (!(rt->dst.tclassid & 0xFFFF0000))
1810                 rt->dst.tclassid |= tag & 0xFFFF0000;
1811 }
1812 #endif
1813
1814 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1815 {
1816         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1817
1818         if (advmss == 0) {
1819                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1820                                ip_rt_min_advmss);
1821                 if (advmss > 65535 - 40)
1822                         advmss = 65535 - 40;
1823         }
1824         return advmss;
1825 }
1826
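/*
 * Report the effective MTU: prefer an explicit RTAX_MTU metric on output
 * routes, otherwise use the device MTU, honouring a locked metric (576 for
 * gatewayed routes) and capping the result at IP_MAX_MTU.
 */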
1827 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1828 {
1829         const struct rtable *rt = (const struct rtable *) dst;
1830         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1831
1832         if (mtu && rt_is_output_route(rt))
1833                 return mtu;
1834
1835         mtu = dst->dev->mtu;
1836
1837         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1838
1839                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1840                         mtu = 576;
1841         }
1842
1843         if (mtu > IP_MAX_MTU)
1844                 mtu = IP_MAX_MTU;
1845
1846         return mtu;
1847 }
1848
1849 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1850                             struct fib_info *fi)
1851 {
1852         struct inet_peer *peer;
1853         int create = 0;
1854
1855         /* If a peer entry exists for this destination, we must hook
1856          * it up in order to get at cached metrics.
1857          */
1858         if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1859                 create = 1;
1860
1861         rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1862         if (peer) {
1863                 rt->rt_peer_genid = rt_peer_genid();
1864                 if (inet_metrics_new(peer))
1865                         memcpy(peer->metrics, fi->fib_metrics,
1866                                sizeof(u32) * RTAX_MAX);
1867                 dst_init_metrics(&rt->dst, peer->metrics, false);
1868
1869                 check_peer_pmtu(&rt->dst, peer);
1870                 if (peer->redirect_genid != redirect_genid)
1871                         peer->redirect_learned.a4 = 0;
1872                 if (peer->redirect_learned.a4 &&
1873                     peer->redirect_learned.a4 != rt->rt_gateway) {
1874                         rt->rt_gateway = peer->redirect_learned.a4;
1875                         rt->rt_flags |= RTCF_REDIRECTED;
1876                 }
1877         } else {
1878                 if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1879                         rt->fi = fi;
1880                         atomic_inc(&fi->fib_clntref);
1881                 }
1882                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1883         }
1884 }
1885
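/*
 * Fill in the nexthop dependent fields of a new cache entry: the gateway,
 * metrics cached on the fib_info or inet_peer, and the classid tags, then
 * clamp MTU and advmss to their protocol limits.
 */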
1886 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1887                            const struct fib_result *res,
1888                            struct fib_info *fi, u16 type, u32 itag)
1889 {
1890         struct dst_entry *dst = &rt->dst;
1891
1892         if (fi) {
1893                 if (FIB_RES_GW(*res) &&
1894                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1895                         rt->rt_gateway = FIB_RES_GW(*res);
1896                 rt_init_metrics(rt, fl4, fi);
1897 #ifdef CONFIG_IP_ROUTE_CLASSID
1898                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1899 #endif
1900         }
1901
1902         if (dst_mtu(dst) > IP_MAX_MTU)
1903                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1904         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1905                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1906
1907 #ifdef CONFIG_IP_ROUTE_CLASSID
1908 #ifdef CONFIG_IP_MULTIPLE_TABLES
1909         set_class_tag(rt, fib_rules_tclass(res));
1910 #endif
1911         set_class_tag(rt, itag);
1912 #endif
1913 }
1914
1915 static struct rtable *rt_dst_alloc(struct net_device *dev,
1916                                    bool nopolicy, bool noxfrm)
1917 {
1918         return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1919                          DST_HOST |
1920                          (nopolicy ? DST_NOPOLICY : 0) |
1921                          (noxfrm ? DST_NOXFRM : 0));
1922 }
1923
1924 /* called in rcu_read_lock() section */
1925 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1926                                 u8 tos, struct net_device *dev, int our)
1927 {
1928         unsigned int hash;
1929         struct rtable *rth;
1930         __be32 spec_dst;
1931         struct in_device *in_dev = __in_dev_get_rcu(dev);
1932         u32 itag = 0;
1933         int err;
1934
1935         /* Primary sanity checks. */
1936
1937         if (in_dev == NULL)
1938                 return -EINVAL;
1939
1940         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1941             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1942                 goto e_inval;
1943
1944         if (ipv4_is_zeronet(saddr)) {
1945                 if (!ipv4_is_local_multicast(daddr))
1946                         goto e_inval;
1947                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1948         } else {
1949                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1950                                           &itag);
1951                 if (err < 0)
1952                         goto e_err;
1953         }
1954         rth = rt_dst_alloc(init_net.loopback_dev,
1955                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1956         if (!rth)
1957                 goto e_nobufs;
1958
1959 #ifdef CONFIG_IP_ROUTE_CLASSID
1960         rth->dst.tclassid = itag;
1961 #endif
1962         rth->dst.output = ip_rt_bug;
1963
1964         rth->rt_key_dst = daddr;
1965         rth->rt_key_src = saddr;
1966         rth->rt_genid   = rt_genid(dev_net(dev));
1967         rth->rt_flags   = RTCF_MULTICAST;
1968         rth->rt_type    = RTN_MULTICAST;
1969         rth->rt_key_tos = tos;
1970         rth->rt_dst     = daddr;
1971         rth->rt_src     = saddr;
1972         rth->rt_route_iif = dev->ifindex;
1973         rth->rt_iif     = dev->ifindex;
1974         rth->rt_oif     = 0;
1975         rth->rt_mark    = skb->mark;
1976         rth->rt_gateway = daddr;
1977         rth->rt_spec_dst= spec_dst;
1978         rth->rt_peer_genid = 0;
1979         rth->peer = NULL;
1980         rth->fi = NULL;
1981         if (our) {
1982                 rth->dst.input= ip_local_deliver;
1983                 rth->rt_flags |= RTCF_LOCAL;
1984         }
1985
1986 #ifdef CONFIG_IP_MROUTE
1987         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1988                 rth->dst.input = ip_mr_input;
1989 #endif
1990         RT_CACHE_STAT_INC(in_slow_mc);
1991
1992         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1993         rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1994         return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1995
1996 e_nobufs:
1997         return -ENOBUFS;
1998 e_inval:
1999         return -EINVAL;
2000 e_err:
2001         return err;
2002 }
2003
2004
2005 static void ip_handle_martian_source(struct net_device *dev,
2006                                      struct in_device *in_dev,
2007                                      struct sk_buff *skb,
2008                                      __be32 daddr,
2009                                      __be32 saddr)
2010 {
2011         RT_CACHE_STAT_INC(in_martian_src);
2012 #ifdef CONFIG_IP_ROUTE_VERBOSE
2013         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2014                 /*
2015                  *      RFC1812 recommendation: if the source is martian,
2016                  *      the only hint is the MAC header.
2017                  */
2018                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
2019                         &daddr, &saddr, dev->name);
2020                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2021                         int i;
2022                         const unsigned char *p = skb_mac_header(skb);
2023                         printk(KERN_WARNING "ll header: ");
2024                         for (i = 0; i < dev->hard_header_len; i++, p++) {
2025                                 printk("%02x", *p);
2026                                 if (i < (dev->hard_header_len - 1))
2027                                         printk(":");
2028                         }
2029                         printk("\n");
2030                 }
2031         }
2032 #endif
2033 }
2034
2035 /* called in rcu_read_lock() section */
2036 static int __mkroute_input(struct sk_buff *skb,
2037                            const struct fib_result *res,
2038                            struct in_device *in_dev,
2039                            __be32 daddr, __be32 saddr, u32 tos,
2040                            struct rtable **result)
2041 {
2042         struct rtable *rth;
2043         int err;
2044         struct in_device *out_dev;
2045         unsigned int flags = 0;
2046         __be32 spec_dst;
2047         u32 itag;
2048
2049         /* get a working reference to the output device */
2050         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2051         if (out_dev == NULL) {
2052                 if (net_ratelimit())
2053                         printk(KERN_CRIT "Bug in ip_route_input" \
2054                                "_slow(). Please, report\n");
2055                 return -EINVAL;
2056         }
2057
2058
2059         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2060                                   in_dev->dev, &spec_dst, &itag);
2061         if (err < 0) {
2062                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2063                                          saddr);
2064
2065                 goto cleanup;
2066         }
2067
2068         if (err)
2069                 flags |= RTCF_DIRECTSRC;
2070
2071         if (out_dev == in_dev && err &&
2072             (IN_DEV_SHARED_MEDIA(out_dev) ||
2073              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2074                 flags |= RTCF_DOREDIRECT;
2075
2076         if (skb->protocol != htons(ETH_P_IP)) {
2077                 /* Not IP (i.e. ARP). Do not create a route if it is
2078                  * invalid for proxy arp. DNAT routes are always valid.
2079                  *
2080                  * The proxy arp feature has been extended to allow ARP
2081                  * replies back to the same interface, to support
2082                  * Private VLAN switch technologies. See arp.c.
2083                  */
2084                 if (out_dev == in_dev &&
2085                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2086                         err = -EINVAL;
2087                         goto cleanup;
2088                 }
2089         }
2090
2091         rth = rt_dst_alloc(out_dev->dev,
2092                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2093                            IN_DEV_CONF_GET(out_dev, NOXFRM));
2094         if (!rth) {
2095                 err = -ENOBUFS;
2096                 goto cleanup;
2097         }
2098
2099         rth->rt_key_dst = daddr;
2100         rth->rt_key_src = saddr;
2101         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2102         rth->rt_flags = flags;
2103         rth->rt_type = res->type;
2104         rth->rt_key_tos = tos;
2105         rth->rt_dst     = daddr;
2106         rth->rt_src     = saddr;
2107         rth->rt_route_iif = in_dev->dev->ifindex;
2108         rth->rt_iif     = in_dev->dev->ifindex;
2109         rth->rt_oif     = 0;
2110         rth->rt_mark    = skb->mark;
2111         rth->rt_gateway = daddr;
2112         rth->rt_spec_dst= spec_dst;
2113         rth->rt_peer_genid = 0;
2114         rth->peer = NULL;
2115         rth->fi = NULL;
2116
2117         rth->dst.input = ip_forward;
2118         rth->dst.output = ip_output;
2119
2120         rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2121
2122         *result = rth;
2123         err = 0;
2124  cleanup:
2125         return err;
2126 }
2127
2128 static int ip_mkroute_input(struct sk_buff *skb,
2129                             struct fib_result *res,
2130                             const struct flowi4 *fl4,
2131                             struct in_device *in_dev,
2132                             __be32 daddr, __be32 saddr, u32 tos)
2133 {
2134         struct rtable* rth = NULL;
2135         int err;
2136         unsigned hash;
2137
2138 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2139         if (res->fi && res->fi->fib_nhs > 1)
2140                 fib_select_multipath(res);
2141 #endif
2142
2143         /* create a routing cache entry */
2144         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2145         if (err)
2146                 return err;
2147
2148         /* put it into the cache */
2149         hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2150                        rt_genid(dev_net(rth->dst.dev)));
2151         rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2152         if (IS_ERR(rth))
2153                 return PTR_ERR(rth);
2154         return 0;
2155 }
2156
2157 /*
2158  *      NOTE. We drop all packets that have local source
2159  *      addresses, because every properly looped back packet
2160  *      must have the correct destination already attached by the output routine.
2161  *
2162  *      This approach solves two big problems:
2163  *      1. Non-simplex devices are handled properly.
2164  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2165  *      Called with rcu_read_lock().
2166  */
2167
2168 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2169                                u8 tos, struct net_device *dev)
2170 {
2171         struct fib_result res;
2172         struct in_device *in_dev = __in_dev_get_rcu(dev);
2173         struct flowi4   fl4;
2174         unsigned        flags = 0;
2175         u32             itag = 0;
2176         struct rtable * rth;
2177         unsigned        hash;
2178         __be32          spec_dst;
2179         int             err = -EINVAL;
2180         struct net    * net = dev_net(dev);
2181
2182         /* IP on this device is disabled. */
2183
2184         if (!in_dev)
2185                 goto out;
2186
2187         /* Check for the most weird martians, which cannot be detected
2188            by fib_lookup.
2189          */
2190
2191         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2192             ipv4_is_loopback(saddr))
2193                 goto martian_source;
2194
2195         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2196                 goto brd_input;
2197
2198         /* Accept zero addresses only to the limited broadcast;
2199          * I do not even know whether to fix it or not. Waiting for complaints :-)
2200          */
2201         if (ipv4_is_zeronet(saddr))
2202                 goto martian_source;
2203
2204         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2205                 goto martian_destination;
2206
2207         /*
2208          *      Now we are ready to route the packet.
2209          */
2210         fl4.flowi4_oif = 0;
2211         fl4.flowi4_iif = dev->ifindex;
2212         fl4.flowi4_mark = skb->mark;
2213         fl4.flowi4_tos = tos;
2214         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2215         fl4.daddr = daddr;
2216         fl4.saddr = saddr;
2217         err = fib_lookup(net, &fl4, &res);
2218         if (err != 0) {
2219                 if (!IN_DEV_FORWARD(in_dev))
2220                         goto e_hostunreach;
2221                 goto no_route;
2222         }
2223
2224         RT_CACHE_STAT_INC(in_slow_tot);
2225
2226         if (res.type == RTN_BROADCAST)
2227                 goto brd_input;
2228
2229         if (res.type == RTN_LOCAL) {
2230                 err = fib_validate_source(skb, saddr, daddr, tos,
2231                                           net->loopback_dev->ifindex,
2232                                           dev, &spec_dst, &itag);
2233                 if (err < 0)
2234                         goto martian_source_keep_err;
2235                 if (err)
2236                         flags |= RTCF_DIRECTSRC;
2237                 spec_dst = daddr;
2238                 goto local_input;
2239         }
2240
2241         if (!IN_DEV_FORWARD(in_dev))
2242                 goto e_hostunreach;
2243         if (res.type != RTN_UNICAST)
2244                 goto martian_destination;
2245
2246         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2247 out:    return err;
2248
2249 brd_input:
2250         if (skb->protocol != htons(ETH_P_IP))
2251                 goto e_inval;
2252
2253         if (ipv4_is_zeronet(saddr))
2254                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2255         else {
2256                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2257                                           &itag);
2258                 if (err < 0)
2259                         goto martian_source_keep_err;
2260                 if (err)
2261                         flags |= RTCF_DIRECTSRC;
2262         }
2263         flags |= RTCF_BROADCAST;
2264         res.type = RTN_BROADCAST;
2265         RT_CACHE_STAT_INC(in_brd);
2266
2267 local_input:
2268         rth = rt_dst_alloc(net->loopback_dev,
2269                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2270         if (!rth)
2271                 goto e_nobufs;
2272
2273         rth->dst.input= ip_local_deliver;
2274         rth->dst.output= ip_rt_bug;
2275 #ifdef CONFIG_IP_ROUTE_CLASSID
2276         rth->dst.tclassid = itag;
2277 #endif
2278
2279         rth->rt_key_dst = daddr;
2280         rth->rt_key_src = saddr;
2281         rth->rt_genid = rt_genid(net);
2282         rth->rt_flags   = flags|RTCF_LOCAL;
2283         rth->rt_type    = res.type;
2284         rth->rt_key_tos = tos;
2285         rth->rt_dst     = daddr;
2286         rth->rt_src     = saddr;
2290         rth->rt_route_iif = dev->ifindex;
2291         rth->rt_iif     = dev->ifindex;
2292         rth->rt_oif     = 0;
2293         rth->rt_mark    = skb->mark;
2294         rth->rt_gateway = daddr;
2295         rth->rt_spec_dst= spec_dst;
2296         rth->rt_peer_genid = 0;
2297         rth->peer = NULL;
2298         rth->fi = NULL;
2299         if (res.type == RTN_UNREACHABLE) {
2300                 rth->dst.input= ip_error;
2301                 rth->dst.error= -err;
2302                 rth->rt_flags   &= ~RTCF_LOCAL;
2303         }
2304         hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2305         rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2306         err = 0;
2307         if (IS_ERR(rth))
2308                 err = PTR_ERR(rth);
2309         goto out;
2310
2311 no_route:
2312         RT_CACHE_STAT_INC(in_no_route);
2313         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2314         res.type = RTN_UNREACHABLE;
2315         if (err == -ESRCH)
2316                 err = -ENETUNREACH;
2317         goto local_input;
2318
2319         /*
2320          *      Do not cache martian addresses: they should be logged (RFC1812)
2321          */
2322 martian_destination:
2323         RT_CACHE_STAT_INC(in_martian_dst);
2324 #ifdef CONFIG_IP_ROUTE_VERBOSE
2325         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2326                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2327                         &daddr, &saddr, dev->name);
2328 #endif
2329
2330 e_hostunreach:
2331         err = -EHOSTUNREACH;
2332         goto out;
2333
2334 e_inval:
2335         err = -EINVAL;
2336         goto out;
2337
2338 e_nobufs:
2339         err = -ENOBUFS;
2340         goto out;
2341
2342 martian_source:
2343         err = -EINVAL;
2344 martian_source_keep_err:
2345         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2346         goto out;
2347 }
2348
2349 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2350                            u8 tos, struct net_device *dev, bool noref)
2351 {
2352         struct rtable * rth;
2353         unsigned        hash;
2354         int iif = dev->ifindex;
2355         struct net *net;
2356         int res;
2357
2358         net = dev_net(dev);
2359
2360         rcu_read_lock();
2361
2362         if (!rt_caching(net))
2363                 goto skip_cache;
2364
2365         tos &= IPTOS_RT_MASK;
2366         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2367
2368         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2369              rth = rcu_dereference(rth->dst.rt_next)) {
2370                 if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2371                      ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2372                      (rth->rt_route_iif ^ iif) |
2373                      (rth->rt_key_tos ^ tos)) == 0 &&
2374                     rth->rt_mark == skb->mark &&
2375                     net_eq(dev_net(rth->dst.dev), net) &&
2376                     !rt_is_expired(rth)) {
2377                         if (noref) {
2378                                 dst_use_noref(&rth->dst, jiffies);
2379                                 skb_dst_set_noref(skb, &rth->dst);
2380                         } else {
2381                                 dst_use(&rth->dst, jiffies);
2382                                 skb_dst_set(skb, &rth->dst);
2383                         }
2384                         RT_CACHE_STAT_INC(in_hit);
2385                         rcu_read_unlock();
2386                         return 0;
2387                 }
2388                 RT_CACHE_STAT_INC(in_hlist_search);
2389         }
2390
2391 skip_cache:
2392         /* Multicast recognition logic has been moved from the route cache to here.
2393            The problem was that too many Ethernet cards have broken/missing
2394            hardware multicast filters :-( As a result, a host on a multicast
2395            network acquires a lot of useless route cache entries, e.g. for
2396            SDR messages from all over the world. Now we try to get rid of them.
2397            Really, provided the software IP multicast filter is organized
2398            reasonably (at least, hashed), this does not result in a slowdown
2399            compared with route cache reject entries.
2400            Note that multicast routers are not affected, because a
2401            route cache entry is created eventually.
2402          */
2403         if (ipv4_is_multicast(daddr)) {
2404                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2405
2406                 if (in_dev) {
2407                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2408                                                   ip_hdr(skb)->protocol);
2409                         if (our
2410 #ifdef CONFIG_IP_MROUTE
2411                                 ||
2412                             (!ipv4_is_local_multicast(daddr) &&
2413                              IN_DEV_MFORWARD(in_dev))
2414 #endif
2415                            ) {
2416                                 int res = ip_route_input_mc(skb, daddr, saddr,
2417                                                             tos, dev, our);
2418                                 rcu_read_unlock();
2419                                 return res;
2420                         }
2421                 }
2422                 rcu_read_unlock();
2423                 return -EINVAL;
2424         }
2425         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2426         rcu_read_unlock();
2427         return res;
2428 }
2429 EXPORT_SYMBOL(ip_route_input_common);
2430
2431 /* called with rcu_read_lock() */
2432 static struct rtable *__mkroute_output(const struct fib_result *res,
2433                                        const struct flowi4 *fl4,
2434                                        __be32 orig_daddr, __be32 orig_saddr,
2435                                        int orig_oif, struct net_device *dev_out,
2436                                        unsigned int flags)
2437 {
2438         struct fib_info *fi = res->fi;
2439         u32 tos = RT_FL_TOS(fl4);
2440         struct in_device *in_dev;
2441         u16 type = res->type;
2442         struct rtable *rth;
2443
2444         if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2445                 return ERR_PTR(-EINVAL);
2446
2447         if (ipv4_is_lbcast(fl4->daddr))
2448                 type = RTN_BROADCAST;
2449         else if (ipv4_is_multicast(fl4->daddr))
2450                 type = RTN_MULTICAST;
2451         else if (ipv4_is_zeronet(fl4->daddr))
2452                 return ERR_PTR(-EINVAL);
2453
2454         if (dev_out->flags & IFF_LOOPBACK)
2455                 flags |= RTCF_LOCAL;
2456
2457         in_dev = __in_dev_get_rcu(dev_out);
2458         if (!in_dev)
2459                 return ERR_PTR(-EINVAL);
2460
2461         if (type == RTN_BROADCAST) {
2462                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2463                 fi = NULL;
2464         } else if (type == RTN_MULTICAST) {
2465                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2466                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2467                                      fl4->flowi4_proto))
2468                         flags &= ~RTCF_LOCAL;
2469                 /* If a multicast route does not exist, use the
2470                  * default one, but do not gateway in this case.
2471                  * Yes, it is a hack.
2472                  */
2473                 if (fi && res->prefixlen < 4)
2474                         fi = NULL;
2475         }
2476
2477         rth = rt_dst_alloc(dev_out,
2478                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2479                            IN_DEV_CONF_GET(in_dev, NOXFRM));
2480         if (!rth)
2481                 return ERR_PTR(-ENOBUFS);
2482
2483         rth->dst.output = ip_output;
2484
2485         rth->rt_key_dst = orig_daddr;
2486         rth->rt_key_src = orig_saddr;
2487         rth->rt_genid = rt_genid(dev_net(dev_out));
2488         rth->rt_flags   = flags;
2489         rth->rt_type    = type;
2490         rth->rt_key_tos = tos;
2491         rth->rt_dst     = fl4->daddr;
2492         rth->rt_src     = fl4->saddr;
2493         rth->rt_route_iif = 0;
2494         rth->rt_iif     = orig_oif ? : dev_out->ifindex;
2495         rth->rt_oif     = orig_oif;
2496         rth->rt_mark    = fl4->flowi4_mark;
2497         rth->rt_gateway = fl4->daddr;
2498         rth->rt_spec_dst= fl4->saddr;
2499         rth->rt_peer_genid = 0;
2500         rth->peer = NULL;
2501         rth->fi = NULL;
2502
2503         RT_CACHE_STAT_INC(out_slow_tot);
2504
2505         if (flags & RTCF_LOCAL) {
2506                 rth->dst.input = ip_local_deliver;
2507                 rth->rt_spec_dst = fl4->daddr;
2508         }
2509         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2510                 rth->rt_spec_dst = fl4->saddr;
2511                 if (flags & RTCF_LOCAL &&
2512                     !(dev_out->flags & IFF_LOOPBACK)) {
2513                         rth->dst.output = ip_mc_output;
2514                         RT_CACHE_STAT_INC(out_slow_mc);
2515                 }
2516 #ifdef CONFIG_IP_MROUTE
2517                 if (type == RTN_MULTICAST) {
2518                         if (IN_DEV_MFORWARD(in_dev) &&
2519                             !ipv4_is_local_multicast(fl4->daddr)) {
2520                                 rth->dst.input = ip_mr_input;
2521                                 rth->dst.output = ip_mc_output;
2522                         }
2523                 }
2524 #endif
2525         }
2526
2527         rt_set_nexthop(rth, fl4, res, fi, type, 0);
2528
2529         return rth;
2530 }
2531
2532 /*
2533  * Major route resolver routine.
2534  * called with rcu_read_lock();
2535  */
2536
2537 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2538 {
2539         struct net_device *dev_out = NULL;
2540         u32 tos = RT_FL_TOS(fl4);
2541         unsigned int flags = 0;
2542         struct fib_result res;
2543         struct rtable *rth;
2544         __be32 orig_daddr;
2545         __be32 orig_saddr;
2546         int orig_oif;
2547
2548         res.fi          = NULL;
2549 #ifdef CONFIG_IP_MULTIPLE_TABLES
2550         res.r           = NULL;
2551 #endif
2552
2553         orig_daddr = fl4->daddr;
2554         orig_saddr = fl4->saddr;
2555         orig_oif = fl4->flowi4_oif;
2556
2557         fl4->flowi4_iif = net->loopback_dev->ifindex;
2558         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2559         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2560                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2561
2562         rcu_read_lock();
2563         if (fl4->saddr) {
2564                 rth = ERR_PTR(-EINVAL);
2565                 if (ipv4_is_multicast(fl4->saddr) ||
2566                     ipv4_is_lbcast(fl4->saddr) ||
2567                     ipv4_is_zeronet(fl4->saddr))
2568                         goto out;
2569
2570                 /* I removed check for oif == dev_out->oif here.
2571                    It was wrong for two reasons:
2572                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2573                       is assigned to multiple interfaces.
2574                    2. Moreover, we are allowed to send packets with saddr
2575                       of another iface. --ANK
2576                  */
2577
2578                 if (fl4->flowi4_oif == 0 &&
2579                     (ipv4_is_multicast(fl4->daddr) ||
2580                      ipv4_is_lbcast(fl4->daddr))) {
2581                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2582                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2583                         if (dev_out == NULL)
2584                                 goto out;
2585
2586                         /* Special hack: the user can direct multicasts
2587                            and limited broadcast via the necessary interface
2588                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2589                            This hack is not just for fun, it allows
2590                            vic, vat and friends to work.
2591                            They bind a socket to loopback, set ttl to zero
2592                            and expect that it will work.
2593                            From the viewpoint of the routing cache they are broken,
2594                            because we are not allowed to build a multicast path
2595                            with a loopback source addr (look, the routing cache
2596                            cannot know that ttl is zero, so the packet
2597                            will not leave this host and the route is valid).
2598                            Luckily, this hack is a good workaround.
2599                          */
2600
2601                         fl4->flowi4_oif = dev_out->ifindex;
2602                         goto make_route;
2603                 }
2604
2605                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2606                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2607                         if (!__ip_dev_find(net, fl4->saddr, false))
2608                                 goto out;
2609                 }
2610         }
2611
2612
2613         if (fl4->flowi4_oif) {
2614                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2615                 rth = ERR_PTR(-ENODEV);
2616                 if (dev_out == NULL)
2617                         goto out;
2618
2619                 /* RACE: Check return value of inet_select_addr instead. */
2620                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2621                         rth = ERR_PTR(-ENETUNREACH);
2622                         goto out;
2623                 }
2624                 if (ipv4_is_local_multicast(fl4->daddr) ||
2625                     ipv4_is_lbcast(fl4->daddr)) {
2626                         if (!fl4->saddr)
2627                                 fl4->saddr = inet_select_addr(dev_out, 0,
2628                                                               RT_SCOPE_LINK);
2629                         goto make_route;
2630                 }
2631                 if (fl4->saddr) {
2632                         if (ipv4_is_multicast(fl4->daddr))
2633                                 fl4->saddr = inet_select_addr(dev_out, 0,
2634                                                               fl4->flowi4_scope);
2635                         else if (!fl4->daddr)
2636                                 fl4->saddr = inet_select_addr(dev_out, 0,
2637                                                               RT_SCOPE_HOST);
2638                 }
2639         }
2640
2641         if (!fl4->daddr) {
2642                 fl4->daddr = fl4->saddr;
2643                 if (!fl4->daddr)
2644                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2645                 dev_out = net->loopback_dev;
2646                 fl4->flowi4_oif = net->loopback_dev->ifindex;
2647                 res.type = RTN_LOCAL;
2648                 flags |= RTCF_LOCAL;
2649                 goto make_route;
2650         }
2651
2652         if (fib_lookup(net, fl4, &res)) {
2653                 res.fi = NULL;
2654                 if (fl4->flowi4_oif) {
2655                         /* Apparently, the routing tables are wrong. Assume
2656                            that the destination is on-link.
2657
2658                            WHY? DW.
2659                            Because we are allowed to send to an iface
2660                            even if it has NO routes and NO assigned
2661                            addresses. When oif is specified, the routing
2662                            tables are looked up with only one purpose:
2663                            to catch whether the destination is gatewayed, rather than
2664                            direct. Moreover, if MSG_DONTROUTE is set,
2665                            we send the packet, ignoring both routing tables
2666                            and ifaddr state. --ANK
2667
2668
2669                            We could do this even if oif is unknown
2670                            (IPv6 likely does), but we do not.
2671                          */
2672
2673                         if (fl4->saddr == 0)
2674                                 fl4->saddr = inet_select_addr(dev_out, 0,
2675                                                               RT_SCOPE_LINK);
2676                         res.type = RTN_UNICAST;
2677                         goto make_route;
2678                 }
2679                 rth = ERR_PTR(-ENETUNREACH);
2680                 goto out;
2681         }
2682
2683         if (res.type == RTN_LOCAL) {
2684                 if (!fl4->saddr) {
2685                         if (res.fi->fib_prefsrc)
2686                                 fl4->saddr = res.fi->fib_prefsrc;
2687                         else
2688                                 fl4->saddr = fl4->daddr;
2689                 }
2690                 dev_out = net->loopback_dev;
2691                 fl4->flowi4_oif = dev_out->ifindex;
2692                 res.fi = NULL;
2693                 flags |= RTCF_LOCAL;
2694                 goto make_route;
2695         }
2696
2697 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2698         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2699                 fib_select_multipath(&res);
2700         else
2701 #endif
2702         if (!res.prefixlen &&
2703             res.table->tb_num_default > 1 &&
2704             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2705                 fib_select_default(&res);
2706
2707         if (!fl4->saddr)
2708                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2709
2710         dev_out = FIB_RES_DEV(res);
2711         fl4->flowi4_oif = dev_out->ifindex;
2712
2713
2714 make_route:
2715         rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2716                                dev_out, flags);
2717         if (!IS_ERR(rth)) {
2718                 unsigned int hash;
2719
2720                 hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2721                                rt_genid(dev_net(dev_out)));
2722                 rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2723         }
2724
2725 out:
2726         rcu_read_unlock();
2727         return rth;
2728 }
2729
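/*
 * Output route lookup: search the route cache (keyed on daddr, saddr, oif,
 * mark and TOS) and fall back to ip_route_output_slow() on a miss or when
 * caching is disabled.
 */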
2730 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2731 {
2732         struct rtable *rth;
2733         unsigned int hash;
2734
2735         if (!rt_caching(net))
2736                 goto slow_output;
2737
2738         hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2739
2740         rcu_read_lock_bh();
2741         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2742                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2743                 if (rth->rt_key_dst == flp4->daddr &&
2744                     rth->rt_key_src == flp4->saddr &&
2745                     rt_is_output_route(rth) &&
2746                     rth->rt_oif == flp4->flowi4_oif &&
2747                     rth->rt_mark == flp4->flowi4_mark &&
2748                     !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2749                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2750                     net_eq(dev_net(rth->dst.dev), net) &&
2751                     !rt_is_expired(rth)) {
2752                         dst_use(&rth->dst, jiffies);
2753                         RT_CACHE_STAT_INC(out_hit);
2754                         rcu_read_unlock_bh();
2755                         if (!flp4->saddr)
2756                                 flp4->saddr = rth->rt_src;
2757                         if (!flp4->daddr)
2758                                 flp4->daddr = rth->rt_dst;
2759                         return rth;
2760                 }
2761                 RT_CACHE_STAT_INC(out_hlist_search);
2762         }
2763         rcu_read_unlock_bh();
2764
2765 slow_output:
2766         return ip_route_output_slow(net, flp4);
2767 }
2768 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2769
2770 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2771 {
2772         return NULL;
2773 }
2774
2775 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2776 {
2777         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2778
2779         return mtu ? : dst->dev->mtu;
2780 }
2781
2782 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2783 {
2784 }
2785
2786 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2787                                           unsigned long old)
2788 {
2789         return NULL;
2790 }
2791
2792 static struct dst_ops ipv4_dst_blackhole_ops = {
2793         .family                 =       AF_INET,
2794         .protocol               =       cpu_to_be16(ETH_P_IP),
2795         .destroy                =       ipv4_dst_destroy,
2796         .check                  =       ipv4_blackhole_dst_check,
2797         .mtu                    =       ipv4_blackhole_mtu,
2798         .default_advmss         =       ipv4_default_advmss,
2799         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2800         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2801         .neigh_lookup           =       ipv4_neigh_lookup,
2802 };
2803
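/*
 * Clone an existing route into a "blackhole" entry that silently discards
 * traffic: it keeps the original keys, metrics and peer references, but its
 * input and output handlers are replaced with dst_discard.
 */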
2804 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2805 {
2806         struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2807         struct rtable *ort = (struct rtable *) dst_orig;
2808
2809         if (rt) {
2810                 struct dst_entry *new = &rt->dst;
2811
2812                 new->__use = 1;
2813                 new->input = dst_discard;
2814                 new->output = dst_discard;
2815                 dst_copy_metrics(new, &ort->dst);
2816
2817                 new->dev = ort->dst.dev;
2818                 if (new->dev)
2819                         dev_hold(new->dev);
2820
2821                 rt->rt_key_dst = ort->rt_key_dst;
2822                 rt->rt_key_src = ort->rt_key_src;
2823                 rt->rt_key_tos = ort->rt_key_tos;
2824                 rt->rt_route_iif = ort->rt_route_iif;
2825                 rt->rt_iif = ort->rt_iif;
2826                 rt->rt_oif = ort->rt_oif;
2827                 rt->rt_mark = ort->rt_mark;
2828
2829                 rt->rt_genid = rt_genid(net);
2830                 rt->rt_flags = ort->rt_flags;
2831                 rt->rt_type = ort->rt_type;
2832                 rt->rt_dst = ort->rt_dst;
2833                 rt->rt_src = ort->rt_src;
2834                 rt->rt_gateway = ort->rt_gateway;
2835                 rt->rt_spec_dst = ort->rt_spec_dst;
2836                 rt->peer = ort->peer;
2837                 if (rt->peer)
2838                         atomic_inc(&rt->peer->refcnt);
2839                 rt->fi = ort->fi;
2840                 if (rt->fi)
2841                         atomic_inc(&rt->fi->fib_clntref);
2842
2843                 dst_free(new);
2844         }
2845
2846         dst_release(dst_orig);
2847
2848         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2849 }
2850
2851 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2852                                     struct sock *sk)
2853 {
2854         struct rtable *rt = __ip_route_output_key(net, flp4);
2855
2856         if (IS_ERR(rt))
2857                 return rt;
2858
2859         if (flp4->flowi4_proto)
2860                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2861                                                    flowi4_to_flowi(flp4),
2862                                                    sk, 0);
2863
2864         return rt;
2865 }
2866 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2867
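/*
 * Build an RTM_NEWROUTE netlink message for a cache entry: route keys and
 * attributes, metrics, peer derived id/timestamp/expiry data and, for
 * multicast input routes, the mroute state via ipmr_get_route().
 */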
2868 static int rt_fill_info(struct net *net,
2869                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2870                         int nowait, unsigned int flags)
2871 {
2872         struct rtable *rt = skb_rtable(skb);
2873         struct rtmsg *r;
2874         struct nlmsghdr *nlh;
2875         unsigned long expires = 0;
2876         const struct inet_peer *peer = rt->peer;
2877         u32 id = 0, ts = 0, tsage = 0, error;
2878
2879         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2880         if (nlh == NULL)
2881                 return -EMSGSIZE;
2882
2883         r = nlmsg_data(nlh);
2884         r->rtm_family    = AF_INET;
2885         r->rtm_dst_len  = 32;
2886         r->rtm_src_len  = 0;
2887         r->rtm_tos      = rt->rt_key_tos;
2888         r->rtm_table    = RT_TABLE_MAIN;
2889         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2890         r->rtm_type     = rt->rt_type;
2891         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2892         r->rtm_protocol = RTPROT_UNSPEC;
2893         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2894         if (rt->rt_flags & RTCF_NOTIFY)
2895                 r->rtm_flags |= RTM_F_NOTIFY;
2896
2897         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2898
2899         if (rt->rt_key_src) {
2900                 r->rtm_src_len = 32;
2901                 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2902         }
2903         if (rt->dst.dev)
2904                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2905 #ifdef CONFIG_IP_ROUTE_CLASSID
2906         if (rt->dst.tclassid)
2907                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2908 #endif
2909         if (rt_is_input_route(rt))
2910                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2911         else if (rt->rt_src != rt->rt_key_src)
2912                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2913
2914         if (rt->rt_dst != rt->rt_gateway)
2915                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2916
2917         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2918                 goto nla_put_failure;
2919
2920         if (rt->rt_mark)
2921                 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2922
2923         error = rt->dst.error;
2924         if (peer) {
2925                 inet_peer_refcheck(rt->peer);
2926                 id = atomic_read(&peer->ip_id_count) & 0xffff;
2927                 if (peer->tcp_ts_stamp) {
2928                         ts = peer->tcp_ts;
2929                         tsage = get_seconds() - peer->tcp_ts_stamp;
2930                 }
2931                 expires = ACCESS_ONCE(peer->pmtu_expires);
2932                 if (expires) {
2933                         if (time_before(jiffies, expires))
2934                                 expires -= jiffies;
2935                         else
2936                                 expires = 0;
2937                 }
2938         }
2939
2940         if (rt_is_input_route(rt)) {
2941 #ifdef CONFIG_IP_MROUTE
2942                 __be32 dst = rt->rt_dst;
2943
2944                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2945                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2946                         int err = ipmr_get_route(net, skb,
2947                                                  rt->rt_src, rt->rt_dst,
2948                                                  r, nowait);
2949                         if (err <= 0) {
2950                                 if (!nowait) {
2951                                         if (err == 0)
2952                                                 return 0;
2953                                         goto nla_put_failure;
2954                                 } else {
2955                                         if (err == -EMSGSIZE)
2956                                                 goto nla_put_failure;
2957                                         error = err;
2958                                 }
2959                         }
2960                 } else
2961 #endif
2962                         NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2963         }
2964
2965         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2966                                expires, error) < 0)
2967                 goto nla_put_failure;
2968
2969         return nlmsg_end(skb, nlh);
2970
2971 nla_put_failure:
2972         nlmsg_cancel(skb, nlh);
2973         return -EMSGSIZE;
2974 }
2975
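/*
 * RTM_GETROUTE handler: build a dummy skb, resolve the route described by
 * the request attributes (input lookup via ip_route_input() when RTA_IIF
 * is given, output lookup otherwise) and unicast the rt_fill_info() result
 * back to the requester.
 */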
2976 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2977 {
2978         struct net *net = sock_net(in_skb->sk);
2979         struct rtmsg *rtm;
2980         struct nlattr *tb[RTA_MAX+1];
2981         struct rtable *rt = NULL;
2982         __be32 dst = 0;
2983         __be32 src = 0;
2984         u32 iif;
2985         int err;
2986         int mark;
2987         struct sk_buff *skb;
2988
2989         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2990         if (err < 0)
2991                 goto errout;
2992
2993         rtm = nlmsg_data(nlh);
2994
2995         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2996         if (skb == NULL) {
2997                 err = -ENOBUFS;
2998                 goto errout;
2999         }
3000
3001         /* Reserve room for dummy headers; this skb can pass
3002            through a good chunk of the routing engine.
3003          */
3004         skb_reset_mac_header(skb);
3005         skb_reset_network_header(skb);
3006
3007         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3008         ip_hdr(skb)->protocol = IPPROTO_ICMP;
3009         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3010
3011         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3012         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3013         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3014         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3015
3016         if (iif) {
3017                 struct net_device *dev;
3018
3019                 dev = __dev_get_by_index(net, iif);
3020                 if (dev == NULL) {
3021                         err = -ENODEV;
3022                         goto errout_free;
3023                 }
3024
3025                 skb->protocol   = htons(ETH_P_IP);
3026                 skb->dev        = dev;
3027                 skb->mark       = mark;
3028                 local_bh_disable();
3029                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3030                 local_bh_enable();
3031
3032                 rt = skb_rtable(skb);
3033                 if (err == 0 && rt->dst.error)
3034                         err = -rt->dst.error;
3035         } else {
3036                 struct flowi4 fl4 = {
3037                         .daddr = dst,
3038                         .saddr = src,
3039                         .flowi4_tos = rtm->rtm_tos,
3040                         .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3041                         .flowi4_mark = mark,
3042                 };
3043                 rt = ip_route_output_key(net, &fl4);
3044
3045                 err = 0;
3046                 if (IS_ERR(rt))
3047                         err = PTR_ERR(rt);
3048         }
3049
3050         if (err)
3051                 goto errout_free;
3052
3053         skb_dst_set(skb, &rt->dst);
3054         if (rtm->rtm_flags & RTM_F_NOTIFY)
3055                 rt->rt_flags |= RTCF_NOTIFY;
3056
3057         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3058                            RTM_NEWROUTE, 0, 0);
3059         if (err <= 0)
3060                 goto errout_free;
3061
3062         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3063 errout:
3064         return err;
3065
3066 errout_free:
3067         kfree_skb(skb);
3068         goto errout;
3069 }
3070
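/*
 * Dump the route cache to netlink.  cb->args[0] and cb->args[1] record the
 * hash bucket and chain position so an interrupted dump can be resumed.
 */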
3071 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3072 {
3073         struct rtable *rt;
3074         int h, s_h;
3075         int idx, s_idx;
3076         struct net *net;
3077
3078         net = sock_net(skb->sk);
3079
3080         s_h = cb->args[0];
3081         if (s_h < 0)
3082                 s_h = 0;
3083         s_idx = idx = cb->args[1];
3084         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3085                 if (!rt_hash_table[h].chain)
3086                         continue;
3087                 rcu_read_lock_bh();
3088                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3089                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3090                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3091                                 continue;
3092                         if (rt_is_expired(rt))
3093                                 continue;
3094                         skb_dst_set_noref(skb, &rt->dst);
3095                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3096                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3097                                          1, NLM_F_MULTI) <= 0) {
3098                                 skb_dst_drop(skb);
3099                                 rcu_read_unlock_bh();
3100                                 goto done;
3101                         }
3102                         skb_dst_drop(skb);
3103                 }
3104                 rcu_read_unlock_bh();
3105         }
3106
3107 done:
3108         cb->args[0] = h;
3109         cb->args[1] = idx;
3110         return skb->len;
3111 }
3112
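/* A multicast configuration change on a device invalidates the whole cache. */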
3113 void ip_rt_multicast_event(struct in_device *in_dev)
3114 {
3115         rt_cache_flush(dev_net(in_dev->dev), 0);
3116 }
3117
3118 #ifdef CONFIG_SYSCTL
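/*
 * Handler for the write-only "flush" sysctl: the written value is parsed
 * with proc_dointvec() and handed to rt_cache_flush() as the flush delay
 * for this namespace.  Reads return -EINVAL.
 */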
3119 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3120                                         void __user *buffer,
3121                                         size_t *lenp, loff_t *ppos)
3122 {
3123         if (write) {
3124                 int flush_delay;
3125                 ctl_table ctl;
3126                 struct net *net;
3127
3128                 memcpy(&ctl, __ctl, sizeof(ctl));
3129                 ctl.data = &flush_delay;
3130                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3131
3132                 net = (struct net *)__ctl->extra1;
3133                 rt_cache_flush(net, flush_delay);
3134                 return 0;
3135         }
3136
3137         return -EINVAL;
3138 }
3139
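/*
 * Global route cache tunables, exported under /proc/sys/net/ipv4/route/
 * via the ipv4_skeleton table registered in ip_static_sysctl_init().
 */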
3140 static ctl_table ipv4_route_table[] = {
3141         {
3142                 .procname       = "gc_thresh",
3143                 .data           = &ipv4_dst_ops.gc_thresh,
3144                 .maxlen         = sizeof(int),
3145                 .mode           = 0644,
3146                 .proc_handler   = proc_dointvec,
3147         },
3148         {
3149                 .procname       = "max_size",
3150                 .data           = &ip_rt_max_size,
3151                 .maxlen         = sizeof(int),
3152                 .mode           = 0644,
3153                 .proc_handler   = proc_dointvec,
3154         },
3155         {
3156                 /*  Deprecated. Use gc_min_interval_ms */
3157
3158                 .procname       = "gc_min_interval",
3159                 .data           = &ip_rt_gc_min_interval,
3160                 .maxlen         = sizeof(int),
3161                 .mode           = 0644,
3162                 .proc_handler   = proc_dointvec_jiffies,
3163         },
3164         {
3165                 .procname       = "gc_min_interval_ms",
3166                 .data           = &ip_rt_gc_min_interval,
3167                 .maxlen         = sizeof(int),
3168                 .mode           = 0644,
3169                 .proc_handler   = proc_dointvec_ms_jiffies,
3170         },
3171         {
3172                 .procname       = "gc_timeout",
3173                 .data           = &ip_rt_gc_timeout,
3174                 .maxlen         = sizeof(int),
3175                 .mode           = 0644,
3176                 .proc_handler   = proc_dointvec_jiffies,
3177         },
3178         {
3179                 .procname       = "redirect_load",
3180                 .data           = &ip_rt_redirect_load,
3181                 .maxlen         = sizeof(int),
3182                 .mode           = 0644,
3183                 .proc_handler   = proc_dointvec,
3184         },
3185         {
3186                 .procname       = "redirect_number",
3187                 .data           = &ip_rt_redirect_number,
3188                 .maxlen         = sizeof(int),
3189                 .mode           = 0644,
3190                 .proc_handler   = proc_dointvec,
3191         },
3192         {
3193                 .procname       = "redirect_silence",
3194                 .data           = &ip_rt_redirect_silence,
3195                 .maxlen         = sizeof(int),
3196                 .mode           = 0644,
3197                 .proc_handler   = proc_dointvec,
3198         },
3199         {
3200                 .procname       = "error_cost",
3201                 .data           = &ip_rt_error_cost,
3202                 .maxlen         = sizeof(int),
3203                 .mode           = 0644,
3204                 .proc_handler   = proc_dointvec,
3205         },
3206         {
3207                 .procname       = "error_burst",
3208                 .data           = &ip_rt_error_burst,
3209                 .maxlen         = sizeof(int),
3210                 .mode           = 0644,
3211                 .proc_handler   = proc_dointvec,
3212         },
3213         {
3214                 .procname       = "gc_elasticity",
3215                 .data           = &ip_rt_gc_elasticity,
3216                 .maxlen         = sizeof(int),
3217                 .mode           = 0644,
3218                 .proc_handler   = proc_dointvec,
3219         },
3220         {
3221                 .procname       = "mtu_expires",
3222                 .data           = &ip_rt_mtu_expires,
3223                 .maxlen         = sizeof(int),
3224                 .mode           = 0644,
3225                 .proc_handler   = proc_dointvec_jiffies,
3226         },
3227         {
3228                 .procname       = "min_pmtu",
3229                 .data           = &ip_rt_min_pmtu,
3230                 .maxlen         = sizeof(int),
3231                 .mode           = 0644,
3232                 .proc_handler   = proc_dointvec,
3233         },
3234         {
3235                 .procname       = "min_adv_mss",
3236                 .data           = &ip_rt_min_advmss,
3237                 .maxlen         = sizeof(int),
3238                 .mode           = 0644,
3239                 .proc_handler   = proc_dointvec,
3240         },
3241         { }
3242 };
3243
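/*
 * Skeleton handed to register_sysctl_paths() by ip_static_sysctl_init():
 * it hangs ipv4_route_table off "route" and an empty placeholder off
 * "neigh" under /proc/sys/net/ipv4.
 */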
3244 static struct ctl_table empty[1];
3245
3246 static struct ctl_table ipv4_skeleton[] =
3247 {
3248         { .procname = "route", 
3249           .mode = 0555, .child = ipv4_route_table},
3250         { .procname = "neigh", 
3251           .mode = 0555, .child = empty},
3252         { }
3253 };
3254
3255 static __net_initdata struct ctl_path ipv4_path[] = {
3256         { .procname = "net", },
3257         { .procname = "ipv4", },
3258         { },
3259 };
3260
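/*
 * Per-namespace "flush" entry; sysctl_route_net_init() registers it under
 * /proc/sys/net/ipv4/route.
 */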
3261 static struct ctl_table ipv4_route_flush_table[] = {
3262         {
3263                 .procname       = "flush",
3264                 .maxlen         = sizeof(int),
3265                 .mode           = 0200,
3266                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3267         },
3268         { },
3269 };
3270
3271 static __net_initdata struct ctl_path ipv4_route_path[] = {
3272         { .procname = "net", },
3273         { .procname = "ipv4", },
3274         { .procname = "route", },
3275         { },
3276 };
3277
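/*
 * Register the per-namespace flush sysctl.  Namespaces other than init_net
 * get their own copy of the table so that tbl[0].extra1 can point at the
 * owning netns.
 */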
3278 static __net_init int sysctl_route_net_init(struct net *net)
3279 {
3280         struct ctl_table *tbl;
3281
3282         tbl = ipv4_route_flush_table;
3283         if (!net_eq(net, &init_net)) {
3284                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3285                 if (tbl == NULL)
3286                         goto err_dup;
3287         }
3288         tbl[0].extra1 = net;
3289
3290         net->ipv4.route_hdr =
3291                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3292         if (net->ipv4.route_hdr == NULL)
3293                 goto err_reg;
3294         return 0;
3295
3296 err_reg:
3297         if (tbl != ipv4_route_flush_table)
3298                 kfree(tbl);
3299 err_dup:
3300         return -ENOMEM;
3301 }
3302
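/* Tear down the flush sysctl and free the table duplicated at net init. */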
3303 static __net_exit void sysctl_route_net_exit(struct net *net)
3304 {
3305         struct ctl_table *tbl;
3306
3307         tbl = net->ipv4.route_hdr->ctl_table_arg;
3308         unregister_net_sysctl_table(net->ipv4.route_hdr);
3309         BUG_ON(tbl == ipv4_route_flush_table);
3310         kfree(tbl);
3311 }
3312
3313 static __net_initdata struct pernet_operations sysctl_route_ops = {
3314         .init = sysctl_route_net_init,
3315         .exit = sysctl_route_net_exit,
3316 };
3317 #endif
3318
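/* Seed the per-namespace route cache generation counters with random values. */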
3319 static __net_init int rt_genid_init(struct net *net)
3320 {
3321         get_random_bytes(&net->ipv4.rt_genid,
3322                          sizeof(net->ipv4.rt_genid));
3323         get_random_bytes(&net->ipv4.dev_addr_genid,
3324                          sizeof(net->ipv4.dev_addr_genid));
3325         return 0;
3326 }
3327
3328 static __net_initdata struct pernet_operations rt_genid_ops = {
3329         .init = rt_genid_init,
3330 };
3331
3332
3333 #ifdef CONFIG_IP_ROUTE_CLASSID
3334 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3335 #endif /* CONFIG_IP_ROUTE_CLASSID */
3336
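/*
 * "rhash_entries=" boot parameter: requests an explicit route cache hash
 * table size instead of the one computed automatically in ip_rt_init().
 */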
3337 static __initdata unsigned long rhash_entries;
3338 static int __init set_rhash_entries(char *str)
3339 {
3340         if (!str)
3341                 return 0;
3342         rhash_entries = simple_strtoul(str, &str, 0);
3343         return 1;
3344 }
3345 __setup("rhash_entries=", set_rhash_entries);
3346
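/*
 * Boot-time initialisation: create the dst slab cache, allocate the route
 * cache hash table, scale gc_thresh and ip_rt_max_size from its size,
 * initialise devinet/fib/proc/xfrm and register the RTM_GETROUTE handler
 * and the per-netns operations.
 */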
3347 int __init ip_rt_init(void)
3348 {
3349         int rc = 0;
3350
3351 #ifdef CONFIG_IP_ROUTE_CLASSID
3352         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3353         if (!ip_rt_acct)
3354                 panic("IP: failed to allocate ip_rt_acct\n");
3355 #endif
3356
3357         ipv4_dst_ops.kmem_cachep =
3358                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3359                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3360
3361         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3362
3363         if (dst_entries_init(&ipv4_dst_ops) < 0)
3364                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3365
3366         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3367                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3368
3369         rt_hash_table = (struct rt_hash_bucket *)
3370                 alloc_large_system_hash("IP route cache",
3371                                         sizeof(struct rt_hash_bucket),
3372                                         rhash_entries,
3373                                         (totalram_pages >= 128 * 1024) ?
3374                                         15 : 17,
3375                                         0,
3376                                         &rt_hash_log,
3377                                         &rt_hash_mask,
3378                                         rhash_entries ? 0 : 512 * 1024);
3379         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3380         rt_hash_lock_init();
3381
3382         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3383         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3384
3385         devinet_init();
3386         ip_fib_init();
3387
3388         if (ip_rt_proc_init())
3389                 printk(KERN_ERR "Unable to create route proc files\n");
3390 #ifdef CONFIG_XFRM
3391         xfrm_init();
3392         xfrm4_init(ip_rt_max_size);
3393 #endif
3394         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3395
3396 #ifdef CONFIG_SYSCTL
3397         register_pernet_subsys(&sysctl_route_ops);
3398 #endif
3399         register_pernet_subsys(&rt_genid_ops);
3400         return rc;
3401 }
3402
3403 #ifdef CONFIG_SYSCTL
3404 /*
3405  * We really need to sanitize the damn ipv4 init order, then all
3406  * this nonsense will go away.
3407  */
3408 void __init ip_static_sysctl_init(void)
3409 {
3410         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3411 }
3412 #endif