/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/atmclip.h>
#include <net/secure_seq.h>
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
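
/*
 * Added note (illustrative, not in the original file): assuming the usual
 * definitions IPTOS_RT_MASK == 0x1C and RTO_ONLINK == 0x01, the macro
 * keeps only the TOS bits that may influence routing plus the RTO_ONLINK
 * flag that is carried in the same field, e.g.:
 *
 *	__u8 tos = RT_FL_TOS(fl4);  // flowi4_tos 0x1F -> 0x1F & 0x1D == 0x1D
 *
 * so the 0x02 (ECN) bit never contributes to route selection.
 */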
#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else if (rt->fi) {
			fib_info_put(rt->fi);
			rt->fi = NULL;
		}
	}
	return p;
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.gc			= rt_garbage_collect,
	.check			= ipv4_dst_check,
	.default_advmss		= ipv4_default_advmss,
	.mtu			= ipv4_mtu,
	.cow_metrics		= ipv4_cow_metrics,
	.destroy		= ipv4_dst_destroy,
	.ifdown			= ipv4_dst_ifdown,
	.negative_advice	= ipv4_negative_advice,
	.link_failure		= ipv4_link_failure,
	.update_pmtu		= ip_rt_update_pmtu,
	.local_out		= __ip_local_out,
	.neigh_lookup		= ipv4_neigh_lookup,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
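
/*
 * Usage sketch (added note, not in the original file): callers do not
 * index this table directly; the helper in include/net/route.h does it
 * as, roughly,
 *
 *	skb->priority = rt_tos2priority(iph->tos);
 *	// i.e. ip_tos2prio[IPTOS_TOS(tos) >> 1]
 *
 * mapping the four TOS bits of an IPv4 header to one of the TC_PRIO_*
 * queueing classes above.
 */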
/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries; they do so
 *    with atomic increments, relying on RCU to keep the entry alive
 *    while it is being inspected.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
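
/*
 * Locking sketch (added note, not in the original file): writers
 * serialize per hash slot, e.g.
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	... unlink/insert entries of rt_hash_table[hash].chain ...
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *
 * Several buckets may map onto one lock ((slot) & (RT_HASH_LOCK_SZ - 1)),
 * which is harmless: readers never take these locks, they traverse the
 * chains under rcu_read_lock_bh().
 */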
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_ATOMIC);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
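
/*
 * Added note: the statistics are per CPU, so a hot-path increment such as
 *
 *	RT_CACHE_STAT_INC(in_hit);
 *
 * expands to __this_cpu_inc(rt_cache_stat.in_hit) and needs neither a
 * lock nor an atomic op; /proc/net/stat/rt_cache later prints one line
 * per possible CPU.
 */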
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid) & rt_hash_mask;
}
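
/*
 * Lookup sketch (added note, not in the original file): both the input
 * and output paths compute the bucket the same way, mixing the
 * generation id into the hash, e.g.:
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
 *
 * Because the genid is part of the hash input, bumping it effectively
 * scatters all old entries; they are also rejected explicitly by
 * rt_is_expired() and reaped by the garbage collector.
 */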
static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};
static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;

	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit, st->in_slow_tot, st->in_slow_mc,
		   st->in_no_route, st->in_brd, st->in_martian_dst,
		   st->in_martian_src, st->out_hit, st->out_slow_tot,
		   st->out_slow_mc, st->gc_total, st->gc_ignored,
		   st->gc_goal_miss, st->gc_dst_overflow,
		   st->in_hlist_search, st->out_hlist_search);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}
static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}
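
/*
 * Added note: the XOR/OR expressions above are a branch-free idiom for
 * "all fields pairwise equal": (a ^ b) == 0 iff a == b, hence
 *
 *	((a1 ^ a2) | (b1 ^ b2) | ...) == 0  <=>  a1 == a2 && b1 == b2 && ...
 *
 * compare_hash_inputs() checks only the fields that feed rt_hash(),
 * while compare_keys() checks the full routing key (mark, tos and oif
 * included).
 */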
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));
		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}
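
/*
 * Invalidation sketch (added note): no entry is touched here. Every
 * cached route snapshotted the genid at creation time, so staleness is
 * detected lazily, as in rt_is_expired():
 *
 *	rth->rt_genid != rt_genid(dev_net(rth->dst.dev))
 *
 * Stale entries are skipped by lookups and freed later by rt_do_flush()
 * or the garbage collector.
 */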
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle, expire is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire was reduced to zero. Otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}
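
/*
 * Worked example (added note): with FRACT_BITS == 3, has_noalias()
 * contributes ONE == 8 for every entry that is not an alias of an
 * earlier one. A chain of 20 entries of which 4 are aliases thus
 * accumulates 16 * 8 == 128, and slow_chain_length() reports
 * 128 >> 3 == 16 "real" entries -- the figure compared against
 * rt_chain_length_max.
 */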
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);

	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it.
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting for a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain
		 * length; when exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
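
/*
 * Added note: in the common path above, IDs come from the destination's
 * inet_peer via inet_getid(rt->peer, more), which in this kernel
 * generation is an atomic update of peer->ip_id_count -- each destination
 * gets its own monotonic ID sequence. The spinlocked fallback is only for
 * the rare case that no peer could be allocated.
 */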
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}
/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}
static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}
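
/*
 * Added note: both helpers use cmpxchg() on pmtu_expires as a claim
 * token. Only the one caller that swaps the old expiry for 0 may restore
 * pmtu_orig, so a learned PMTU is rolled back exactly once even when
 * several CPUs race on the same inet_peer.
 */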
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
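
/*
 * Added note: the peer fields implement a classic token bucket. Tokens
 * accrue at one per jiffy up to ip_rt_error_burst (5 * HZ) and each ICMP
 * error costs ip_rt_error_cost (HZ) tokens -- roughly one ICMP
 * destination-unreachable per second per peer, in bursts of at most five.
 */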
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
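
/*
 * Example (added note): guess_mtu(1500) returns 1492, the next plateau
 * strictly below the old MTU, while guess_mtu(128) falls through the
 * whole table and returns the 68-byte IPv4 minimum.
 */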
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
static void ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway)
				check_peer_redir(&rt->dst, peer);
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	ipv4_validate_peer(rt);
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
   We do not cache the source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}

static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif 	= in_dev->dev->ifindex;
	rth->rt_oif 	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable* rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
/*
 *	NOTE. We drop all the packets that have a local source
 *	address, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable * rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    * net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid = rt_genid(net);
	rth->rt_flags 	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable * rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark    = fl4->flowi4_mark;
	rth->rt_gateway = fl4->daddr;
	rth->rt_spec_dst= fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
/*
 * Major route resolver routine.
 * Takes and releases rcu_read_lock() internally; note that the caller
 * __ip_route_output_key() invokes it outside of its own RCU section.
 */
static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with a saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid).  Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface even
			   if it has NO routes and NO assigned addresses.
			   When oif is specified, the routing tables are
			   looked up with only one purpose: to catch if the
			   destination is gatewayed, rather than direct.
			   Moreover, if MSG_DONTROUTE is set, we send the
			   packet ignoring both the routing tables and the
			   ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
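
/*
 * Note (added): the blackhole dst_ops above back ipv4_blackhole_route()
 * below, which clones a route into a dst whose input and output handlers
 * silently discard every packet while keeping the original's keys and
 * metrics.  To the best of my reading it is reached via the xfrm
 * blackhole_route hook while IPsec state resolution is still pending.
 */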
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
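
/*
 * Illustrative sketch (not part of the original file): a minimal output
 * route lookup through the exported API.  The caller fills a struct
 * flowi4 key and gets back a (possibly cached) rtable or an ERR_PTR().
 * The function name below is hypothetical.
 */
#if 0
static int example_output_lookup(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,			/* destination to resolve */
		.flowi4_tos = RT_TOS(0),	/* TOS bits, pre-masked */
	};
	struct rtable *rt;

	rt = ip_route_output_key(net, &fl4);	/* wraps __ip_route_output_key() */
	if (IS_ERR(rt))
		return PTR_ERR(rt);		/* e.g. -ENETUNREACH */

	ip_rt_put(rt);				/* drop the reference when done */
	return 0;
}
#endif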
static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
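
/*
 * Note (added): this handler serves RTM_GETROUTE requests such as those
 * generated by "ip route get"; the reply is a single RTM_NEWROUTE
 * message built by rt_fill_info() above from the resolved rtable.
 */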
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}
#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
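
/*
 * Usage sketch (added): writing an integer delay in seconds to
 * /proc/sys/net/ipv4/route/flush, e.g.
 * "echo 0 > /proc/sys/net/ipv4/route/flush", lands in the handler above
 * and flushes the per-namespace routing cache via rt_cache_flush().
 */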
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
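
/*
 * Usage note (added): booting with "rhash_entries=N" on the kernel
 * command line overrides the memory-scaled default table size chosen by
 * alloc_large_system_hash() in ip_rt_init() below.
 */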
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);

	return rc;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif