ipv4: using prefetch requires including prefetch.h
[pandora-kernel.git] / net / ipv4 / route.c
index fb47c8f..85cc053 100644 (file)
@@ -91,6 +91,7 @@
 #include <linux/rcupdate.h>
 #include <linux/times.h>
 #include <linux/slab.h>
+#include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
 #include <net/secure_seq.h>
 
 #define RT_FL_TOS(oldflp4) \
-    ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
+       ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 
 #define IP_MAX_MTU     0xFFF0
 
 
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
 static int ip_rt_redirect_number __read_mostly = 9;
 static int ip_rt_redirect_load __read_mostly   = HZ / 50;
@@ -131,6 +133,10 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly                = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly      = 256;
 static int rt_chain_length_max __read_mostly   = 20;
+static int redirect_genid;
+
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
 
 /*
  *     Interface to generic destination cache.
@@ -416,9 +422,13 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
        else {
                struct rtable *r = v;
                struct neighbour *n;
-               int len;
+               int len, HHUptod;
 
+               rcu_read_lock();
                n = dst_get_neighbour(&r->dst);
+               HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
+               rcu_read_unlock();
+
                seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
                              "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
                        r->dst.dev ? r->dst.dev->name : "*",
@@ -432,7 +442,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
                              dst_metric(&r->dst, RTAX_RTTVAR)),
                        r->rt_key_tos,
                        -1,
-                       (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
+                       HHUptod,
                        r->rt_spec_dst, &len);
 
                seq_printf(seq, "%*s\n", 127 - len, "");
@@ -825,6 +835,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
        return ONE;
 }
 
+static void rt_check_expire(void)
+{
+       static unsigned int rover;
+       unsigned int i = rover, goal;
+       struct rtable *rth;
+       struct rtable __rcu **rthp;
+       unsigned long samples = 0;
+       unsigned long sum = 0, sum2 = 0;
+       unsigned long delta;
+       u64 mult;
+
+       delta = jiffies - expires_ljiffies;
+       expires_ljiffies = jiffies;
+       mult = ((u64)delta) << rt_hash_log;
+       if (ip_rt_gc_timeout > 1)
+               do_div(mult, ip_rt_gc_timeout);
+       goal = (unsigned int)mult;
+       if (goal > rt_hash_mask)
+               goal = rt_hash_mask + 1;
+       for (; goal > 0; goal--) {
+               unsigned long tmo = ip_rt_gc_timeout;
+               unsigned long length;
+
+               i = (i + 1) & rt_hash_mask;
+               rthp = &rt_hash_table[i].chain;
+
+               if (need_resched())
+                       cond_resched();
+
+               samples++;
+
+               if (rcu_dereference_raw(*rthp) == NULL)
+                       continue;
+               length = 0;
+               spin_lock_bh(rt_hash_lock_addr(i));
+               while ((rth = rcu_dereference_protected(*rthp,
+                                       lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+                       prefetch(rth->dst.rt_next);
+                       if (rt_is_expired(rth)) {
+                               *rthp = rth->dst.rt_next;
+                               rt_free(rth);
+                               continue;
+                       }
+                       if (rth->dst.expires) {
+                               /* Entry is expired even if it is in use */
+                               if (time_before_eq(jiffies, rth->dst.expires)) {
+nofree:
+                                       tmo >>= 1;
+                                       rthp = &rth->dst.rt_next;
+                                       /*
+                                        * We only count entries on
+                                        * a chain with equal hash inputs once
+                                        * so that entries for different QOS
+                                        * levels, and other non-hash input
+                                        * attributes don't unfairly skew
+                                        * the length computation
+                                        */
+                                       length += has_noalias(rt_hash_table[i].chain, rth);
+                                       continue;
+                               }
+                       } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+                               goto nofree;
+
+                       /* Cleanup aged off entries. */
+                       *rthp = rth->dst.rt_next;
+                       rt_free(rth);
+               }
+               spin_unlock_bh(rt_hash_lock_addr(i));
+               sum += length;
+               sum2 += length*length;
+       }
+       if (samples) {
+               unsigned long avg = sum / samples;
+               unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+               rt_chain_length_max = max_t(unsigned long,
+                                       ip_rt_gc_elasticity,
+                                       (avg + 4*sd) >> FRACT_BITS);
+       }
+       rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * we call rt_check_expire() to scan part of the hash table
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+       rt_check_expire();
+       schedule_delayed_work(&expires_work, ip_rt_gc_interval);
+}
+
 /*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -837,6 +938,7 @@ static void rt_cache_invalidate(struct net *net)
 
        get_random_bytes(&shuffle, sizeof(shuffle));
        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
+       redirect_genid++;
 }
 
 /*
@@ -1304,7 +1406,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
        spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
 {
        struct rtable *rt = (struct rtable *) dst;
        __be32 orig_gw = rt->rt_gateway;
@@ -1315,21 +1417,19 @@ static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
        rt->rt_gateway = peer->redirect_learned.a4;
 
        n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
-       if (IS_ERR(n))
-               return PTR_ERR(n);
+       if (IS_ERR(n)) {
+               rt->rt_gateway = orig_gw;
+               return;
+       }
        old_n = xchg(&rt->dst._neighbour, n);
        if (old_n)
                neigh_release(old_n);
-       if (!n || !(n->nud_state & NUD_VALID)) {
-               if (n)
-                       neigh_event_send(n, NULL);
-               rt->rt_gateway = orig_gw;
-               return -EAGAIN;
+       if (!(n->nud_state & NUD_VALID)) {
+               neigh_event_send(n, NULL);
        } else {
                rt->rt_flags |= RTCF_REDIRECTED;
                call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
        }
-       return 0;
 }
 
 /* called in rcu_read_lock() section */
@@ -1391,8 +1491,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 
                                peer = rt->peer;
                                if (peer) {
-                                       if (peer->redirect_learned.a4 != new_gw) {
+                                       if (peer->redirect_learned.a4 != new_gw ||
+                                           peer->redirect_genid != redirect_genid) {
                                                peer->redirect_learned.a4 = new_gw;
+                                               peer->redirect_genid = redirect_genid;
                                                atomic_inc(&__rt_peer_genid);
                                        }
                                        check_peer_redir(&rt->dst, peer);
@@ -1685,12 +1787,8 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 }
 
 
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+static void ipv4_validate_peer(struct rtable *rt)
 {
-       struct rtable *rt = (struct rtable *) dst;
-
-       if (rt_is_expired(rt))
-               return NULL;
        if (rt->rt_peer_genid != rt_peer_genid()) {
                struct inet_peer *peer;
 
@@ -1699,17 +1797,26 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 
                peer = rt->peer;
                if (peer) {
-                       check_peer_pmtu(dst, peer);
+                       check_peer_pmtu(&rt->dst, peer);
 
+                       if (peer->redirect_genid != redirect_genid)
+                               peer->redirect_learned.a4 = 0;
                        if (peer->redirect_learned.a4 &&
-                           peer->redirect_learned.a4 != rt->rt_gateway) {
-                               if (check_peer_redir(dst, peer))
-                                       return NULL;
-                       }
+                           peer->redirect_learned.a4 != rt->rt_gateway)
+                               check_peer_redir(&rt->dst, peer);
                }
 
                rt->rt_peer_genid = rt_peer_genid();
        }
+}
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+{
+       struct rtable *rt = (struct rtable *) dst;
+
+       if (rt_is_expired(rt))
+               return NULL;
+       ipv4_validate_peer(rt);
        return dst;
 }
 
@@ -1857,6 +1964,8 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
                dst_init_metrics(&rt->dst, peer->metrics, false);
 
                check_peer_pmtu(&rt->dst, peer);
+               if (peer->redirect_genid != redirect_genid)
+                       peer->redirect_learned.a4 = 0;
                if (peer->redirect_learned.a4 &&
                    peer->redirect_learned.a4 != rt->rt_gateway) {
                        rt->rt_gateway = peer->redirect_learned.a4;
@@ -2362,6 +2471,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                    rth->rt_mark == skb->mark &&
                    net_eq(dev_net(rth->dst.dev), net) &&
                    !rt_is_expired(rth)) {
+                       ipv4_validate_peer(rth);
                        if (noref) {
                                dst_use_noref(&rth->dst, jiffies);
                                skb_dst_set_noref(skb, &rth->dst);
@@ -2420,11 +2530,11 @@ EXPORT_SYMBOL(ip_route_input_common);
 static struct rtable *__mkroute_output(const struct fib_result *res,
                                       const struct flowi4 *fl4,
                                       __be32 orig_daddr, __be32 orig_saddr,
-                                      int orig_oif, struct net_device *dev_out,
+                                      int orig_oif, __u8 orig_rtos,
+                                      struct net_device *dev_out,
                                       unsigned int flags)
 {
        struct fib_info *fi = res->fi;
-       u32 tos = RT_FL_TOS(fl4);
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
@@ -2475,7 +2585,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
        rth->rt_genid = rt_genid(dev_net(dev_out));
        rth->rt_flags   = flags;
        rth->rt_type    = type;
-       rth->rt_key_tos = tos;
+       rth->rt_key_tos = orig_rtos;
        rth->rt_dst     = fl4->daddr;
        rth->rt_src     = fl4->saddr;
        rth->rt_route_iif = 0;
@@ -2525,7 +2635,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 {
        struct net_device *dev_out = NULL;
-       u32 tos = RT_FL_TOS(fl4);
+       __u8 tos = RT_FL_TOS(fl4);
        unsigned int flags = 0;
        struct fib_result res;
        struct rtable *rth;
@@ -2701,7 +2811,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 
 make_route:
        rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
-                              dev_out, flags);
+                              tos, dev_out, flags);
        if (!IS_ERR(rth)) {
                unsigned int hash;
 
@@ -2737,6 +2847,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
                            (IPTOS_RT_MASK | RTO_ONLINK)) &&
                    net_eq(dev_net(rth->dst.dev), net) &&
                    !rt_is_expired(rth)) {
+                       ipv4_validate_peer(rth);
                        dst_use(&rth->dst, jiffies);
                        RT_CACHE_STAT_INC(out_hit);
                        rcu_read_unlock_bh();
@@ -3163,6 +3274,13 @@ static ctl_table ipv4_route_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
+       {
+               .procname       = "gc_interval",
+               .data           = &ip_rt_gc_interval,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_jiffies,
+       },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
@@ -3373,6 +3491,11 @@ int __init ip_rt_init(void)
        devinet_init();
        ip_fib_init();
 
+       INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+       expires_ljiffies = jiffies;
+       schedule_delayed_work(&expires_work,
+               net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+
        if (ip_rt_proc_init())
                printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM