ipv4: using prefetch requires including prefetch.h
[pandora-kernel.git] / net / ipv4 / route.c
index ca5e237..85cc053 100644 (file)
@@ -91,6 +91,7 @@
 #include <linux/rcupdate.h>
 #include <linux/times.h>
 #include <linux/slab.h>
+#include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
 #include <net/secure_seq.h>
 
 #define RT_FL_TOS(oldflp4) \
-    ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
+       ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 
 #define IP_MAX_MTU     0xFFF0
 
 
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
 static int ip_rt_redirect_number __read_mostly = 9;
 static int ip_rt_redirect_load __read_mostly   = HZ / 50;
@@ -133,6 +135,9 @@ static int ip_rt_min_advmss __read_mostly   = 256;
 static int rt_chain_length_max __read_mostly   = 20;
 static int redirect_genid;
 
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
+
 /*
  *     Interface to generic destination cache.
  */
@@ -830,6 +835,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
        return ONE;
 }
 
+static void rt_check_expire(void)
+{
+       static unsigned int rover;
+       unsigned int i = rover, goal;
+       struct rtable *rth;
+       struct rtable __rcu **rthp;
+       unsigned long samples = 0;
+       unsigned long sum = 0, sum2 = 0;
+       unsigned long delta;
+       u64 mult;
+
+       delta = jiffies - expires_ljiffies;
+       expires_ljiffies = jiffies;
+       mult = ((u64)delta) << rt_hash_log;
+       if (ip_rt_gc_timeout > 1)
+               do_div(mult, ip_rt_gc_timeout);
+       goal = (unsigned int)mult;
+       if (goal > rt_hash_mask)
+               goal = rt_hash_mask + 1;
+       for (; goal > 0; goal--) {
+               unsigned long tmo = ip_rt_gc_timeout;
+               unsigned long length;
+
+               i = (i + 1) & rt_hash_mask;
+               rthp = &rt_hash_table[i].chain;
+
+               if (need_resched())
+                       cond_resched();
+
+               samples++;
+
+               if (rcu_dereference_raw(*rthp) == NULL)
+                       continue;
+               length = 0;
+               spin_lock_bh(rt_hash_lock_addr(i));
+               while ((rth = rcu_dereference_protected(*rthp,
+                                       lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+                       prefetch(rth->dst.rt_next);
+                       if (rt_is_expired(rth)) {
+                               *rthp = rth->dst.rt_next;
+                               rt_free(rth);
+                               continue;
+                       }
+                       if (rth->dst.expires) {
+                               /* Entry is expired even if it is in use */
+                               if (time_before_eq(jiffies, rth->dst.expires)) {
+nofree:
+                                       tmo >>= 1;
+                                       rthp = &rth->dst.rt_next;
+                                       /*
+                                        * We only count entries on
+                                        * a chain with equal hash inputs once
+                                        * so that entries for different QOS
+                                        * levels, and other non-hash input
+                                        * attributes don't unfairly skew
+                                        * the length computation
+                                        */
+                                       length += has_noalias(rt_hash_table[i].chain, rth);
+                                       continue;
+                               }
+                       } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+                               goto nofree;
+
+                       /* Cleanup aged off entries. */
+                       *rthp = rth->dst.rt_next;
+                       rt_free(rth);
+               }
+               spin_unlock_bh(rt_hash_lock_addr(i));
+               sum += length;
+               sum2 += length*length;
+       }
+       if (samples) {
+               unsigned long avg = sum / samples;
+               unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+               rt_chain_length_max = max_t(unsigned long,
+                                       ip_rt_gc_elasticity,
+                                       (avg + 4*sd) >> FRACT_BITS);
+       }
+       rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * we call rt_check_expire() to scan part of the hash table
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+       rt_check_expire();
+       schedule_delayed_work(&expires_work, ip_rt_gc_interval);
+}
+
 /*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -1310,7 +1406,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
        spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
-static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
+static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
 {
        struct rtable *rt = (struct rtable *) dst;
        __be32 orig_gw = rt->rt_gateway;
@@ -1321,21 +1417,19 @@ static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
        rt->rt_gateway = peer->redirect_learned.a4;
 
        n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
-       if (IS_ERR(n))
-               return PTR_ERR(n);
+       if (IS_ERR(n)) {
+               rt->rt_gateway = orig_gw;
+               return;
+       }
        old_n = xchg(&rt->dst._neighbour, n);
        if (old_n)
                neigh_release(old_n);
-       if (!n || !(n->nud_state & NUD_VALID)) {
-               if (n)
-                       neigh_event_send(n, NULL);
-               rt->rt_gateway = orig_gw;
-               return -EAGAIN;
+       if (!(n->nud_state & NUD_VALID)) {
+               neigh_event_send(n, NULL);
        } else {
                rt->rt_flags |= RTCF_REDIRECTED;
                call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
        }
-       return 0;
 }
 
 /* called in rcu_read_lock() section */
@@ -1693,7 +1787,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 }
 
 
-static struct rtable *ipv4_validate_peer(struct rtable *rt)
+static void ipv4_validate_peer(struct rtable *rt)
 {
        if (rt->rt_peer_genid != rt_peer_genid()) {
                struct inet_peer *peer;
@@ -1708,15 +1802,12 @@ static struct rtable *ipv4_validate_peer(struct rtable *rt)
                        if (peer->redirect_genid != redirect_genid)
                                peer->redirect_learned.a4 = 0;
                        if (peer->redirect_learned.a4 &&
-                           peer->redirect_learned.a4 != rt->rt_gateway) {
-                               if (check_peer_redir(&rt->dst, peer))
-                                       return NULL;
-                       }
+                           peer->redirect_learned.a4 != rt->rt_gateway)
+                               check_peer_redir(&rt->dst, peer);
                }
 
                rt->rt_peer_genid = rt_peer_genid();
        }
-       return rt;
 }
 
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
@@ -1725,7 +1816,7 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 
        if (rt_is_expired(rt))
                return NULL;
-       dst = (struct dst_entry *) ipv4_validate_peer(rt);
+       ipv4_validate_peer(rt);
        return dst;
 }
 
@@ -2380,9 +2471,7 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                    rth->rt_mark == skb->mark &&
                    net_eq(dev_net(rth->dst.dev), net) &&
                    !rt_is_expired(rth)) {
-                       rth = ipv4_validate_peer(rth);
-                       if (!rth)
-                               continue;
+                       ipv4_validate_peer(rth);
                        if (noref) {
                                dst_use_noref(&rth->dst, jiffies);
                                skb_dst_set_noref(skb, &rth->dst);
@@ -2441,11 +2530,11 @@ EXPORT_SYMBOL(ip_route_input_common);
 static struct rtable *__mkroute_output(const struct fib_result *res,
                                       const struct flowi4 *fl4,
                                       __be32 orig_daddr, __be32 orig_saddr,
-                                      int orig_oif, struct net_device *dev_out,
+                                      int orig_oif, __u8 orig_rtos,
+                                      struct net_device *dev_out,
                                       unsigned int flags)
 {
        struct fib_info *fi = res->fi;
-       u32 tos = RT_FL_TOS(fl4);
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
@@ -2496,7 +2585,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
        rth->rt_genid = rt_genid(dev_net(dev_out));
        rth->rt_flags   = flags;
        rth->rt_type    = type;
-       rth->rt_key_tos = tos;
+       rth->rt_key_tos = orig_rtos;
        rth->rt_dst     = fl4->daddr;
        rth->rt_src     = fl4->saddr;
        rth->rt_route_iif = 0;
@@ -2546,7 +2635,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 {
        struct net_device *dev_out = NULL;
-       u32 tos = RT_FL_TOS(fl4);
+       __u8 tos = RT_FL_TOS(fl4);
        unsigned int flags = 0;
        struct fib_result res;
        struct rtable *rth;
@@ -2722,7 +2811,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 
 make_route:
        rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
-                              dev_out, flags);
+                              tos, dev_out, flags);
        if (!IS_ERR(rth)) {
                unsigned int hash;
 
@@ -2758,9 +2847,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
                            (IPTOS_RT_MASK | RTO_ONLINK)) &&
                    net_eq(dev_net(rth->dst.dev), net) &&
                    !rt_is_expired(rth)) {
-                       rth = ipv4_validate_peer(rth);
-                       if (!rth)
-                               continue;
+                       ipv4_validate_peer(rth);
                        dst_use(&rth->dst, jiffies);
                        RT_CACHE_STAT_INC(out_hit);
                        rcu_read_unlock_bh();
@@ -3187,6 +3274,13 @@ static ctl_table ipv4_route_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
+       {
+               .procname       = "gc_interval",
+               .data           = &ip_rt_gc_interval,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_jiffies,
+       },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
@@ -3397,6 +3491,11 @@ int __init ip_rt_init(void)
        devinet_init();
        ip_fib_init();
 
+       INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+       expires_ljiffies = jiffies;
+       schedule_delayed_work(&expires_work,
+               net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+
        if (ip_rt_proc_init())
                printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM