tcp: enforce tcp_min_snd_mss in tcp_mtu_probing()
[pandora-kernel.git] net/ipv4/route.c
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 46af623..d3e45fc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -91,6 +91,7 @@
 #include <linux/rcupdate.h>
 #include <linux/times.h>
 #include <linux/slab.h>
+#include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
@@ -120,6 +121,7 @@
 
 static int ip_rt_max_size;
 static int ip_rt_gc_timeout __read_mostly      = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
 static int ip_rt_redirect_number __read_mostly = 9;
 static int ip_rt_redirect_load __read_mostly   = HZ / 50;
@@ -133,6 +135,9 @@ static int ip_rt_min_advmss __read_mostly   = 256;
 static int rt_chain_length_max __read_mostly   = 20;
 static int redirect_genid;
 
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
+
 /*
  *     Interface to generic destination cache.
  */
@@ -146,6 +151,9 @@ static void          ipv4_link_failure(struct sk_buff *skb);
 static void             ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
 
+static void __rt_garbage_collect(struct work_struct *w);
+static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);
+
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
 {
@@ -830,6 +838,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
        return ONE;
 }
 
+static void rt_check_expire(void)
+{
+       static unsigned int rover;
+       unsigned int i = rover, goal;
+       struct rtable *rth;
+       struct rtable __rcu **rthp;
+       unsigned long samples = 0;
+       unsigned long sum = 0, sum2 = 0;
+       unsigned long delta;
+       u64 mult;
+
+       delta = jiffies - expires_ljiffies;
+       expires_ljiffies = jiffies;
+       mult = ((u64)delta) << rt_hash_log;
+       if (ip_rt_gc_timeout > 1)
+               do_div(mult, ip_rt_gc_timeout);
+       goal = (unsigned int)mult;
+       if (goal > rt_hash_mask)
+               goal = rt_hash_mask + 1;
+       for (; goal > 0; goal--) {
+               unsigned long tmo = ip_rt_gc_timeout;
+               unsigned long length;
+
+               i = (i + 1) & rt_hash_mask;
+               rthp = &rt_hash_table[i].chain;
+
+               if (need_resched())
+                       cond_resched();
+
+               samples++;
+
+               if (rcu_dereference_raw(*rthp) == NULL)
+                       continue;
+               length = 0;
+               spin_lock_bh(rt_hash_lock_addr(i));
+               while ((rth = rcu_dereference_protected(*rthp,
+                                       lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+                       prefetch(rth->dst.rt_next);
+                       if (rt_is_expired(rth)) {
+                               *rthp = rth->dst.rt_next;
+                               rt_free(rth);
+                               continue;
+                       }
+                       if (rth->dst.expires) {
+                               /* Entry is expired even if it is in use */
+                               if (time_before_eq(jiffies, rth->dst.expires)) {
+nofree:
+                                       tmo >>= 1;
+                                       rthp = &rth->dst.rt_next;
+                                       /*
+                                        * We only count entries on
+                                        * a chain with equal hash inputs once
+                                        * so that entries for different QOS
+                                        * levels, and other non-hash input
+                                        * attributes don't unfairly skew
+                                        * the length computation
+                                        */
+                                       length += has_noalias(rt_hash_table[i].chain, rth);
+                                       continue;
+                               }
+                       } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+                               goto nofree;
+
+                       /* Cleanup aged off entries. */
+                       *rthp = rth->dst.rt_next;
+                       rt_free(rth);
+               }
+               spin_unlock_bh(rt_hash_lock_addr(i));
+               sum += length;
+               sum2 += length*length;
+       }
+       if (samples) {
+               unsigned long avg = sum / samples;
+               unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+               rt_chain_length_max = max_t(unsigned long,
+                                       ip_rt_gc_elasticity,
+                                       (avg + 4*sd) >> FRACT_BITS);
+       }
+       rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * We call rt_check_expire() to scan part of the hash table.
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+       rt_check_expire();
+       schedule_delayed_work(&expires_work, ip_rt_gc_interval);
+}
+
 /*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
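
The rt_check_expire() scan added above is deliberately partial: each run visits `goal` buckets, where goal grows with the time elapsed since the previous run (delta << rt_hash_log, divided by ip_rt_gc_timeout and capped at the table size), and the chain lengths it observes, accumulated in FRACT_BITS fixed point through has_noalias(), retune rt_chain_length_max to the mean plus four standard deviations. The userspace sketch below models only that arithmetic; it assumes FRACT_BITS is 3 and an ip_rt_gc_elasticity of 8 (neither definition is part of this hunk), and int_sqrt() here is a naive stand-in for the kernel helper.

    /* Model of the rt_check_expire() bookkeeping only: how many buckets one
     * run scans, and how the chain-length statistics retune
     * rt_chain_length_max.  FRACT_BITS = 3 and elasticity = 8 are assumed
     * defaults, not taken from this diff; int_sqrt() is a naive stand-in. */
    #include <stdio.h>

    #define FRACT_BITS 3
    #define ONE (1UL << FRACT_BITS)

    static unsigned long int_sqrt(unsigned long x)
    {
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
            r++;
        return r;
    }

    int main(void)
    {
        unsigned long delta = 250;      /* jiffies since the previous run */
        unsigned int rt_hash_log = 10;  /* 2^10 = 1024 hash buckets */
        unsigned long gc_timeout = 300; /* ip_rt_gc_timeout, in jiffies */
        /* One full sweep of the table is spread over gc_timeout jiffies. */
        unsigned long goal = (delta << rt_hash_log) / gc_timeout;

        /* Pretend every scanned chain held 5 non-aliased entries. */
        unsigned long samples = goal, sum = 0, sum2 = 0, len = 5 * ONE, i;

        for (i = 0; i < samples; i++) {
            sum += len;
            sum2 += len * len;
        }
        if (samples) {
            unsigned long avg = sum / samples;
            unsigned long sd = int_sqrt(sum2 / samples - avg * avg);
            unsigned long elasticity = 8;
            unsigned long max = (avg + 4 * sd) >> FRACT_BITS;

            if (max < elasticity)
                max = elasticity;
            printf("goal=%lu buckets, rt_chain_length_max=%lu\n", goal, max);
        }
        return 0;
    }

A machine that stays idle longer than ip_rt_gc_timeout therefore ends up rescanning the whole table on the next work run, since goal is clamped to rt_hash_mask + 1.
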
@@ -843,6 +942,7 @@ static void rt_cache_invalidate(struct net *net)
        get_random_bytes(&shuffle, sizeof(shuffle));
        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
        redirect_genid++;
+       inetpeer_invalidate_tree(AF_INET);
 }
 
 /*
@@ -882,12 +982,13 @@ static void rt_emergency_hash_rebuild(struct net *net)
    and when load increases it reduces to limit cache size.
  */
 
-static int rt_garbage_collect(struct dst_ops *ops)
+static void __do_rt_garbage_collect(int elasticity, int min_interval)
 {
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
+       static DEFINE_SPINLOCK(rt_gc_lock);
        struct rtable *rth;
        struct rtable __rcu **rthp;
        unsigned long now = jiffies;
@@ -899,9 +1000,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
         * do not make it too frequently.
         */
 
+       spin_lock_bh(&rt_gc_lock);
+
        RT_CACHE_STAT_INC(gc_total);
 
-       if (now - last_gc < ip_rt_gc_min_interval &&
+       if (now - last_gc < min_interval &&
            entries < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
@@ -909,7 +1012,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
 
        entries = dst_entries_get_slow(&ipv4_dst_ops);
        /* Calculate number of entries, which we want to expire now. */
-       goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
+       goal = entries - (elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
@@ -926,7 +1029,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
                equilibrium = entries - goal;
        }
 
-       if (now - last_gc >= ip_rt_gc_min_interval)
+       if (now - last_gc >= min_interval)
                last_gc = now;
 
        if (goal <= 0) {
@@ -991,15 +1094,34 @@ static int rt_garbage_collect(struct dst_ops *ops)
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
-       return 1;
+       goto out;
 
 work_done:
-       expire += ip_rt_gc_min_interval;
+       expire += min_interval;
        if (expire > ip_rt_gc_timeout ||
            dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
            dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
-out:   return 0;
+out:
+       spin_unlock_bh(&rt_gc_lock);
+}
+
+static void __rt_garbage_collect(struct work_struct *w)
+{
+       __do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
+}
+
+static int rt_garbage_collect(struct dst_ops *ops)
+{
+       if (!work_pending(&rt_gc_worker))
+               schedule_work(&rt_gc_worker);
+
+       if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
+           dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
+               RT_CACHE_STAT_INC(gc_dst_overflow);
+               return 1;
+       }
+       return 0;
 }
 
 /*
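
With the hunk above, the dst_ops gc callback no longer prunes the cache synchronously (it can be invoked from softirq context via dst_alloc()); it schedules a regular work item and reports failure only when the cache has genuinely reached ip_rt_max_size. Below is a minimal sketch of that contract with made-up numbers; rt_garbage_collect_model() and schedule_gc_work() are hypothetical stand-ins, not kernel interfaces.

    /* Sketch of the new rt_garbage_collect() contract: kick a deferred
     * worker, never scan inline, and fail the allocation only on a real
     * overflow.  Names and numbers here are illustrative, not kernel API. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool gc_work_pending;
    static unsigned long entries;
    static const unsigned long max_size = 16384;  /* ip_rt_max_size stand-in */

    static void schedule_gc_work(void)
    {
        /* In the kernel: schedule_work(&rt_gc_worker); the actual pruning
         * then runs later, in process context. */
        gc_work_pending = true;
    }

    static int rt_garbage_collect_model(void)
    {
        if (!gc_work_pending)
            schedule_gc_work();

        if (entries >= max_size)
            return 1;   /* caller's dst_alloc() fails: cache overflow */
        return 0;
    }

    int main(void)
    {
        entries = 9000;
        printf("below limit -> %d\n", rt_garbage_collect_model());
        entries = 20000;
        printf("above limit -> %d\n", rt_garbage_collect_model());
        return 0;
    }

The only synchronous effect left is the return value: a nonzero result makes dst_alloc() refuse the new entry, while the real work happens in __rt_garbage_collect() with its own spinlock serializing the static expire/equilibrium state.
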
@@ -1056,7 +1178,7 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
        unsigned long   now;
        u32             min_score;
        int             chain_length;
-       int attempts = !in_softirq();
+       int attempts = 1;
 
 restart:
        chain_length = 0;
@@ -1193,14 +1315,15 @@ restart:
                           can be released. Try to shrink route cache,
                           it is most likely it holds some neighbour records.
                         */
-                       if (attempts-- > 0) {
-                               int saved_elasticity = ip_rt_gc_elasticity;
-                               int saved_int = ip_rt_gc_min_interval;
-                               ip_rt_gc_elasticity     = 1;
-                               ip_rt_gc_min_interval   = 0;
-                               rt_garbage_collect(&ipv4_dst_ops);
-                               ip_rt_gc_min_interval   = saved_int;
-                               ip_rt_gc_elasticity     = saved_elasticity;
+                       if (!in_softirq() && attempts-- > 0) {
+                               static DEFINE_SPINLOCK(lock);
+
+                               if (spin_trylock(&lock)) {
+                                       __do_rt_garbage_collect(1, 0);
+                                       spin_unlock(&lock);
+                               } else {
+                                       spin_unlock_wait(&lock);
+                               }
                                goto restart;
                        }
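
The rt_intern_hash() change above also replaces the old trick of temporarily rewriting ip_rt_gc_elasticity and ip_rt_gc_min_interval with a direct __do_rt_garbage_collect(1, 0) call, guarded so that only one CPU performs the emergency cleanup while the others merely wait for it before retrying. A small pthread model of that trylock-or-wait idea (spin_unlock_wait() is approximated by lock-then-unlock; this is a sketch, not the kernel primitive):

    /* Only one thread runs the expensive emergency GC; the others neither
     * skip it nor repeat it, they just wait for the winner and then retry. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;

    static void emergency_gc(void)
    {
        puts("running emergency gc");   /* __do_rt_garbage_collect(1, 0) */
    }

    static void *alloc_path(void *arg)
    {
        (void)arg;
        if (pthread_mutex_trylock(&gc_lock) == 0) {
            emergency_gc();
            pthread_mutex_unlock(&gc_lock);
        } else {
            /* Rough analogue of spin_unlock_wait(): block until the winner
             * is done without redoing the work. */
            pthread_mutex_lock(&gc_lock);
            pthread_mutex_unlock(&gc_lock);
        }
        /* ...then "goto restart" and retry the hash insertion. */
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        int i;

        for (i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, alloc_path, NULL);
        for (i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        return 0;
    }
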
 
@@ -1247,46 +1370,53 @@ void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
                rt->rt_peer_genid = rt_peer_genid();
 }
 
-/*
- * Peer allocation may fail only in serious out-of-memory conditions.  However
- * we still can generate some output.
- * Random ID selection looks a bit dangerous because we have no chances to
- * select ID being unique in a reasonable period of time.
- * But broken packet identifier may be better than no packet at all.
+#define IP_IDENTS_SZ 2048u
+struct ip_ident_bucket {
+       atomic_t        id;
+       u32             stamp32;
+};
+
+static struct ip_ident_bucket *ip_idents __read_mostly;
+
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes it hard for an attacker
+ * to infer how many packets were sent between two points in time.
  */
-static void ip_select_fb_ident(struct iphdr *iph)
+u32 ip_idents_reserve(u32 hash, int segs)
 {
-       static DEFINE_SPINLOCK(ip_fb_id_lock);
-       static u32 ip_fallback_id;
-       u32 salt;
+       struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
+       u32 old = ACCESS_ONCE(bucket->stamp32);
+       u32 now = (u32)jiffies;
+       u32 delta = 0;
+
+       if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
+               u64 x = random32();
+
+               x *= (now - old);
+               delta = (u32)(x >> 32);
+       }
 
-       spin_lock_bh(&ip_fb_id_lock);
-       salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
-       iph->id = htons(salt & 0xFFFF);
-       ip_fallback_id = salt;
-       spin_unlock_bh(&ip_fb_id_lock);
+       return atomic_add_return(segs + delta, &bucket->id) - segs;
 }
+EXPORT_SYMBOL(ip_idents_reserve);
 
-void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
+void __ip_select_ident(struct iphdr *iph, int segs)
 {
-       struct rtable *rt = (struct rtable *) dst;
-
-       if (rt) {
-               if (rt->peer == NULL)
-                       rt_bind_peer(rt, rt->rt_dst, 1);
+       static u32 ip_idents_hashrnd __read_mostly;
+       static bool hashrnd_initialized = false;
+       u32 hash, id;
 
-               /* If peer is attached to destination, it is never detached,
-                  so that we need not to grab a lock to dereference it.
-                */
-               if (rt->peer) {
-                       iph->id = htons(inet_getid(rt->peer, more));
-                       return;
-               }
-       } else
-               printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
-                      __builtin_return_address(0));
+       if (unlikely(!hashrnd_initialized)) {
+               hashrnd_initialized = true;
+               get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+       }
 
-       ip_select_fb_ident(iph);
+       hash = jhash_3words((__force u32)iph->daddr,
+                           (__force u32)iph->saddr,
+                           iph->protocol,
+                           ip_idents_hashrnd);
+       id = ip_idents_reserve(hash, segs);
+       iph->id = htons(id);
 }
 EXPORT_SYMBOL(__ip_select_ident);
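
ip_idents_reserve() and the new __ip_select_ident() replace the per-destination inetpeer counter with IP_IDENTS_SZ shared generators: the flow hash selects a bucket, and a bucket that has been idle is advanced by a random amount bounded by its idle time, so consecutive IDs from one flow no longer reveal how many packets the host sent in between. A single-threaded userspace sketch of that computation follows; the kernel version uses cmpxchg(), atomic_add_return() and random32(), and seeds the bucket array with get_random_bytes() (see the ip_rt_init() hunk further down), while rand() here is only a stand-in.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    #define IP_IDENTS_SZ 2048u

    struct ip_ident_bucket {
        uint32_t id;        /* atomic_t in the kernel */
        uint32_t stamp32;   /* low 32 bits of jiffies at last use */
    };

    static struct ip_ident_bucket ip_idents[IP_IDENTS_SZ];

    /* "now" stands in for (u32)jiffies, "hash" for jhash_3words() over
     * (daddr, saddr, protocol) keyed with ip_idents_hashrnd. */
    static uint32_t ip_idents_reserve_model(uint32_t hash, int segs, uint32_t now)
    {
        struct ip_ident_bucket *bucket = &ip_idents[hash % IP_IDENTS_SZ];
        uint32_t old = bucket->stamp32;
        uint32_t delta = 0;

        if (old != now) {
            /* Bucket was idle: jump the counter by a random amount no
             * larger than the idle time (kernel: cmpxchg + random32()).
             * rand() is narrower than random32(), so this model
             * under-perturbs; it only shows the shape of the computation. */
            bucket->stamp32 = now;
            delta = (uint32_t)(((uint64_t)rand() * (now - old)) >> 32);
        }

        bucket->id += (uint32_t)segs + delta;
        return bucket->id - (uint32_t)segs;  /* first id of the reserved run */
    }

    int main(void)
    {
        srand((unsigned int)time(NULL));

        /* Reserve IDs for a 3-segment GSO burst; the on-wire field keeps
         * only the low 16 bits, as in iph->id = htons(id). */
        uint32_t id = ip_idents_reserve_model(0x1234u, 3, 1000u);

        printf("first id: %u\n", (unsigned int)(uint16_t)id);
        return 0;
    }
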
 
@@ -2045,7 +2175,7 @@ static int __mkroute_input(struct sk_buff *skb,
        struct in_device *out_dev;
        unsigned int flags = 0;
        __be32 spec_dst;
-       u32 itag;
+       u32 itag = 0;
 
        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
@@ -2474,6 +2604,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
                 */
                if (fi && res->prefixlen < 4)
                        fi = NULL;
+       } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+                  (orig_oif != dev_out->ifindex)) {
+               /* For local routes that require a particular output interface
+                * we do not want to cache the result.  Caching the result
+                * causes incorrect behaviour when there are multiple source
+                * addresses on the interface, the end result being that if the
+                * intended recipient is waiting on that interface for the
+                * packet he won't receive it because it will be delivered on
+                * the loopback interface and the IP_PKTINFO ipi_ifindex will
+                * be set to the loopback interface as well.
+                */
+               fi = NULL;
        }
 
        rth = rt_dst_alloc(dev_out,
@@ -2630,7 +2772,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
-               if (fl4->saddr) {
+               if (!fl4->saddr) {
                        if (ipv4_is_multicast(fl4->daddr))
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              fl4->flowi4_scope);
@@ -2926,7 +3068,6 @@ static int rt_fill_info(struct net *net,
        error = rt->dst.error;
        if (peer) {
                inet_peer_refcheck(rt->peer);
-               id = atomic_read(&peer->ip_id_count) & 0xffff;
                if (peer->tcp_ts_stamp) {
                        ts = peer->tcp_ts;
                        tsage = get_seconds() - peer->tcp_ts_stamp;
@@ -2948,7 +3089,8 @@ static int rt_fill_info(struct net *net,
                    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
                        int err = ipmr_get_route(net, skb,
                                                 rt->rt_src, rt->rt_dst,
-                                                r, nowait);
+                                                r, nowait, pid);
+
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
@@ -3178,6 +3320,13 @@ static ctl_table ipv4_route_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
+       {
+               .procname       = "gc_interval",
+               .data           = &ip_rt_gc_interval,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_jiffies,
+       },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
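
The gc_interval knob is registered with proc_dointvec_jiffies, so the value is stored in jiffies but read and written as seconds through procfs: the 60 * HZ default reads back as 60, and it paces how often rt_worker_func() reschedules itself. A small example of reading it, assuming the conventional /proc/sys/net/ipv4/route/ location:

    /* Read the interval back from userspace; prints e.g. "60 s" with the
     * default.  Path assumed; adjust if the sysctl tree lives elsewhere. */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/ipv4/route/gc_interval", "r");
        int seconds = 0;

        if (!f) {
            perror("gc_interval");
            return 1;
        }
        if (fscanf(f, "%d", &seconds) == 1)
            printf("route cache scan interval: %d s\n", seconds);
        fclose(f);
        return 0;
    }
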
@@ -3351,6 +3500,12 @@ int __init ip_rt_init(void)
 {
        int rc = 0;
 
+       ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+       if (!ip_idents)
+               panic("IP: failed to allocate ip_idents\n");
+
+       get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
 #ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
@@ -3388,6 +3543,11 @@ int __init ip_rt_init(void)
        devinet_init();
        ip_fib_init();
 
+       INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+       expires_ljiffies = jiffies;
+       schedule_delayed_work(&expires_work,
+               net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+
        if (ip_rt_proc_init())
                printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM