ipv4: disable bh while doing route gc
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 94cdbc5..8e79a9e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -151,6 +151,9 @@ static void          ipv4_link_failure(struct sk_buff *skb);
 static void             ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
 
+static void __rt_garbage_collect(struct work_struct *w);
+static DECLARE_WORK(rt_gc_worker, __rt_garbage_collect);
+
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
 {
@@ -939,6 +942,7 @@ static void rt_cache_invalidate(struct net *net)
        get_random_bytes(&shuffle, sizeof(shuffle));
        atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
        redirect_genid++;
+       inetpeer_invalidate_tree(AF_INET);
 }
 
 /*
@@ -978,12 +982,13 @@ static void rt_emergency_hash_rebuild(struct net *net)
    and when load increases it reduces to limit cache size.
  */
 
-static int rt_garbage_collect(struct dst_ops *ops)
+static void __do_rt_garbage_collect(int elasticity, int min_interval)
 {
        static unsigned long expire = RT_GC_TIMEOUT;
        static unsigned long last_gc;
        static int rover;
        static int equilibrium;
+       static DEFINE_SPINLOCK(rt_gc_lock);
        struct rtable *rth;
        struct rtable __rcu **rthp;
        unsigned long now = jiffies;
@@ -995,9 +1000,11 @@ static int rt_garbage_collect(struct dst_ops *ops)
         * do not make it too frequently.
         */
 
+       spin_lock_bh(&rt_gc_lock);
+
        RT_CACHE_STAT_INC(gc_total);
 
-       if (now - last_gc < ip_rt_gc_min_interval &&
+       if (now - last_gc < min_interval &&
            entries < ip_rt_max_size) {
                RT_CACHE_STAT_INC(gc_ignored);
                goto out;
@@ -1005,7 +1012,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
 
        entries = dst_entries_get_slow(&ipv4_dst_ops);
        /* Calculate number of entries, which we want to expire now. */
-       goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
+       goal = entries - (elasticity << rt_hash_log);
        if (goal <= 0) {
                if (equilibrium < ipv4_dst_ops.gc_thresh)
                        equilibrium = ipv4_dst_ops.gc_thresh;
@@ -1022,7 +1029,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
                equilibrium = entries - goal;
        }
 
-       if (now - last_gc >= ip_rt_gc_min_interval)
+       if (now - last_gc >= min_interval)
                last_gc = now;
 
        if (goal <= 0) {
@@ -1087,15 +1094,34 @@ static int rt_garbage_collect(struct dst_ops *ops)
        if (net_ratelimit())
                printk(KERN_WARNING "dst cache overflow\n");
        RT_CACHE_STAT_INC(gc_dst_overflow);
-       return 1;
+       goto out;
 
 work_done:
-       expire += ip_rt_gc_min_interval;
+       expire += min_interval;
        if (expire > ip_rt_gc_timeout ||
            dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
            dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
                expire = ip_rt_gc_timeout;
-out:   return 0;
+out:
+       spin_unlock_bh(&rt_gc_lock);
+}
+
+static void __rt_garbage_collect(struct work_struct *w)
+{
+       __do_rt_garbage_collect(ip_rt_gc_elasticity, ip_rt_gc_min_interval);
+}
+
+static int rt_garbage_collect(struct dst_ops *ops)
+{
+       if (!work_pending(&rt_gc_worker))
+               schedule_work(&rt_gc_worker);
+
+       if (dst_entries_get_fast(&ipv4_dst_ops) >= ip_rt_max_size ||
+           dst_entries_get_slow(&ipv4_dst_ops) >= ip_rt_max_size) {
+               RT_CACHE_STAT_INC(gc_dst_overflow);
+               return 1;
+       }
+       return 0;
 }
 
 /*
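
The hunk above splits the collector in two: __do_rt_garbage_collect() does the heavy scan under rt_gc_lock with BH disabled and takes the elasticity and minimum interval as arguments instead of reading the sysctls, while rt_garbage_collect() only schedules the work item and reports overflow pressure. As a rough illustration of what the parameterization buys, here is a userspace sketch of the goal computation; the hash size and entry count are made-up numbers for the example, and 8 is the usual ip_rt_gc_elasticity default, none of it taken from the patch itself.

#include <stdio.h>

/* Same shape as: goal = entries - (elasticity << rt_hash_log); */
static int gc_goal(int entries, int elasticity, int rt_hash_log)
{
	return entries - (elasticity << rt_hash_log);
}

int main(void)
{
	int rt_hash_log = 10;	/* assume a 1024-bucket route hash */
	int entries = 12000;	/* assume this many cached routes */

	/* periodic GC with the usual elasticity of 8 ... */
	printf("default goal = %d\n", gc_goal(entries, 8, rt_hash_log)); /* 3808 */
	/* ... versus the emergency call __do_rt_garbage_collect(1, 0) */
	printf("urgent goal  = %d\n", gc_goal(entries, 1, rt_hash_log)); /* 10976 */
	return 0;
}

Forcing elasticity to 1 makes the collector target most of the cache at once, which is what the emergency call site in rt_intern_hash() below relies on.
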
@@ -1152,7 +1178,7 @@ static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
        unsigned long   now;
        u32             min_score;
        int             chain_length;
-       int attempts = !in_softirq();
+       int attempts = 1;
 
 restart:
        chain_length = 0;
@@ -1289,14 +1315,15 @@ restart:
                           can be released. Try to shrink route cache,
                           it is most likely it holds some neighbour records.
                         */
-                       if (attempts-- > 0) {
-                               int saved_elasticity = ip_rt_gc_elasticity;
-                               int saved_int = ip_rt_gc_min_interval;
-                               ip_rt_gc_elasticity     = 1;
-                               ip_rt_gc_min_interval   = 0;
-                               rt_garbage_collect(&ipv4_dst_ops);
-                               ip_rt_gc_min_interval   = saved_int;
-                               ip_rt_gc_elasticity     = saved_elasticity;
+                       if (!in_softirq() && attempts-- > 0) {
+                               static DEFINE_SPINLOCK(lock);
+
+                               if (spin_trylock(&lock)) {
+                                       __do_rt_garbage_collect(1, 0);
+                                       spin_unlock(&lock);
+                               } else {
+                                       spin_unlock_wait(&lock);
+                               }
                                goto restart;
                        }
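
Rather than temporarily rewriting the ip_rt_gc_elasticity/ip_rt_gc_min_interval sysctls, the neighbour-overflow path now calls __do_rt_garbage_collect(1, 0) directly, outside softirq context, and the local trylock lets exactly one caller run the scan while the others simply wait for it to finish (spin_unlock_wait) and then retry the insert. A userspace analog of that trylock-or-wait idea, with pthread primitives standing in for the spinlock and a hypothetical do_expensive_cleanup() standing in for the route-cache GC:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cleanup_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical stand-in for the expensive route-cache scan. */
static void do_expensive_cleanup(void)
{
	puts("scanning and evicting entries");
}

static void shrink_cache_once(void)
{
	if (pthread_mutex_trylock(&cleanup_lock) == 0) {
		do_expensive_cleanup();		/* we won the race: run the GC */
		pthread_mutex_unlock(&cleanup_lock);
	} else {
		/* Someone else is already collecting; just wait for them,
		 * roughly what spin_unlock_wait() does in the patch. */
		pthread_mutex_lock(&cleanup_lock);
		pthread_mutex_unlock(&cleanup_lock);
	}
	/* The caller then retries its insertion, like the "goto restart" above. */
}

int main(void)
{
	shrink_cache_once();
	return 0;
}

The point of the else branch is that a waiter benefits from the collection the winner just performed, so repeating the scan would only add latency.
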
 
@@ -1343,46 +1370,53 @@ void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
                rt->rt_peer_genid = rt_peer_genid();
 }
 
-/*
- * Peer allocation may fail only in serious out-of-memory conditions.  However
- * we still can generate some output.
- * Random ID selection looks a bit dangerous because we have no chances to
- * select ID being unique in a reasonable period of time.
- * But broken packet identifier may be better than no packet at all.
+#define IP_IDENTS_SZ 2048u
+struct ip_ident_bucket {
+       atomic_t        id;
+       u32             stamp32;
+};
+
+static struct ip_ident_bucket *ip_idents __read_mostly;
+
+/* In order to protect privacy, we add a perturbation to identifiers
+ * if one generator is seldom used. This makes it hard for an attacker
+ * to infer how many packets were sent between two points in time.
  */
-static void ip_select_fb_ident(struct iphdr *iph)
+u32 ip_idents_reserve(u32 hash, int segs)
 {
-       static DEFINE_SPINLOCK(ip_fb_id_lock);
-       static u32 ip_fallback_id;
-       u32 salt;
+       struct ip_ident_bucket *bucket = ip_idents + hash % IP_IDENTS_SZ;
+       u32 old = ACCESS_ONCE(bucket->stamp32);
+       u32 now = (u32)jiffies;
+       u32 delta = 0;
 
-       spin_lock_bh(&ip_fb_id_lock);
-       salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
-       iph->id = htons(salt & 0xFFFF);
-       ip_fallback_id = salt;
-       spin_unlock_bh(&ip_fb_id_lock);
+       if (old != now && cmpxchg(&bucket->stamp32, old, now) == old) {
+               u64 x = random32();
+
+               x *= (now - old);
+               delta = (u32)(x >> 32);
+       }
+
+       return atomic_add_return(segs + delta, &bucket->id) - segs;
 }
+EXPORT_SYMBOL(ip_idents_reserve);
 
-void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
+void __ip_select_ident(struct iphdr *iph, int segs)
 {
-       struct rtable *rt = (struct rtable *) dst;
+       static u32 ip_idents_hashrnd __read_mostly;
+       static bool hashrnd_initialized = false;
+       u32 hash, id;
 
-       if (rt && !(rt->dst.flags & DST_NOPEER)) {
-               if (rt->peer == NULL)
-                       rt_bind_peer(rt, rt->rt_dst, 1);
-
-               /* If peer is attached to destination, it is never detached,
-                  so that we need not to grab a lock to dereference it.
-                */
-               if (rt->peer) {
-                       iph->id = htons(inet_getid(rt->peer, more));
-                       return;
-               }
-       } else if (!rt)
-               printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
-                      __builtin_return_address(0));
+       if (unlikely(!hashrnd_initialized)) {
+               hashrnd_initialized = true;
+               get_random_bytes(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+       }
 
-       ip_select_fb_ident(iph);
+       hash = jhash_3words((__force u32)iph->daddr,
+                           (__force u32)iph->saddr,
+                           iph->protocol,
+                           ip_idents_hashrnd);
+       id = ip_idents_reserve(hash, segs);
+       iph->id = htons(id);
 }
 EXPORT_SYMBOL(__ip_select_ident);
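
The replacement generator spreads flows over IP_IDENTS_SZ counters, selecting a bucket by hashing (daddr, saddr, protocol) with a lazily seeded secret, and whenever a bucket has sat idle it advances the counter by a random amount proportional to the idle time, so an observer can no longer infer how many packets were sent between two probes. Below is a userspace sketch of the reservation step using C11 atomics; the array size, time source and PRNG are simplifications for illustration, not the kernel code.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define IDENTS_SZ 2048u

struct ident_bucket {
	atomic_uint id;		/* next identifier for this bucket */
	atomic_uint stamp32;	/* last use, in seconds for this sketch */
};

static struct ident_bucket idents[IDENTS_SZ];

static uint32_t idents_reserve(uint32_t hash, uint32_t segs)
{
	struct ident_bucket *b = &idents[hash % IDENTS_SZ];
	unsigned int old = atomic_load(&b->stamp32);
	unsigned int now = (unsigned int)time(NULL);
	uint32_t delta = 0;

	/* Only the caller that moves the stamp forward adds the random gap. */
	if (old != now &&
	    atomic_compare_exchange_strong(&b->stamp32, &old, now)) {
		uint64_t x = (uint32_t)rand();	/* stand-in for random32() */
		delta = (uint32_t)((x * (now - old)) >> 32);
	}

	/* Skip the random gap, then hand out 'segs' consecutive IDs;
	 * the first reserved ID is returned, as in ip_idents_reserve(). */
	return atomic_fetch_add(&b->id, segs + delta) + delta;
}

int main(void)
{
	srand((unsigned int)time(NULL));
	printf("id = %u\n", idents_reserve(0x12345678u, 1) & 0xffff);
	return 0;
}

Because the random secret feeds the hash, an off-path host also cannot choose which bucket a given flow lands in.
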
 
@@ -2141,7 +2175,7 @@ static int __mkroute_input(struct sk_buff *skb,
        struct in_device *out_dev;
        unsigned int flags = 0;
        __be32 spec_dst;
-       u32 itag;
+       u32 itag = 0;
 
        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
@@ -2726,7 +2760,7 @@ static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
-               if (fl4->saddr) {
+               if (!fl4->saddr) {
                        if (ipv4_is_multicast(fl4->daddr))
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              fl4->flowi4_scope);
@@ -3022,7 +3056,6 @@ static int rt_fill_info(struct net *net,
        error = rt->dst.error;
        if (peer) {
                inet_peer_refcheck(rt->peer);
-               id = atomic_read(&peer->ip_id_count) & 0xffff;
                if (peer->tcp_ts_stamp) {
                        ts = peer->tcp_ts;
                        tsage = get_seconds() - peer->tcp_ts_stamp;
@@ -3454,6 +3487,12 @@ int __init ip_rt_init(void)
 {
        int rc = 0;
 
+       ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
+       if (!ip_idents)
+               panic("IP: failed to allocate ip_idents\n");
+
+       get_random_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+
 #ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)