tcp/dccp: get rid of central timewait timer
author Eric Dumazet <edumazet@google.com>
Mon, 13 Apr 2015 01:51:09 +0000 (18:51 -0700)
committer David S. Miller <davem@davemloft.net>
Mon, 13 Apr 2015 20:40:05 +0000 (16:40 -0400)
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.

This does not scale: the code is ugly and is a source of huge latencies.
(Typically 30 ms have been seen, with cpus spinning on the death_lock
spinlock.)

We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.

Tested:

In the following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24).

Before patch :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171

While the test is running, we can observe 25 or even 33 ms latencies.

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2

After patch :

About 90% increase of throughput :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992

And latencies are kept to minimal values during this load, even
though network utilization is 90% higher :

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/inet_timewait_sock.h
net/dccp/minisocks.c
net/ipv4/inet_diag.c
net/ipv4/inet_hashtables.c
net/ipv4/inet_timewait_sock.c
net/ipv4/proc.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv6/inet6_hashtables.c
net/ipv6/tcp_ipv6.c
net/netfilter/xt_TPROXY.c

index b7ce100..360c480 100644 (file)
 
 struct inet_hashinfo;
 
-#define INET_TWDR_RECYCLE_SLOTS_LOG    5
-#define INET_TWDR_RECYCLE_SLOTS                (1 << INET_TWDR_RECYCLE_SLOTS_LOG)
-
-/*
- * If time > 4sec, it is "slow" path, no recycling is required,
- * so that we select tick to get range about 4 seconds.
- */
-#if HZ <= 16 || HZ > 4096
-# error Unsupported: HZ <= 16 or HZ > 4096
-#elif HZ <= 32
-# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 64
-# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 128
-# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 256
-# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 512
-# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 1024
-# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 2048
-# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#else
-# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#endif
-
-static inline u32 inet_tw_time_stamp(void)
-{
-       return jiffies;
-}
-
-/* TIME_WAIT reaping mechanism. */
-#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
-
-#define INET_TWDR_TWKILL_QUOTA 100
-
 struct inet_timewait_death_row {
-       /* Short-time timewait calendar */
-       int                     twcal_hand;
-       unsigned long           twcal_jiffie;
-       struct timer_list       twcal_timer;
-       struct hlist_head       twcal_row[INET_TWDR_RECYCLE_SLOTS];
-
-       spinlock_t              death_lock;
-       int                     tw_count;
-       int                     period;
-       u32                     thread_slots;
-       struct work_struct      twkill_work;
-       struct timer_list       tw_timer;
-       int                     slot;
-       struct hlist_head       cells[INET_TWDR_TWKILL_SLOTS];
-       struct inet_hashinfo    *hashinfo;
+       atomic_t                tw_count;
+
+       struct inet_hashinfo    *hashinfo ____cacheline_aligned_in_smp;
        int                     sysctl_tw_recycle;
        int                     sysctl_max_tw_buckets;
 };
 
-void inet_twdr_hangman(unsigned long data);
-void inet_twdr_twkill_work(struct work_struct *work);
-void inet_twdr_twcal_tick(unsigned long data);
-
 struct inet_bind_bucket;
 
 /*
@@ -133,52 +80,18 @@ struct inet_timewait_sock {
        __be16                  tw_sport;
        kmemcheck_bitfield_begin(flags);
        /* And these are ours. */
-       unsigned int            tw_pad0         : 1,    /* 1 bit hole */
+       unsigned int            tw_kill         : 1,
                                tw_transparent  : 1,
                                tw_flowlabel    : 20,
                                tw_pad          : 2,    /* 2 bits hole */
                                tw_tos          : 8;
        kmemcheck_bitfield_end(flags);
-       u32                     tw_ttd;
+       struct timer_list       tw_timer;
        struct inet_bind_bucket *tw_tb;
-       struct hlist_node       tw_death_node;
+       struct inet_timewait_death_row *tw_dr;
 };
 #define tw_tclass tw_tos
 
-static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
-{
-       return !hlist_unhashed(&tw->tw_death_node);
-}
-
-static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
-{
-       tw->tw_death_node.pprev = NULL;
-}
-
-static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
-{
-       __hlist_del(&tw->tw_death_node);
-       inet_twsk_dead_node_init(tw);
-}
-
-static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
-{
-       if (inet_twsk_dead_hashed(tw)) {
-               __inet_twsk_del_dead_node(tw);
-               return 1;
-       }
-       return 0;
-}
-
-#define inet_twsk_for_each(tw, node, head) \
-       hlist_nulls_for_each_entry(tw, node, head, tw_node)
-
-#define inet_twsk_for_each_inmate(tw, jail) \
-       hlist_for_each_entry(tw, jail, tw_death_node)
-
-#define inet_twsk_for_each_inmate_safe(tw, safe, jail) \
-       hlist_for_each_entry_safe(tw, safe, jail, tw_death_node)
-
 static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
 {
        return (struct inet_timewait_sock *)sk;
@@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
                          struct inet_hashinfo *hashinfo);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+                                          struct inet_timewait_death_row *dr,
                                           const int state);
 
 void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
                           struct inet_hashinfo *hashinfo);
 
-void inet_twsk_schedule(struct inet_timewait_sock *tw,
-                       struct inet_timewait_death_row *twdr,
-                       const int timeo, const int timewait_len);
-void inet_twsk_deschedule(struct inet_timewait_sock *tw,
-                         struct inet_timewait_death_row *twdr);
+void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo);
+void inet_twsk_deschedule(struct inet_timewait_sock *tw);
 
 void inet_twsk_purge(struct inet_hashinfo *hashinfo,
                     struct inet_timewait_death_row *twdr, int family);
index 332f7d6..5f56666 100644 (file)
 
 struct inet_timewait_death_row dccp_death_row = {
        .sysctl_max_tw_buckets = NR_FILE * 2,
-       .period         = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
-       .death_lock     = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
        .hashinfo       = &dccp_hashinfo,
-       .tw_timer       = TIMER_INITIALIZER(inet_twdr_hangman, 0,
-                                           (unsigned long)&dccp_death_row),
-       .twkill_work    = __WORK_INITIALIZER(dccp_death_row.twkill_work,
-                                            inet_twdr_twkill_work),
-/* Short-time timewait calendar */
-
-       .twcal_hand     = -1,
-       .twcal_timer    = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
-                                           (unsigned long)&dccp_death_row),
 };
 
 EXPORT_SYMBOL_GPL(dccp_death_row);
 
 void dccp_time_wait(struct sock *sk, int state, int timeo)
 {
-       struct inet_timewait_sock *tw = NULL;
+       struct inet_timewait_sock *tw;
 
-       if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
-               tw = inet_twsk_alloc(sk, state);
+       tw = inet_twsk_alloc(sk, &dccp_death_row, state);
 
        if (tw != NULL) {
                const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -71,8 +59,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
                if (state == DCCP_TIME_WAIT)
                        timeo = DCCP_TIMEWAIT_LEN;
 
-               inet_twsk_schedule(tw, &dccp_death_row, timeo,
-                                  DCCP_TIMEWAIT_LEN);
+               inet_twsk_schedule(tw, timeo);
                inet_twsk_put(tw);
        } else {
                /* Sorry, if we're out of memory, just CLOSE this
index 76322c9..70e8b3c 100644 (file)
@@ -248,7 +248,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
        struct inet_timewait_sock *tw = inet_twsk(sk);
        struct inet_diag_msg *r;
        struct nlmsghdr *nlh;
-       s32 tmo;
+       long tmo;
 
        nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
                        nlmsg_flags);
@@ -258,7 +258,7 @@ static int inet_twsk_diag_fill(struct sock *sk,
        r = nlmsg_data(nlh);
        BUG_ON(tw->tw_state != TCP_TIME_WAIT);
 
-       tmo = tw->tw_ttd - inet_tw_time_stamp();
+       tmo = tw->tw_timer.expires - jiffies;
        if (tmo < 0)
                tmo = 0;
 
index d4630bf..c6fb80b 100644 (file)
@@ -388,7 +388,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
                *twp = tw;
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
-               inet_twsk_deschedule(tw, death_row);
+               inet_twsk_deschedule(tw);
 
                inet_twsk_put(tw);
        }
@@ -565,7 +565,7 @@ ok:
                spin_unlock(&head->lock);
 
                if (tw) {
-                       inet_twsk_deschedule(tw, death_row);
+                       inet_twsk_deschedule(tw);
                        while (twrefcnt) {
                                twrefcnt--;
                                inet_twsk_put(tw);
index 118f0f1..00ec8d5 100644 (file)
@@ -67,9 +67,9 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 }
 
 /* Must be called with locally disabled BHs. */
-static void __inet_twsk_kill(struct inet_timewait_sock *tw,
-                            struct inet_hashinfo *hashinfo)
+static void inet_twsk_kill(struct inet_timewait_sock *tw)
 {
+       struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
        struct inet_bind_hashbucket *bhead;
        int refcnt;
        /* Unlink from established hashes. */
@@ -89,6 +89,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 
        BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
        atomic_sub(refcnt, &tw->tw_refcnt);
+       atomic_dec(&tw->tw_dr->tw_count);
+       inet_twsk_put(tw);
 }
 
 void inet_twsk_free(struct inet_timewait_sock *tw)
@@ -168,16 +170,34 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 }
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
 
-struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+void tw_timer_handler(unsigned long data)
 {
-       struct inet_timewait_sock *tw =
-               kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
-                                GFP_ATOMIC);
+       struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
+
+       if (tw->tw_kill)
+               NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
+       else
+               NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
+       inet_twsk_kill(tw);
+}
+
+struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+                                          struct inet_timewait_death_row *dr,
+                                          const int state)
+{
+       struct inet_timewait_sock *tw;
+
+       if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+               return NULL;
+
+       tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+                             GFP_ATOMIC);
        if (tw) {
                const struct inet_sock *inet = inet_sk(sk);
 
                kmemcheck_annotate_bitfield(tw, flags);
 
+               tw->tw_dr           = dr;
                /* Give us an identity. */
                tw->tw_daddr        = inet->inet_daddr;
                tw->tw_rcv_saddr    = inet->inet_rcv_saddr;
@@ -196,13 +216,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
                tw->tw_prot         = sk->sk_prot_creator;
                atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
                twsk_net_set(tw, sock_net(sk));
+               setup_timer(&tw->tw_timer, tw_timer_handler, (unsigned long)tw);
                /*
                 * Because we use RCU lookups, we should not set tw_refcnt
                 * to a non null value before everything is setup for this
                 * timewait socket.
                 */
                atomic_set(&tw->tw_refcnt, 0);
-               inet_twsk_dead_node_init(tw);
+
                __module_get(tw->tw_prot->owner);
        }
 
@@ -210,139 +231,20 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 }
 EXPORT_SYMBOL_GPL(inet_twsk_alloc);
 
-/* Returns non-zero if quota exceeded.  */
-static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
-                                   const int slot)
-{
-       struct inet_timewait_sock *tw;
-       unsigned int killed;
-       int ret;
-
-       /* NOTE: compare this to previous version where lock
-        * was released after detaching chain. It was racy,
-        * because tw buckets are scheduled in not serialized context
-        * in 2.3 (with netfilter), and with softnet it is common, because
-        * soft irqs are not sequenced.
-        */
-       killed = 0;
-       ret = 0;
-rescan:
-       inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
-               __inet_twsk_del_dead_node(tw);
-               spin_unlock(&twdr->death_lock);
-               __inet_twsk_kill(tw, twdr->hashinfo);
-#ifdef CONFIG_NET_NS
-               NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
-#endif
-               inet_twsk_put(tw);
-               killed++;
-               spin_lock(&twdr->death_lock);
-               if (killed > INET_TWDR_TWKILL_QUOTA) {
-                       ret = 1;
-                       break;
-               }
-
-               /* While we dropped twdr->death_lock, another cpu may have
-                * killed off the next TW bucket in the list, therefore
-                * do a fresh re-read of the hlist head node with the
-                * lock reacquired.  We still use the hlist traversal
-                * macro in order to get the prefetches.
-                */
-               goto rescan;
-       }
-
-       twdr->tw_count -= killed;
-#ifndef CONFIG_NET_NS
-       NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
-#endif
-       return ret;
-}
-
-void inet_twdr_hangman(unsigned long data)
-{
-       struct inet_timewait_death_row *twdr;
-       unsigned int need_timer;
-
-       twdr = (struct inet_timewait_death_row *)data;
-       spin_lock(&twdr->death_lock);
-
-       if (twdr->tw_count == 0)
-               goto out;
-
-       need_timer = 0;
-       if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
-               twdr->thread_slots |= (1 << twdr->slot);
-               schedule_work(&twdr->twkill_work);
-               need_timer = 1;
-       } else {
-               /* We purged the entire slot, anything left?  */
-               if (twdr->tw_count)
-                       need_timer = 1;
-               twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
-       }
-       if (need_timer)
-               mod_timer(&twdr->tw_timer, jiffies + twdr->period);
-out:
-       spin_unlock(&twdr->death_lock);
-}
-EXPORT_SYMBOL_GPL(inet_twdr_hangman);
-
-void inet_twdr_twkill_work(struct work_struct *work)
-{
-       struct inet_timewait_death_row *twdr =
-               container_of(work, struct inet_timewait_death_row, twkill_work);
-       int i;
-
-       BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
-                       (sizeof(twdr->thread_slots) * 8));
-
-       while (twdr->thread_slots) {
-               spin_lock_bh(&twdr->death_lock);
-               for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
-                       if (!(twdr->thread_slots & (1 << i)))
-                               continue;
-
-                       while (inet_twdr_do_twkill_work(twdr, i) != 0) {
-                               if (need_resched()) {
-                                       spin_unlock_bh(&twdr->death_lock);
-                                       schedule();
-                                       spin_lock_bh(&twdr->death_lock);
-                               }
-                       }
-
-                       twdr->thread_slots &= ~(1 << i);
-               }
-               spin_unlock_bh(&twdr->death_lock);
-       }
-}
-EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
-
 /* These are always called from BH context.  See callers in
  * tcp_input.c to verify this.
  */
 
 /* This is for handling early-kills of TIME_WAIT sockets. */
-void inet_twsk_deschedule(struct inet_timewait_sock *tw,
-                         struct inet_timewait_death_row *twdr)
+void inet_twsk_deschedule(struct inet_timewait_sock *tw)
 {
-       spin_lock(&twdr->death_lock);
-       if (inet_twsk_del_dead_node(tw)) {
-               inet_twsk_put(tw);
-               if (--twdr->tw_count == 0)
-                       del_timer(&twdr->tw_timer);
-       }
-       spin_unlock(&twdr->death_lock);
-       __inet_twsk_kill(tw, twdr->hashinfo);
+       if (del_timer_sync(&tw->tw_timer))
+               inet_twsk_kill(tw);
 }
 EXPORT_SYMBOL(inet_twsk_deschedule);
 
-void inet_twsk_schedule(struct inet_timewait_sock *tw,
-                      struct inet_timewait_death_row *twdr,
-                      const int timeo, const int timewait_len)
+void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)
 {
-       struct hlist_head *list;
-       int slot;
-
        /* timeout := RTO * 3.5
         *
         * 3.5 = 1+2+0.5 to wait for two retransmits.
@@ -367,115 +269,15 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
         * is greater than TS tick!) and detect old duplicates with help
         * of PAWS.
         */
-       slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;
 
-       spin_lock(&twdr->death_lock);
-
-       /* Unlink it, if it was scheduled */
-       if (inet_twsk_del_dead_node(tw))
-               twdr->tw_count--;
-       else
+       tw->tw_kill = timeo <= 4*HZ;
+       if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) {
                atomic_inc(&tw->tw_refcnt);
-
-       if (slot >= INET_TWDR_RECYCLE_SLOTS) {
-               /* Schedule to slow timer */
-               if (timeo >= timewait_len) {
-                       slot = INET_TWDR_TWKILL_SLOTS - 1;
-               } else {
-                       slot = DIV_ROUND_UP(timeo, twdr->period);
-                       if (slot >= INET_TWDR_TWKILL_SLOTS)
-                               slot = INET_TWDR_TWKILL_SLOTS - 1;
-               }
-               tw->tw_ttd = inet_tw_time_stamp() + timeo;
-               slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
-               list = &twdr->cells[slot];
-       } else {
-               tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);
-
-               if (twdr->twcal_hand < 0) {
-                       twdr->twcal_hand = 0;
-                       twdr->twcal_jiffie = jiffies;
-                       twdr->twcal_timer.expires = twdr->twcal_jiffie +
-                                             (slot << INET_TWDR_RECYCLE_TICK);
-                       add_timer(&twdr->twcal_timer);
-               } else {
-                       if (time_after(twdr->twcal_timer.expires,
-                                      jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
-                               mod_timer(&twdr->twcal_timer,
-                                         jiffies + (slot << INET_TWDR_RECYCLE_TICK));
-                       slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
-               }
-               list = &twdr->twcal_row[slot];
+               atomic_inc(&tw->tw_dr->tw_count);
        }
-
-       hlist_add_head(&tw->tw_death_node, list);
-
-       if (twdr->tw_count++ == 0)
-               mod_timer(&twdr->tw_timer, jiffies + twdr->period);
-       spin_unlock(&twdr->death_lock);
 }
 EXPORT_SYMBOL_GPL(inet_twsk_schedule);
 
-void inet_twdr_twcal_tick(unsigned long data)
-{
-       struct inet_timewait_death_row *twdr;
-       int n, slot;
-       unsigned long j;
-       unsigned long now = jiffies;
-       int killed = 0;
-       int adv = 0;
-
-       twdr = (struct inet_timewait_death_row *)data;
-
-       spin_lock(&twdr->death_lock);
-       if (twdr->twcal_hand < 0)
-               goto out;
-
-       slot = twdr->twcal_hand;
-       j = twdr->twcal_jiffie;
-
-       for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
-               if (time_before_eq(j, now)) {
-                       struct hlist_node *safe;
-                       struct inet_timewait_sock *tw;
-
-                       inet_twsk_for_each_inmate_safe(tw, safe,
-                                                      &twdr->twcal_row[slot]) {
-                               __inet_twsk_del_dead_node(tw);
-                               __inet_twsk_kill(tw, twdr->hashinfo);
-#ifdef CONFIG_NET_NS
-                               NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
-#endif
-                               inet_twsk_put(tw);
-                               killed++;
-                       }
-               } else {
-                       if (!adv) {
-                               adv = 1;
-                               twdr->twcal_jiffie = j;
-                               twdr->twcal_hand = slot;
-                       }
-
-                       if (!hlist_empty(&twdr->twcal_row[slot])) {
-                               mod_timer(&twdr->twcal_timer, j);
-                               goto out;
-                       }
-               }
-               j += 1 << INET_TWDR_RECYCLE_TICK;
-               slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
-       }
-       twdr->twcal_hand = -1;
-
-out:
-       if ((twdr->tw_count -= killed) == 0)
-               del_timer(&twdr->tw_timer);
-#ifndef CONFIG_NET_NS
-       NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
-#endif
-       spin_unlock(&twdr->death_lock);
-}
-EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
-
 void inet_twsk_purge(struct inet_hashinfo *hashinfo,
                     struct inet_timewait_death_row *twdr, int family)
 {
@@ -509,7 +311,7 @@ restart:
 
                        rcu_read_unlock();
                        local_bh_disable();
-                       inet_twsk_deschedule(tw, twdr);
+                       inet_twsk_deschedule(tw);
                        local_bh_enable();
                        inet_twsk_put(tw);
                        goto restart_rcu;
index d8953ef..e1f3b91 100644 (file)
@@ -63,7 +63,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
        socket_seq_show(seq);
        seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
                   sock_prot_inuse_get(net, &tcp_prot), orphans,
-                  tcp_death_row.tw_count, sockets,
+                  atomic_read(&tcp_death_row.tw_count), sockets,
                   proto_memory_allocated(&tcp_prot));
        seq_printf(seq, "UDP: inuse %d mem %ld\n",
                   sock_prot_inuse_get(net, &udp_prot),
index 37578d5..3571f2b 100644 (file)
@@ -1685,7 +1685,7 @@ do_time_wait:
                                                        iph->daddr, th->dest,
                                                        inet_iif(skb));
                if (sk2) {
-                       inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+                       inet_twsk_deschedule(inet_twsk(sk));
                        inet_twsk_put(inet_twsk(sk));
                        sk = sk2;
                        goto process;
@@ -2242,9 +2242,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
                               struct seq_file *f, int i)
 {
+       long delta = tw->tw_timer.expires - jiffies;
        __be32 dest, src;
        __u16 destp, srcp;
-       s32 delta = tw->tw_ttd - inet_tw_time_stamp();
 
        dest  = tw->tw_daddr;
        src   = tw->tw_rcv_saddr;
index 2088fdc..63d6311 100644 (file)
@@ -34,18 +34,7 @@ int sysctl_tcp_abort_on_overflow __read_mostly;
 
 struct inet_timewait_death_row tcp_death_row = {
        .sysctl_max_tw_buckets = NR_FILE * 2,
-       .period         = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
-       .death_lock     = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
        .hashinfo       = &tcp_hashinfo,
-       .tw_timer       = TIMER_INITIALIZER(inet_twdr_hangman, 0,
-                                           (unsigned long)&tcp_death_row),
-       .twkill_work    = __WORK_INITIALIZER(tcp_death_row.twkill_work,
-                                            inet_twdr_twkill_work),
-/* Short-time timewait calendar */
-
-       .twcal_hand     = -1,
-       .twcal_timer    = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
-                                           (unsigned long)&tcp_death_row),
 };
 EXPORT_SYMBOL_GPL(tcp_death_row);
 
@@ -158,7 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
                if (!th->fin ||
                    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
 kill_with_rst:
-                       inet_twsk_deschedule(tw, &tcp_death_row);
+                       inet_twsk_deschedule(tw);
                        inet_twsk_put(tw);
                        return TCP_TW_RST;
                }
@@ -174,11 +163,9 @@ kill_with_rst:
                if (tcp_death_row.sysctl_tw_recycle &&
                    tcptw->tw_ts_recent_stamp &&
                    tcp_tw_remember_stamp(tw))
-                       inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
-                                          TCP_TIMEWAIT_LEN);
+                       inet_twsk_schedule(tw, tw->tw_timeout);
                else
-                       inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-                                          TCP_TIMEWAIT_LEN);
+                       inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
                return TCP_TW_ACK;
        }
 
@@ -211,13 +198,12 @@ kill_with_rst:
                         */
                        if (sysctl_tcp_rfc1337 == 0) {
 kill:
-                               inet_twsk_deschedule(tw, &tcp_death_row);
+                               inet_twsk_deschedule(tw);
                                inet_twsk_put(tw);
                                return TCP_TW_SUCCESS;
                        }
                }
-               inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-                                  TCP_TIMEWAIT_LEN);
+               inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
 
                if (tmp_opt.saw_tstamp) {
                        tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
@@ -267,8 +253,7 @@ kill:
                 * Do not reschedule in the last case.
                 */
                if (paws_reject || th->ack)
-                       inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
-                                          TCP_TIMEWAIT_LEN);
+                       inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN);
 
                return tcp_timewait_check_oow_rate_limit(
                        tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
@@ -283,16 +268,15 @@ EXPORT_SYMBOL(tcp_timewait_state_process);
  */
 void tcp_time_wait(struct sock *sk, int state, int timeo)
 {
-       struct inet_timewait_sock *tw = NULL;
        const struct inet_connection_sock *icsk = inet_csk(sk);
        const struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_timewait_sock *tw;
        bool recycle_ok = false;
 
        if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp)
                recycle_ok = tcp_remember_stamp(sk);
 
-       if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets)
-               tw = inet_twsk_alloc(sk, state);
+       tw = inet_twsk_alloc(sk, &tcp_death_row, state);
 
        if (tw) {
                struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -355,8 +339,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
                                timeo = TCP_TIMEWAIT_LEN;
                }
 
-               inet_twsk_schedule(tw, &tcp_death_row, timeo,
-                                  TCP_TIMEWAIT_LEN);
+               inet_twsk_schedule(tw, timeo);
                inet_twsk_put(tw);
        } else {
                /* Sorry, if we're out of memory, just CLOSE this
index 033f178..871641b 100644 (file)
@@ -246,7 +246,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
                *twp = tw;
        } else if (tw) {
                /* Silly. Should hash-dance instead... */
-               inet_twsk_deschedule(tw, death_row);
+               inet_twsk_deschedule(tw);
 
                inet_twsk_put(tw);
        }
index f73a97f..ad51df8 100644 (file)
@@ -1486,7 +1486,7 @@ do_time_wait:
                                            ntohs(th->dest), tcp_v6_iif(skb));
                if (sk2) {
                        struct inet_timewait_sock *tw = inet_twsk(sk);
-                       inet_twsk_deschedule(tw, &tcp_death_row);
+                       inet_twsk_deschedule(tw);
                        inet_twsk_put(tw);
                        sk = sk2;
                        tcp_v6_restore_cb(skb);
@@ -1728,9 +1728,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
 static void get_timewait6_sock(struct seq_file *seq,
                               struct inet_timewait_sock *tw, int i)
 {
+       long delta = tw->tw_timer.expires - jiffies;
        const struct in6_addr *dest, *src;
        __u16 destp, srcp;
-       s32 delta = tw->tw_ttd - inet_tw_time_stamp();
 
        dest = &tw->tw_v6_daddr;
        src  = &tw->tw_v6_rcv_saddr;
index c205b26..cca96ce 100644 (file)
@@ -272,7 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport,
                                            hp->source, lport ? lport : hp->dest,
                                            skb->dev, NFT_LOOKUP_LISTENER);
                if (sk2) {
-                       inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+                       inet_twsk_deschedule(inet_twsk(sk));
                        inet_twsk_put(inet_twsk(sk));
                        sk = sk2;
                }
@@ -437,7 +437,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
                                            tgi->lport ? tgi->lport : hp->dest,
                                            skb->dev, NFT_LOOKUP_LISTENER);
                if (sk2) {
-                       inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
+                       inet_twsk_deschedule(inet_twsk(sk));
                        inet_twsk_put(inet_twsk(sk));
                        sk = sk2;
                }