tcp/dccp: get rid of central timewait timer
author: Eric Dumazet <edumazet@google.com>
Mon, 13 Apr 2015 01:51:09 +0000 (18:51 -0700)
committer: David S. Miller <davem@davemloft.net>
Mon, 13 Apr 2015 20:40:05 +0000 (16:40 -0400)
Using a timer wheel for timewait sockets was nice ~15 years ago when
memory was expensive and machines had a single processor.

This does not scale: the code is ugly and a source of huge latencies
(typically 30 ms have been seen, with cpus spinning on the death_lock spinlock).

We can afford to use an extra 64 bytes per timewait sock and spread
timewait load to all cpus to have better behavior.

Tested:

In the following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1
on the target (lpaa24).

Before patch :

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
419594

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
437171

While the test is running, we can observe latencies of 25 or even 33 ms.

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms
rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms
rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2

After patch :

About a 90% increase in throughput:

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
810442

lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0
800992

And latencies are kept to minimal values during this load, even
though network utilization is 90% higher:

lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23
...
1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms
rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/inet_timewait_sock.h
net/dccp/minisocks.c
net/ipv4/inet_diag.c
net/ipv4/inet_hashtables.c
net/ipv4/inet_timewait_sock.c
net/ipv4/proc.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv6/inet6_hashtables.c
net/ipv6/tcp_ipv6.c
net/netfilter/xt_TPROXY.c

index b7ce100..360c480 100644 (file)
 
 struct inet_hashinfo;
 
-#define INET_TWDR_RECYCLE_SLOTS_LOG    5
-#define INET_TWDR_RECYCLE_SLOTS                (1 << INET_TWDR_RECYCLE_SLOTS_LOG)
-
-/*
- * If time > 4sec, it is "slow" path, no recycling is required,
- * so that we select tick to get range about 4 seconds.
- */
-#if HZ <= 16 || HZ > 4096
-# error Unsupported: HZ <= 16 or HZ > 4096
-#elif HZ <= 32
-# define INET_TWDR_RECYCLE_TICK (5 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 64
-# define INET_TWDR_RECYCLE_TICK (6 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 128
-# define INET_TWDR_RECYCLE_TICK (7 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 256
-# define INET_TWDR_RECYCLE_TICK (8 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 512
-# define INET_TWDR_RECYCLE_TICK (9 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 1024
-# define INET_TWDR_RECYCLE_TICK (10 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#elif HZ <= 2048
-# define INET_TWDR_RECYCLE_TICK (11 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#else
-# define INET_TWDR_RECYCLE_TICK (12 + 2 - INET_TWDR_RECYCLE_SLOTS_LOG)
-#endif
-
-static inline u32 inet_tw_time_stamp(void)
-{
-       return jiffies;
-}
-
-/* TIME_WAIT reaping mechanism. */
-#define INET_TWDR_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
-
-#define INET_TWDR_TWKILL_QUOTA 100
-
 struct inet_timewait_death_row {
-       /* Short-time timewait calendar */
-       int                     twcal_hand;
-       unsigned long           twcal_jiffie;
-       struct timer_list       twcal_timer;
-       struct hlist_head       twcal_row[INET_TWDR_RECYCLE_SLOTS];
-
-       spinlock_t              death_lock;
-       int                     tw_count;
-       int                     period;
-       u32                     thread_slots;
-       struct work_struct      twkill_work;
-       struct timer_list       tw_timer;
-       int                     slot;
-       struct hlist_head       cells[INET_TWDR_TWKILL_SLOTS];
-       struct inet_hashinfo    *hashinfo;
+       atomic_t                tw_count;
+
+       struct inet_hashinfo    *hashinfo ____cacheline_aligned_in_smp;
        int                     sysctl_tw_recycle;
        int                     sysctl_max_tw_buckets;
 };
 
-void inet_twdr_hangman(unsigned long data);
-void inet_twdr_twkill_work(struct work_struct *work);
-void inet_twdr_twcal_tick(unsigned long data);
-
 struct inet_bind_bucket;
 
 /*
@@ -133,52 +80,18 @@ struct inet_timewait_sock {
        __be16                  tw_sport;
        kmemcheck_bitfield_begin(flags);
        /* And these are ours. */
-       unsigned int            tw_pad0         : 1,    /* 1 bit hole */
+       unsigned int            tw_kill         : 1,
                                tw_transparent  : 1,
                                tw_flowlabel    : 20,
                                tw_pad          : 2,    /* 2 bits hole */
                                tw_tos          : 8;
        kmemcheck_bitfield_end(flags);
-       u32                     tw_ttd;
+       struct timer_list       tw_timer;
        struct inet_bind_bucket *tw_tb;
-       struct hlist_node       tw_death_node;
+       struct inet_timewait_death_row *tw_dr;
 };
 #define tw_tclass tw_tos
 
-static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
-{
-       return !hlist_unhashed(&tw->tw_death_node);
-}
-
-static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
-{
-       tw->tw_death_node.pprev = NULL;
-}
-
-static inline void __inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
-{
-       __hlist_del(&tw->tw_death_node);
-       inet_twsk_dead_node_init(tw);
-}
-
-static inline int inet_twsk_del_dead_node(struct inet_timewait_sock *tw)
-{
-       if (inet_twsk_dead_hashed(tw)) {
-               __inet_twsk_del_dead_node(tw);
-               return 1;
-       }
-       return 0;
-}
-
-#define inet_twsk_for_each(tw, node, head) \
-       hlist_nulls_for_each_entry(tw, node, head, tw_node)
-
-#define inet_twsk_for_each_inmate(tw, jail) \
-       hlist_for_each_entry(tw, jail, tw_death_node)
-
-#define inet_twsk_for_each_inmate_safe(tw, safe, jail) \
-       hlist_for_each_entry_safe(tw, safe, jail, tw_death_node)
-
 static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
 {
        return (struct inet_timewait_sock *)sk;
@@ -193,16 +106,14 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
                          struct inet_hashinfo *hashinfo);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
+                                          struct inet_timewait_death_row *dr,
                                           const int state);
 
 void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
                           struct inet_hashinfo *hashinfo);
 
-void inet_twsk_schedule(struct inet_timewait_sock *tw,
-                       struct inet_timewait_death_row *twdr,
-                       const int timeo, const int timewait_len);
-void inet_twsk_deschedule(struct inet_timewait_sock *tw,
-                         struct inet_timewait_death_row *twdr);
+void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo);
+void inet_twsk_deschedule(struct inet_timewait_sock *tw);
 
 void inet_twsk_purge(struct inet_hashinfo *hashinfo,
                     struct inet_timewait_death_row *twdr, int family);
index 332f7d6..5f56666 100644 (file)
 
 struct inet_timewait_death_row dccp_death_row = {
        .sysctl_max_tw_buckets = NR_FILE * 2,
-       .period         = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
-       .death_lock     = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
        .hashinfo       = &dccp_hashinfo,
-       .tw_timer       = TIMER_INITIALIZER(inet_twdr_hangman, 0,
-                                           (unsigned long)&dccp_death_row),
-       .twkill_work    = __WORK_INITIALIZER(dccp_death_row.twkill_work,
-                                            inet_twdr_twkill_work),
-/* Short-time timewait calendar */
-
-       .twcal_hand     = -1,
-       .twcal_timer    = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
-                                           (unsigned long)&dccp_death_row),
 };
 
 EXPORT_SYMBOL_GPL(dccp_death_row);
 
 void dccp_time_wait(struct sock *sk, int state, int timeo)
 {
-       struct inet_timewait_sock *tw = NULL;
+       struct inet_timewait_sock *tw;
 
-       if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
-               tw = inet_twsk_alloc(sk, state);
+       tw = inet_twsk_alloc(sk, &dccp_death_row, state);
 
        if (tw != NULL) {
                const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -71,8 +59,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
                if (state == DCCP_TIME_WAIT)
                        timeo = DCCP_TIMEWAIT_LEN;
 
-               inet_twsk_schedule(tw, &dccp_death_row, timeo,
-                                  DCCP_TIMEWAIT_LEN);
+               inet_twsk_schedule(tw, timeo);
                inet_twsk_put(tw);
        } else {
                /* Sorry, if we're out of memory, just CLOSE this
Simple merge
Simple merge
Simple merge
diff --cc net/ipv4/proc.c
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge