xfrm: Reinject transport-mode packets through tasklet
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 46febca..9d0a94b 100644
@@ -374,7 +374,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 {
        unsigned int mask;
        struct sock *sk = sock->sk;
-       struct tcp_sock *tp = tcp_sk(sk);
+       const struct tcp_sock *tp = tcp_sk(sk);
 
        sock_poll_wait(file, sk_sleep(sk), wait);
        if (sk->sk_state == TCP_LISTEN)
@@ -481,14 +481,12 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
                         !tp->urg_data ||
                         before(tp->urg_seq, tp->copied_seq) ||
                         !before(tp->urg_seq, tp->rcv_nxt)) {
-                       struct sk_buff *skb;
 
                        answ = tp->rcv_nxt - tp->copied_seq;
 
-                       /* Subtract 1, if FIN is in queue. */
-                       skb = skb_peek_tail(&sk->sk_receive_queue);
-                       if (answ && skb)
-                               answ -= tcp_hdr(skb)->fin;
+                       /* Subtract 1, if FIN was received */
+                       if (answ && sock_flag(sk, SOCK_DONE))
+                               answ--;
                } else
                        answ = tp->urg_seq - tp->copied_seq;
                release_sock(sk);
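
The SIOCINQ hunk above is user-visible: the byte count now excludes a received FIN whenever the connection is done, rather than only when the FIN's skb still sits at the tail of the receive queue. A minimal userspace sketch of the interface being fixed (readable_bytes is our name, not kernel code):

    #include <sys/ioctl.h>
    #include <linux/sockios.h>      /* SIOCINQ == FIONREAD for sockets */

    int readable_bytes(int tcp_fd)
    {
            int n;

            if (ioctl(tcp_fd, SIOCINQ, &n) < 0)
                    return -1;
            return n;               /* unread payload bytes, FIN excluded */
    }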
@@ -524,11 +522,11 @@ EXPORT_SYMBOL(tcp_ioctl);
 
 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
-       TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
+       TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
        tp->pushed_seq = tp->write_seq;
 }
 
-static inline int forced_push(struct tcp_sock *tp)
+static inline int forced_push(const struct tcp_sock *tp)
 {
        return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 }
@@ -540,7 +538,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 
        skb->csum    = 0;
        tcb->seq     = tcb->end_seq = tp->write_seq;
-       tcb->flags   = TCPHDR_ACK;
+       tcb->tcp_flags = TCPHDR_ACK;
        tcb->sacked  = 0;
        skb_header_release(skb);
        tcp_add_write_queue_tail(sk, skb);
@@ -661,6 +659,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
                                ret = -EAGAIN;
                                break;
                        }
+                       /* if __tcp_splice_read() got nothing while we have
+                        * an skb in receive queue, we do not want to loop.
+                        * This might happen with URG data.
+                        */
+                       if (!skb_queue_empty(&sk->sk_receive_queue))
+                               break;
                        sk_wait_data(sk, &timeo);
                        if (signal_pending(current)) {
                                ret = sock_intr_errno(timeo);
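
The added check keeps tcp_splice_read() from spinning when __tcp_splice_read() returns nothing even though an skb (e.g. one carrying URG data) is queued. Userspace reaches this path via splice(2) on a TCP socket; a hedged sketch of such a caller (drain_to_pipe is ours):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Move up to 64 KiB from a connected TCP socket into a pipe
     * without copying through userspace.
     */
    ssize_t drain_to_pipe(int tcp_fd, int pipe_wr)
    {
            return splice(tcp_fd, NULL, pipe_wr, NULL, 64 * 1024,
                          SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
    }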
@@ -701,11 +705,12 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
        skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
        if (skb) {
                if (sk_wmem_schedule(sk, skb->truesize)) {
+                       skb_reserve(skb, sk->sk_prot->max_header);
                        /*
                         * Make sure that we have exactly size bytes
                         * available to the caller, no more, no less.
                         */
-                       skb_reserve(skb, skb_tailroom(skb) - size);
+                       skb->reserved_tailroom = skb->end - skb->tail - size;
                        return skb;
                }
                __kfree_skb(skb);
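
reserved_tailroom records how much tailroom must stay off limits to later appends, so the caller sees exactly size bytes of usable room. The companion helper used further down in this diff reads roughly as follows (a sketch; the exact definition depends on this tree's skbuff.h):

    static inline int skb_availroom(const struct sk_buff *skb)
    {
            if (skb_is_nonlinear(skb))
                    return 0;

            /* total tailroom minus the reserved slice */
            return skb->end - skb->tail - skb->reserved_tailroom;
    }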
@@ -813,7 +818,7 @@ new_segment:
                        goto wait_for_memory;
 
                if (can_coalesce) {
-                       skb_shinfo(skb)->frags[i - 1].size += copy;
+                       skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                } else {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, copy);
@@ -830,7 +835,7 @@ new_segment:
                skb_shinfo(skb)->gso_segs = 0;
 
                if (!copied)
-                       TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
+                       TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
 
                copied += copy;
                poffset += copy;
@@ -850,8 +855,7 @@ new_segment:
 wait_for_sndbuf:
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-               if (copied)
-                       tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+               tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 
                if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
                        goto do_error;
@@ -860,7 +864,7 @@ wait_for_memory:
        }
 
 out:
-       if (copied)
+       if (copied && !(flags & MSG_SENDPAGE_NOTLAST))
                tcp_push(sk, flags, mss_now, tp->nonagle);
        return copied;
 
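
MSG_SENDPAGE_NOTLAST lets a caller pushing a run of pages defer tcp_push() to the final page, while the tcp_push() in the wait_for_memory path becomes unconditional so that pages queued by earlier NOTLAST calls are still pushed before the task sleeps. A sketch of how a caller might flag such a run (send_page_run and its shape are ours, not the actual splice code):

    /* Send nr pages, marking all but the last with MSG_SENDPAGE_NOTLAST
     * so TCP coalesces them and pushes only once, at the end.
     */
    static ssize_t send_page_run(struct socket *sock, struct page **pages,
                                 int nr, size_t page_len)
    {
            ssize_t ret, sent = 0;
            int i;

            for (i = 0; i < nr; i++) {
                    int flags = (i < nr - 1) ? MSG_SENDPAGE_NOTLAST : 0;

                    ret = sock->ops->sendpage(sock, pages[i], 0,
                                              page_len, flags);
                    if (ret < 0)
                            return sent ? sent : ret;
                    sent += ret;
            }
            return sent;
    }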
@@ -891,9 +895,9 @@ EXPORT_SYMBOL(tcp_sendpage);
 #define TCP_PAGE(sk)   (sk->sk_sndmsg_page)
 #define TCP_OFF(sk)    (sk->sk_sndmsg_off)
 
-static inline int select_size(struct sock *sk, int sg)
+static inline int select_size(const struct sock *sk, int sg)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
+       const struct tcp_sock *tp = tcp_sk(sk);
        int tmp = tp->mss_cache;
 
        if (sg) {
@@ -995,10 +999,9 @@ new_segment:
                                copy = seglen;
 
                        /* Where to copy to? */
-                       if (skb_tailroom(skb) > 0) {
+                       if (skb_availroom(skb) > 0) {
                                /* We have some space in skb head. Superb! */
-                               if (copy > skb_tailroom(skb))
-                                       copy = skb_tailroom(skb);
+                               copy = min_t(int, copy, skb_availroom(skb));
                                err = skb_add_data_nocache(sk, skb, from, copy);
                                if (err)
                                        goto do_fault;
@@ -1058,8 +1061,7 @@ new_segment:
 
                                /* Update the skb. */
                                if (merge) {
-                                       skb_shinfo(skb)->frags[i - 1].size +=
-                                                                       copy;
+                                       skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                                } else {
                                        skb_fill_page_desc(skb, i, page, off, copy);
                                        if (TCP_PAGE(sk)) {
@@ -1074,7 +1076,7 @@ new_segment:
                        }
 
                        if (!copied)
-                               TCP_SKB_CB(skb)->flags &= ~TCPHDR_PSH;
+                               TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
 
                        tp->write_seq += copy;
                        TCP_SKB_CB(skb)->end_seq += copy;
@@ -1194,13 +1196,11 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
        struct tcp_sock *tp = tcp_sk(sk);
        int time_to_ack = 0;
 
-#if TCP_DEBUG
        struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 
        WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
             "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
             tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
-#endif
 
        if (inet_csk_ack_scheduled(sk)) {
                const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1591,8 +1591,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                }
 
 #ifdef CONFIG_NET_DMA
-               if (tp->ucopy.dma_chan)
-                       dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+               if (tp->ucopy.dma_chan) {
+                       if (tp->rcv_wnd == 0 &&
+                           !skb_queue_empty(&sk->sk_async_wait_queue)) {
+                               tcp_service_net_dma(sk, true);
+                               tcp_cleanup_rbuf(sk, copied);
+                       } else
+                               dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+               }
 #endif
                if (copied >= target) {
                        /* Do not sleep, just process backlog. */
@@ -2100,6 +2106,10 @@ int tcp_disconnect(struct sock *sk, int flags)
        tcp_set_ca_state(sk, TCP_CA_Open);
        tcp_clear_retrans(tp);
        inet_csk_delack_init(sk);
+       /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+        * issue in __tcp_select_window()
+        */
+       icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
        tcp_init_send_head(sk);
        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
        __sk_dst_reset(sk);
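
The rcv_mss initialisation matters because __tcp_select_window() divides by it; a socket that is disconnected and reused before ever receiving a segment would otherwise trip a divide-by-zero. Simplified from the expression in tcp_output.c (helper name ours):

    /* rcv_mss acts as a divisor when rounding the offered window */
    static u32 round_window_to_mss(u32 free_space, const struct sock *sk)
    {
            unsigned int mss = inet_csk(sk)->icsk_ack.rcv_mss;

            return (free_space / mss) * mss;        /* faults if rcv_mss == 0 */
    }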
@@ -2395,7 +2405,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                /* Cap the max timeout in ms TCP will retry/retrans
                 * before giving up and aborting (ETIMEDOUT) a connection.
                 */
-               icsk->icsk_user_timeout = msecs_to_jiffies(val);
+               if (val < 0)
+                       err = -EINVAL;
+               else
+                       icsk->icsk_user_timeout = msecs_to_jiffies(val);
                break;
        default:
                err = -ENOPROTOOPT;
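
TCP_USER_TIMEOUT takes milliseconds from userspace; a negative value now fails with EINVAL instead of being folded into an enormous unsigned jiffies count. A minimal caller (set_user_timeout is ours):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>        /* TCP_USER_TIMEOUT, in recent libc headers */

    /* Abort the connection if sent data stays unacknowledged for ms msec. */
    int set_user_timeout(int fd, unsigned int ms)
    {
            return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
                              &ms, sizeof(ms));
    }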
@@ -2409,7 +2422,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
                   unsigned int optlen)
 {
-       struct inet_connection_sock *icsk = inet_csk(sk);
+       const struct inet_connection_sock *icsk = inet_csk(sk);
 
        if (level != SOL_TCP)
                return icsk->icsk_af_ops->setsockopt(sk, level, optname,
@@ -2431,9 +2444,9 @@ EXPORT_SYMBOL(compat_tcp_setsockopt);
 #endif
 
 /* Return information about state of tcp endpoint in API format. */
-void tcp_get_info(struct sock *sk, struct tcp_info *info)
+void tcp_get_info(const struct sock *sk, struct tcp_info *info)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
+       const struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 now = tcp_time_stamp;
 
@@ -2455,8 +2468,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
                info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
        }
 
-       if (tp->ecn_flags&TCP_ECN_OK)
+       if (tp->ecn_flags & TCP_ECN_OK)
                info->tcpi_options |= TCPI_OPT_ECN;
+       if (tp->ecn_flags & TCP_ECN_SEEN)
+               info->tcpi_options |= TCPI_OPT_ECN_SEEN;
 
        info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
        info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
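
TCPI_OPT_ECN_SEEN exposes whether the peer actually sent ECT-marked packets, as opposed to TCPI_OPT_ECN, which only says ECN was negotiated. A userspace consumer might look like this (peer_sent_ecn is ours; TCPI_OPT_ECN_SEEN needs a linux/tcp.h recent enough to define it):

    #include <stdbool.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <linux/tcp.h>          /* struct tcp_info, TCPI_OPT_* */

    bool peer_sent_ecn(int fd)
    {
            struct tcp_info ti;
            socklen_t len = sizeof(ti);

            if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) < 0)
                    return false;
            return ti.tcpi_options & TCPI_OPT_ECN_SEEN;
    }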
@@ -2856,111 +2871,42 @@ int tcp_gro_complete(struct sk_buff *skb)
 EXPORT_SYMBOL(tcp_gro_complete);
 
 #ifdef CONFIG_TCP_MD5SIG
-static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
-static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
+static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
+static DEFINE_MUTEX(tcp_md5sig_mutex);
+static bool tcp_md5sig_pool_populated = false;
 
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
+static void __tcp_alloc_md5sig_pool(void)
 {
        int cpu;
-       for_each_possible_cpu(cpu) {
-               struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
-               if (p) {
-                       if (p->md5_desc.tfm)
-                               crypto_free_hash(p->md5_desc.tfm);
-                       kfree(p);
-               }
-       }
-       free_percpu(pool);
-}
 
-void tcp_free_md5sig_pool(void)
-{
-       struct tcp_md5sig_pool * __percpu *pool = NULL;
+       for_each_possible_cpu(cpu) {
+               if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
+                       struct crypto_hash *hash;
 
-       spin_lock_bh(&tcp_md5sig_pool_lock);
-       if (--tcp_md5sig_users == 0) {
-               pool = tcp_md5sig_pool;
-               tcp_md5sig_pool = NULL;
+                       hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+                       if (IS_ERR_OR_NULL(hash))
+                               return;
+                       per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
+               }
        }
-       spin_unlock_bh(&tcp_md5sig_pool_lock);
-       if (pool)
-               __tcp_free_md5sig_pool(pool);
+       /* before setting tcp_md5sig_pool_populated, we must commit all writes
+        * to memory. See smp_rmb() in tcp_get_md5sig_pool()
+        */
+       smp_wmb();
+       tcp_md5sig_pool_populated = true;
 }
-EXPORT_SYMBOL(tcp_free_md5sig_pool);
 
-static struct tcp_md5sig_pool * __percpu *
-__tcp_alloc_md5sig_pool(struct sock *sk)
+bool tcp_alloc_md5sig_pool(void)
 {
-       int cpu;
-       struct tcp_md5sig_pool * __percpu *pool;
-
-       pool = alloc_percpu(struct tcp_md5sig_pool *);
-       if (!pool)
-               return NULL;
-
-       for_each_possible_cpu(cpu) {
-               struct tcp_md5sig_pool *p;
-               struct crypto_hash *hash;
-
-               p = kzalloc(sizeof(*p), sk->sk_allocation);
-               if (!p)
-                       goto out_free;
-               *per_cpu_ptr(pool, cpu) = p;
+       if (unlikely(!tcp_md5sig_pool_populated)) {
+               mutex_lock(&tcp_md5sig_mutex);
 
-               hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
-               if (!hash || IS_ERR(hash))
-                       goto out_free;
+               if (!tcp_md5sig_pool_populated)
+                       __tcp_alloc_md5sig_pool();
 
-               p->md5_desc.tfm = hash;
-       }
-       return pool;
-out_free:
-       __tcp_free_md5sig_pool(pool);
-       return NULL;
-}
-
-struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
-{
-       struct tcp_md5sig_pool * __percpu *pool;
-       int alloc = 0;
-
-retry:
-       spin_lock_bh(&tcp_md5sig_pool_lock);
-       pool = tcp_md5sig_pool;
-       if (tcp_md5sig_users++ == 0) {
-               alloc = 1;
-               spin_unlock_bh(&tcp_md5sig_pool_lock);
-       } else if (!pool) {
-               tcp_md5sig_users--;
-               spin_unlock_bh(&tcp_md5sig_pool_lock);
-               cpu_relax();
-               goto retry;
-       } else
-               spin_unlock_bh(&tcp_md5sig_pool_lock);
-
-       if (alloc) {
-               /* we cannot hold spinlock here because this may sleep. */
-               struct tcp_md5sig_pool * __percpu *p;
-
-               p = __tcp_alloc_md5sig_pool(sk);
-               spin_lock_bh(&tcp_md5sig_pool_lock);
-               if (!p) {
-                       tcp_md5sig_users--;
-                       spin_unlock_bh(&tcp_md5sig_pool_lock);
-                       return NULL;
-               }
-               pool = tcp_md5sig_pool;
-               if (pool) {
-                       /* oops, it has already been assigned. */
-                       spin_unlock_bh(&tcp_md5sig_pool_lock);
-                       __tcp_free_md5sig_pool(p);
-               } else {
-                       tcp_md5sig_pool = pool = p;
-                       spin_unlock_bh(&tcp_md5sig_pool_lock);
-               }
+               mutex_unlock(&tcp_md5sig_mutex);
        }
-       return pool;
+       return tcp_md5sig_pool_populated;
 }
 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
 
@@ -2974,49 +2920,38 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
  */
 struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 {
-       struct tcp_md5sig_pool * __percpu *p;
-
        local_bh_disable();
 
-       spin_lock(&tcp_md5sig_pool_lock);
-       p = tcp_md5sig_pool;
-       if (p)
-               tcp_md5sig_users++;
-       spin_unlock(&tcp_md5sig_pool_lock);
-
-       if (p)
-               return *this_cpu_ptr(p);
-
+       if (tcp_md5sig_pool_populated) {
+               /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
+               smp_rmb();
+               return this_cpu_ptr(&tcp_md5sig_pool);
+       }
        local_bh_enable();
        return NULL;
 }
 EXPORT_SYMBOL(tcp_get_md5sig_pool);
 
-void tcp_put_md5sig_pool(void)
-{
-       local_bh_enable();
-       tcp_free_md5sig_pool();
-}
-EXPORT_SYMBOL(tcp_put_md5sig_pool);
-
 int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
-                       struct tcphdr *th)
+                       const struct tcphdr *th)
 {
        struct scatterlist sg;
+       struct tcphdr hdr;
        int err;
 
-       __sum16 old_checksum = th->check;
-       th->check = 0;
+       /* We are not allowed to change tcphdr, make a local copy */
+       memcpy(&hdr, th, sizeof(hdr));
+       hdr.check = 0;
+
        /* options aren't included in the hash */
-       sg_init_one(&sg, th, sizeof(struct tcphdr));
-       err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr));
-       th->check = old_checksum;
+       sg_init_one(&sg, &hdr, sizeof(hdr));
+       err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
        return err;
 }
 EXPORT_SYMBOL(tcp_md5_hash_header);
 
 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
-                         struct sk_buff *skb, unsigned header_len)
+                         const struct sk_buff *skb, unsigned int header_len)
 {
        struct scatterlist sg;
        const struct tcphdr *tp = tcp_hdr(skb);
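
The rewritten pool management above is the classic publish/consume pattern: the writer fully initialises the per-cpu transforms, issues smp_wmb(), and only then sets the flag; readers test the flag and issue smp_rmb() before touching the transforms. A stand-alone sketch of the pattern (types and names ours, not kernel code):

    struct payload { int data; };

    static struct payload shared;
    static bool populated;

    static void publisher(void)
    {
            shared.data = 42;       /* all payload writes ...     */
            smp_wmb();              /* ... ordered before flag    */
            populated = true;
    }

    static struct payload *consumer(void)
    {
            if (!populated)
                    return NULL;
            smp_rmb();              /* flag read ordered before payload reads */
            return &shared;
    }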
@@ -3035,8 +2970,12 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 
        for (i = 0; i < shi->nr_frags; ++i) {
                const struct skb_frag_struct *f = &shi->frags[i];
-               sg_set_page(&sg, f->page, f->size, f->page_offset);
-               if (crypto_hash_update(desc, &sg, f->size))
+               unsigned int offset = f->page_offset;
+               struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+
+               sg_set_page(&sg, page, skb_frag_size(f),
+                           offset_in_page(offset));
+               if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
                        return 1;
        }
 
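
Frag byte offsets may exceed PAGE_SIZE once frags can reference compound pages, so the offset has to be split into whole pages plus an in-page remainder before feeding the scatterlist. A worked example, assuming 4 KiB pages:

    unsigned int offset = 9000;                     /* hypothetical frag offset */
    unsigned int idx    = offset >> PAGE_SHIFT;     /* 9000 / 4096 = 2 pages in */
    unsigned int inpage = offset_in_page(offset);   /* 9000 - 8192 = 808 bytes  */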
@@ -3048,7 +2987,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
 }
 EXPORT_SYMBOL(tcp_md5_hash_skb_data);
 
-int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key)
+int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
 {
        struct scatterlist sg;
 
@@ -3221,7 +3160,7 @@ void __init tcp_init(void)
 {
        struct sk_buff *skb = NULL;
        unsigned long limit;
-       int i, max_share, cnt;
+       int i, max_rshare, max_wshare, cnt;
        unsigned long jiffy = jiffies;
 
        BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -3285,15 +3224,16 @@ void __init tcp_init(void)
 
        /* Set per-socket limits to no more than 1/128 the pressure threshold */
        limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
-       max_share = min(4UL*1024*1024, limit);
+       max_wshare = min(4UL*1024*1024, limit);
+       max_rshare = min(6UL*1024*1024, limit);
 
        sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
        sysctl_tcp_wmem[1] = 16*1024;
-       sysctl_tcp_wmem[2] = max(64*1024, max_share);
+       sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
 
        sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
        sysctl_tcp_rmem[1] = 87380;
-       sysctl_tcp_rmem[2] = max(87380, max_share);
+       sysctl_tcp_rmem[2] = max(87380, max_rshare);
 
        printk(KERN_INFO "TCP: Hash tables configured "
               "(established %u bind %u)\n",