Merge branch 'topic/pcm-subclass-fix' into for-linus
[pandora-kernel.git] / net / netfilter / nf_conntrack_proto_tcp.c
1 /* (C) 1999-2001 Paul `Rusty' Russell
2  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  */
8
9 #include <linux/types.h>
10 #include <linux/timer.h>
11 #include <linux/module.h>
12 #include <linux/in.h>
13 #include <linux/tcp.h>
14 #include <linux/spinlock.h>
15 #include <linux/skbuff.h>
16 #include <linux/ipv6.h>
17 #include <net/ip6_checksum.h>
18 #include <asm/unaligned.h>
19
20 #include <net/tcp.h>
21
22 #include <linux/netfilter.h>
23 #include <linux/netfilter_ipv4.h>
24 #include <linux/netfilter_ipv6.h>
25 #include <net/netfilter/nf_conntrack.h>
26 #include <net/netfilter/nf_conntrack_l4proto.h>
27 #include <net/netfilter/nf_conntrack_ecache.h>
28 #include <net/netfilter/nf_log.h>
29 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
30 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
31
32 /* Protects ct->proto.tcp */
33 static DEFINE_RWLOCK(tcp_lock);
34
35 /* "Be conservative in what you do,
36     be liberal in what you accept from others."
37     If it's non-zero, we mark only out of window RST segments as INVALID. */
38 static int nf_ct_tcp_be_liberal __read_mostly = 0;
39
40 /* If it is set to zero, we disable picking up already established
41    connections. */
42 static int nf_ct_tcp_loose __read_mostly = 1;
43
44 /* Max number of the retransmitted packets without receiving an (acceptable)
45    ACK from the destination. If this number is reached, a shorter timer
46    will be started. */
47 static int nf_ct_tcp_max_retrans __read_mostly = 3;
48
49   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
50      closely.  They're more complex. --RR */
51
/*
 * Human readable names of the conntrack TCP states, in table order.
 * tcp_print_conntrack() indexes this array with ct->proto.tcp.state.
 */
static const char *const tcp_conntrack_names[] = {
	"NONE",		/* 0 */
	"SYN_SENT",	/* 1 */
	"SYN_RECV",	/* 2 */
	"ESTABLISHED",	/* 3 */
	"FIN_WAIT",	/* 4 */
	"CLOSE_WAIT",	/* 5 */
	"LAST_ACK",	/* 6 */
	"TIME_WAIT",	/* 7 */
	"CLOSE",	/* 8 */
	"LISTEN",	/* 9 */
};
64
65 #define SECS * HZ
66 #define MINS * 60 SECS
67 #define HOURS * 60 MINS
68 #define DAYS * 24 HOURS
69
70 /* RFC1122 says the R2 limit should be at least 100 seconds.
71    Linux uses 15 packets as limit, which corresponds
72    to ~13-30min depending on RTO. */
73 static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly    =   5 MINS;
74 static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly =   5 MINS;
75
76 static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = {
77         [TCP_CONNTRACK_SYN_SENT]        = 2 MINS,
78         [TCP_CONNTRACK_SYN_RECV]        = 60 SECS,
79         [TCP_CONNTRACK_ESTABLISHED]     = 5 DAYS,
80         [TCP_CONNTRACK_FIN_WAIT]        = 2 MINS,
81         [TCP_CONNTRACK_CLOSE_WAIT]      = 60 SECS,
82         [TCP_CONNTRACK_LAST_ACK]        = 30 SECS,
83         [TCP_CONNTRACK_TIME_WAIT]       = 2 MINS,
84         [TCP_CONNTRACK_CLOSE]           = 10 SECS,
85 };
86
/*
 * Two-letter shorthands for the conntrack TCP states; they keep the rows
 * of the state transition table readable.
 */
#define sNO TCP_CONNTRACK_NONE
#define sSS TCP_CONNTRACK_SYN_SENT
#define sSR TCP_CONNTRACK_SYN_RECV
#define sES TCP_CONNTRACK_ESTABLISHED
#define sFW TCP_CONNTRACK_FIN_WAIT
#define sCW TCP_CONNTRACK_CLOSE_WAIT
#define sLA TCP_CONNTRACK_LAST_ACK
#define sTW TCP_CONNTRACK_TIME_WAIT
#define sCL TCP_CONNTRACK_CLOSE
#define sLI TCP_CONNTRACK_LISTEN
/* Pseudo states: "invalid" and "ignore". */
#define sIV TCP_CONNTRACK_MAX
#define sIG TCP_CONNTRACK_IGNORE

/*
 * What TCP flags are set from RST/SYN/FIN/ACK.
 * The enumerator order is the row order of the state transition table.
 */
enum tcp_bit_set {
	TCP_SYN_SET,		/* SYN without ACK */
	TCP_SYNACK_SET,		/* SYN together with ACK */
	TCP_FIN_SET,		/* FIN */
	TCP_ACK_SET,		/* ACK only */
	TCP_RST_SET,		/* RST */
	TCP_NONE_SET,		/* none of the above */
};
109
110 /*
111  * The TCP state transition table needs a few words...
112  *
113  * We are the man in the middle. All the packets go through us
114  * but might get lost in transit to the destination.
115  * It is assumed that the destinations can't receive segments
116  * we haven't seen.
117  *
118  * The checked segment is in window, but our windows are *not*
119  * equivalent with the ones of the sender/receiver. We always
120  * try to guess the state of the current sender.
121  *
122  * The meaning of the states are:
123  *
124  * NONE:        initial state
125  * SYN_SENT:    SYN-only packet seen
126  * SYN_RECV:    SYN-ACK packet seen
127  * ESTABLISHED: ACK packet seen
128  * FIN_WAIT:    FIN packet seen
129  * CLOSE_WAIT:  ACK seen (after FIN)
130  * LAST_ACK:    FIN seen (after FIN)
131  * TIME_WAIT:   last ACK seen
132  * CLOSE:       closed connection (RST)
133  *
134  * LISTEN state is not used.
135  *
136  * Packets marked as IGNORED (sIG):
137  *      if they may be either invalid or valid
138  *      and the receiver may send back a connection
139  *      closing RST or a SYN/ACK.
140  *
141  * Packets marked as INVALID (sIV):
142  *      if they are invalid
143  *      or we do not support the request (simultaneous open)
144  */
145 static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
146         {
147 /* ORIGINAL */
148 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
149 /*syn*/    { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
150 /*
151  *      sNO -> sSS      Initialize a new connection
152  *      sSS -> sSS      Retransmitted SYN
153  *      sSR -> sIG      Late retransmitted SYN?
154  *      sES -> sIG      Error: SYNs in window outside the SYN_SENT state
155  *                      are errors. Receiver will reply with RST
156  *                      and close the connection.
157  *                      Or we are not in sync and hold a dead connection.
158  *      sFW -> sIG
159  *      sCW -> sIG
160  *      sLA -> sIG
161  *      sTW -> sSS      Reopened connection (RFC 1122).
162  *      sCL -> sSS
163  */
164 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
165 /*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
166 /*
167  * A SYN/ACK from the client is always invalid:
168  *      - either it tries to set up a simultaneous open, which is
169  *        not supported;
170  *      - or the firewall has just been inserted between the two hosts
171  *        during the session set-up. The SYN will be retransmitted
172  *        by the true client (or it'll time out).
173  */
174 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
175 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
176 /*
177  *      sNO -> sIV      Too late and no reason to do anything...
178  *      sSS -> sIV      Client migth not send FIN in this state:
179  *                      we enforce waiting for a SYN/ACK reply first.
180  *      sSR -> sFW      Close started.
181  *      sES -> sFW
182  *      sFW -> sLA      FIN seen in both directions, waiting for
183  *                      the last ACK.
184  *                      Migth be a retransmitted FIN as well...
185  *      sCW -> sLA
186  *      sLA -> sLA      Retransmitted FIN. Remain in the same state.
187  *      sTW -> sTW
188  *      sCL -> sCL
189  */
190 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
191 /*ack*/    { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
192 /*
193  *      sNO -> sES      Assumed.
194  *      sSS -> sIV      ACK is invalid: we haven't seen a SYN/ACK yet.
195  *      sSR -> sES      Established state is reached.
196  *      sES -> sES      :-)
197  *      sFW -> sCW      Normal close request answered by ACK.
198  *      sCW -> sCW
199  *      sLA -> sTW      Last ACK detected.
200  *      sTW -> sTW      Retransmitted last ACK. Remain in the same state.
201  *      sCL -> sCL
202  */
203 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
204 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
205 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
206         },
207         {
208 /* REPLY */
209 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
210 /*syn*/    { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
211 /*
212  *      sNO -> sIV      Never reached.
213  *      sSS -> sIV      Simultaneous open, not supported
214  *      sSR -> sIV      Simultaneous open, not supported.
215  *      sES -> sIV      Server may not initiate a connection.
216  *      sFW -> sIV
217  *      sCW -> sIV
218  *      sLA -> sIV
219  *      sTW -> sIV      Reopened connection, but server may not do it.
220  *      sCL -> sIV
221  */
222 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
223 /*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
224 /*
225  *      sSS -> sSR      Standard open.
226  *      sSR -> sSR      Retransmitted SYN/ACK.
227  *      sES -> sIG      Late retransmitted SYN/ACK?
228  *      sFW -> sIG      Might be SYN/ACK answering ignored SYN
229  *      sCW -> sIG
230  *      sLA -> sIG
231  *      sTW -> sIG
232  *      sCL -> sIG
233  */
234 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
235 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
236 /*
237  *      sSS -> sIV      Server might not send FIN in this state.
238  *      sSR -> sFW      Close started.
239  *      sES -> sFW
240  *      sFW -> sLA      FIN seen in both directions.
241  *      sCW -> sLA
242  *      sLA -> sLA      Retransmitted FIN.
243  *      sTW -> sTW
244  *      sCL -> sCL
245  */
246 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
247 /*ack*/    { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
248 /*
249  *      sSS -> sIG      Might be a half-open connection.
250  *      sSR -> sSR      Might answer late resent SYN.
251  *      sES -> sES      :-)
252  *      sFW -> sCW      Normal close request answered by ACK.
253  *      sCW -> sCW
254  *      sLA -> sTW      Last ACK detected.
255  *      sTW -> sTW      Retransmitted last ACK.
256  *      sCL -> sCL
257  */
258 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI   */
259 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
260 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
261         }
262 };
263
264 static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
265                              struct nf_conntrack_tuple *tuple)
266 {
267         const struct tcphdr *hp;
268         struct tcphdr _hdr;
269
270         /* Actually only need first 8 bytes. */
271         hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
272         if (hp == NULL)
273                 return false;
274
275         tuple->src.u.tcp.port = hp->source;
276         tuple->dst.u.tcp.port = hp->dest;
277
278         return true;
279 }
280
281 static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
282                              const struct nf_conntrack_tuple *orig)
283 {
284         tuple->src.u.tcp.port = orig->dst.u.tcp.port;
285         tuple->dst.u.tcp.port = orig->src.u.tcp.port;
286         return true;
287 }
288
289 /* Print out the per-protocol part of the tuple. */
290 static int tcp_print_tuple(struct seq_file *s,
291                            const struct nf_conntrack_tuple *tuple)
292 {
293         return seq_printf(s, "sport=%hu dport=%hu ",
294                           ntohs(tuple->src.u.tcp.port),
295                           ntohs(tuple->dst.u.tcp.port));
296 }
297
298 /* Print out the private part of the conntrack. */
299 static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
300 {
301         enum tcp_conntrack state;
302
303         read_lock_bh(&tcp_lock);
304         state = ct->proto.tcp.state;
305         read_unlock_bh(&tcp_lock);
306
307         return seq_printf(s, "%s ", tcp_conntrack_names[state]);
308 }
309
310 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
311 {
312         if (tcph->rst) return TCP_RST_SET;
313         else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
314         else if (tcph->fin) return TCP_FIN_SET;
315         else if (tcph->ack) return TCP_ACK_SET;
316         else return TCP_NONE_SET;
317 }
318
319 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
320    in IP Filter' by Guido van Rooij.
321
322    http://www.nluug.nl/events/sane2000/papers.html
323    http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
324
325    The boundaries and the conditions are changed according to RFC793:
326    the packet must intersect the window (i.e. segments may be
327    after the right or before the left edge) and thus receivers may ACK
328    segments after the right edge of the window.
329
330         td_maxend = max(sack + max(win,1)) seen in reply packets
331         td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
332         td_maxwin += seq + len - sender.td_maxend
333                         if seq + len > sender.td_maxend
334         td_end    = max(seq + len) seen in sent packets
335
336    I.   Upper bound for valid data:     seq <= sender.td_maxend
337    II.  Lower bound for valid data:     seq + len >= sender.td_end - receiver.td_maxwin
338    III. Upper bound for valid (s)ack:   sack <= receiver.td_end
339    IV.  Lower bound for valid (s)ack:   sack >= receiver.td_end - MAXACKWINDOW
340
341    where sack is the highest right edge of sack block found in the packet
342    or ack in the case of packet without SACK option.
343
344    The upper bound limit for a valid (s)ack is not ignored -
345    we don't have to deal with fragments.
346 */
347
348 static inline __u32 segment_seq_plus_len(__u32 seq,
349                                          size_t len,
350                                          unsigned int dataoff,
351                                          const struct tcphdr *tcph)
352 {
353         /* XXX Should I use payload length field in IP/IPv6 header ?
354          * - YK */
355         return (seq + len - dataoff - tcph->doff*4
356                 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
357 }
358
/*
 * Lower bound distance for acceptable (s)ack values: the sender's largest
 * seen window, but never less than MAXACKWINCONST.
 * Note that the macro argument is evaluated more than once.
 *
 * Fixme: what about big packets?
 */
#define MAXACKWINCONST			66000
#define MAXACKWINDOW(sender)						\
	((sender)->td_maxwin <= MAXACKWINCONST ? MAXACKWINCONST		\
					       : (sender)->td_maxwin)
364
365 /*
366  * Simplified tcp_parse_options routine from tcp_input.c
367  */
368 static void tcp_options(const struct sk_buff *skb,
369                         unsigned int dataoff,
370                         const struct tcphdr *tcph,
371                         struct ip_ct_tcp_state *state)
372 {
373         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
374         const unsigned char *ptr;
375         int length = (tcph->doff*4) - sizeof(struct tcphdr);
376
377         if (!length)
378                 return;
379
380         ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
381                                  length, buff);
382         BUG_ON(ptr == NULL);
383
384         state->td_scale =
385         state->flags = 0;
386
387         while (length > 0) {
388                 int opcode=*ptr++;
389                 int opsize;
390
391                 switch (opcode) {
392                 case TCPOPT_EOL:
393                         return;
394                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
395                         length--;
396                         continue;
397                 default:
398                         opsize=*ptr++;
399                         if (opsize < 2) /* "silly options" */
400                                 return;
401                         if (opsize > length)
402                                 break;  /* don't parse partial options */
403
404                         if (opcode == TCPOPT_SACK_PERM
405                             && opsize == TCPOLEN_SACK_PERM)
406                                 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
407                         else if (opcode == TCPOPT_WINDOW
408                                  && opsize == TCPOLEN_WINDOW) {
409                                 state->td_scale = *(u_int8_t *)ptr;
410
411                                 if (state->td_scale > 14) {
412                                         /* See RFC1323 */
413                                         state->td_scale = 14;
414                                 }
415                                 state->flags |=
416                                         IP_CT_TCP_FLAG_WINDOW_SCALE;
417                         }
418                         ptr += opsize - 2;
419                         length -= opsize;
420                 }
421         }
422 }
423
424 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
425                      const struct tcphdr *tcph, __u32 *sack)
426 {
427         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
428         const unsigned char *ptr;
429         int length = (tcph->doff*4) - sizeof(struct tcphdr);
430         __u32 tmp;
431
432         if (!length)
433                 return;
434
435         ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
436                                  length, buff);
437         BUG_ON(ptr == NULL);
438
439         /* Fast path for timestamp-only option */
440         if (length == TCPOLEN_TSTAMP_ALIGNED*4
441             && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
442                                        | (TCPOPT_NOP << 16)
443                                        | (TCPOPT_TIMESTAMP << 8)
444                                        | TCPOLEN_TIMESTAMP))
445                 return;
446
447         while (length > 0) {
448                 int opcode = *ptr++;
449                 int opsize, i;
450
451                 switch (opcode) {
452                 case TCPOPT_EOL:
453                         return;
454                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
455                         length--;
456                         continue;
457                 default:
458                         opsize = *ptr++;
459                         if (opsize < 2) /* "silly options" */
460                                 return;
461                         if (opsize > length)
462                                 break;  /* don't parse partial options */
463
464                         if (opcode == TCPOPT_SACK
465                             && opsize >= (TCPOLEN_SACK_BASE
466                                           + TCPOLEN_SACK_PERBLOCK)
467                             && !((opsize - TCPOLEN_SACK_BASE)
468                                  % TCPOLEN_SACK_PERBLOCK)) {
469                                 for (i = 0;
470                                      i < (opsize - TCPOLEN_SACK_BASE);
471                                      i += TCPOLEN_SACK_PERBLOCK) {
472                                         tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
473
474                                         if (after(tmp, *sack))
475                                                 *sack = tmp;
476                                 }
477                                 return;
478                         }
479                         ptr += opsize - 2;
480                         length -= opsize;
481                 }
482         }
483 }
484
485 static bool tcp_in_window(const struct nf_conn *ct,
486                           struct ip_ct_tcp *state,
487                           enum ip_conntrack_dir dir,
488                           unsigned int index,
489                           const struct sk_buff *skb,
490                           unsigned int dataoff,
491                           const struct tcphdr *tcph,
492                           u_int8_t pf)
493 {
494         struct net *net = nf_ct_net(ct);
495         struct ip_ct_tcp_state *sender = &state->seen[dir];
496         struct ip_ct_tcp_state *receiver = &state->seen[!dir];
497         const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
498         __u32 seq, ack, sack, end, win, swin;
499         bool res;
500
501         /*
502          * Get the required data from the packet.
503          */
504         seq = ntohl(tcph->seq);
505         ack = sack = ntohl(tcph->ack_seq);
506         win = ntohs(tcph->window);
507         end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
508
509         if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
510                 tcp_sack(skb, dataoff, tcph, &sack);
511
512         pr_debug("tcp_in_window: START\n");
513         pr_debug("tcp_in_window: ");
514         nf_ct_dump_tuple(tuple);
515         pr_debug("seq=%u ack=%u sack=%u win=%u end=%u\n",
516                  seq, ack, sack, win, end);
517         pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
518                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
519                  sender->td_end, sender->td_maxend, sender->td_maxwin,
520                  sender->td_scale,
521                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
522                  receiver->td_scale);
523
524         if (sender->td_end == 0) {
525                 /*
526                  * Initialize sender data.
527                  */
528                 if (tcph->syn && tcph->ack) {
529                         /*
530                          * Outgoing SYN-ACK in reply to a SYN.
531                          */
532                         sender->td_end =
533                         sender->td_maxend = end;
534                         sender->td_maxwin = (win == 0 ? 1 : win);
535
536                         tcp_options(skb, dataoff, tcph, sender);
537                         /*
538                          * RFC 1323:
539                          * Both sides must send the Window Scale option
540                          * to enable window scaling in either direction.
541                          */
542                         if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
543                               && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
544                                 sender->td_scale =
545                                 receiver->td_scale = 0;
546                 } else {
547                         /*
548                          * We are in the middle of a connection,
549                          * its history is lost for us.
550                          * Let's try to use the data from the packet.
551                          */
552                         sender->td_end = end;
553                         sender->td_maxwin = (win == 0 ? 1 : win);
554                         sender->td_maxend = end + sender->td_maxwin;
555                 }
556         } else if (((state->state == TCP_CONNTRACK_SYN_SENT
557                      && dir == IP_CT_DIR_ORIGINAL)
558                    || (state->state == TCP_CONNTRACK_SYN_RECV
559                      && dir == IP_CT_DIR_REPLY))
560                    && after(end, sender->td_end)) {
561                 /*
562                  * RFC 793: "if a TCP is reinitialized ... then it need
563                  * not wait at all; it must only be sure to use sequence
564                  * numbers larger than those recently used."
565                  */
566                 sender->td_end =
567                 sender->td_maxend = end;
568                 sender->td_maxwin = (win == 0 ? 1 : win);
569
570                 tcp_options(skb, dataoff, tcph, sender);
571         }
572
573         if (!(tcph->ack)) {
574                 /*
575                  * If there is no ACK, just pretend it was set and OK.
576                  */
577                 ack = sack = receiver->td_end;
578         } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
579                     (TCP_FLAG_ACK|TCP_FLAG_RST))
580                    && (ack == 0)) {
581                 /*
582                  * Broken TCP stacks, that set ACK in RST packets as well
583                  * with zero ack value.
584                  */
585                 ack = sack = receiver->td_end;
586         }
587
588         if (seq == end
589             && (!tcph->rst
590                 || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
591                 /*
592                  * Packets contains no data: we assume it is valid
593                  * and check the ack value only.
594                  * However RST segments are always validated by their
595                  * SEQ number, except when seq == 0 (reset sent answering
596                  * SYN.
597                  */
598                 seq = end = sender->td_end;
599
600         pr_debug("tcp_in_window: ");
601         nf_ct_dump_tuple(tuple);
602         pr_debug("seq=%u ack=%u sack =%u win=%u end=%u\n",
603                  seq, ack, sack, win, end);
604         pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
605                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
606                  sender->td_end, sender->td_maxend, sender->td_maxwin,
607                  sender->td_scale,
608                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
609                  receiver->td_scale);
610
611         pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
612                  before(seq, sender->td_maxend + 1),
613                  after(end, sender->td_end - receiver->td_maxwin - 1),
614                  before(sack, receiver->td_end + 1),
615                  after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
616
617         if (before(seq, sender->td_maxend + 1) &&
618             after(end, sender->td_end - receiver->td_maxwin - 1) &&
619             before(sack, receiver->td_end + 1) &&
620             after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
621                 /*
622                  * Take into account window scaling (RFC 1323).
623                  */
624                 if (!tcph->syn)
625                         win <<= sender->td_scale;
626
627                 /*
628                  * Update sender data.
629                  */
630                 swin = win + (sack - ack);
631                 if (sender->td_maxwin < swin)
632                         sender->td_maxwin = swin;
633                 if (after(end, sender->td_end)) {
634                         sender->td_end = end;
635                         sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
636                 }
637                 if (tcph->ack) {
638                         if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
639                                 sender->td_maxack = ack;
640                                 sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
641                         } else if (after(ack, sender->td_maxack))
642                                 sender->td_maxack = ack;
643                 }
644
645                 /*
646                  * Update receiver data.
647                  */
648                 if (after(end, sender->td_maxend))
649                         receiver->td_maxwin += end - sender->td_maxend;
650                 if (after(sack + win, receiver->td_maxend - 1)) {
651                         receiver->td_maxend = sack + win;
652                         if (win == 0)
653                                 receiver->td_maxend++;
654                 }
655                 if (ack == receiver->td_end)
656                         receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
657
658                 /*
659                  * Check retransmissions.
660                  */
661                 if (index == TCP_ACK_SET) {
662                         if (state->last_dir == dir
663                             && state->last_seq == seq
664                             && state->last_ack == ack
665                             && state->last_end == end
666                             && state->last_win == win)
667                                 state->retrans++;
668                         else {
669                                 state->last_dir = dir;
670                                 state->last_seq = seq;
671                                 state->last_ack = ack;
672                                 state->last_end = end;
673                                 state->last_win = win;
674                                 state->retrans = 0;
675                         }
676                 }
677                 res = true;
678         } else {
679                 res = false;
680                 if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
681                     nf_ct_tcp_be_liberal)
682                         res = true;
683                 if (!res && LOG_INVALID(net, IPPROTO_TCP))
684                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
685                         "nf_ct_tcp: %s ",
686                         before(seq, sender->td_maxend + 1) ?
687                         after(end, sender->td_end - receiver->td_maxwin - 1) ?
688                         before(sack, receiver->td_end + 1) ?
689                         after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
690                         : "ACK is under the lower bound (possible overly delayed ACK)"
691                         : "ACK is over the upper bound (ACKed data not seen yet)"
692                         : "SEQ is under the lower bound (already ACKed data retransmitted)"
693                         : "SEQ is over the upper bound (over the window of the receiver)");
694         }
695
696         pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
697                  "receiver end=%u maxend=%u maxwin=%u\n",
698                  res, sender->td_end, sender->td_maxend, sender->td_maxwin,
699                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
700
701         return res;
702 }
703
#ifdef CONFIG_NF_NAT_NEEDED
/*
 * Update sender->td_end after NAT successfully mangled the packet.
 *
 * Recomputes the end sequence number of the (possibly resized) segment
 * and, under tcp_lock, raises the td_end of direction @dir to it if it
 * lies after the stored value; last_end is always overwritten.
 *
 * Caller must linearize skb at tcp header.
 */
void nf_conntrack_tcp_update(const struct sk_buff *skb,
			     unsigned int dataoff,
			     struct nf_conn *ct,
			     int dir)
{
	struct ip_ct_tcp *tcp = &ct->proto.tcp;
	const struct ip_ct_tcp_state *snd = &tcp->seen[dir];
	const struct ip_ct_tcp_state *rcv = &tcp->seen[!dir];
	const struct tcphdr *th = (const void *)skb->data + dataoff;
	const __u32 seg_end = segment_seq_plus_len(ntohl(th->seq), skb->len,
						   dataoff, th);

	write_lock_bh(&tcp_lock);
	/*
	 * We have to worry for the ack in the reply packet only...
	 */
	if (after(seg_end, tcp->seen[dir].td_end))
		tcp->seen[dir].td_end = seg_end;
	tcp->last_end = seg_end;
	write_unlock_bh(&tcp_lock);

	pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
		 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
		 snd->td_end, snd->td_maxend, snd->td_maxwin,
		 snd->td_scale,
		 rcv->td_end, rcv->td_maxend, rcv->td_maxwin,
		 rcv->td_scale);
}
EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update);
#endif
736
737 #define TH_FIN  0x01
738 #define TH_SYN  0x02
739 #define TH_RST  0x04
740 #define TH_PUSH 0x08
741 #define TH_ACK  0x10
742 #define TH_URG  0x20
743 #define TH_ECE  0x40
744 #define TH_CWR  0x80
745
746 /* table of valid flag combinations - PUSH, ECE and CWR are always valid */
747 static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] =
748 {
749         [TH_SYN]                        = 1,
750         [TH_SYN|TH_URG]                 = 1,
751         [TH_SYN|TH_ACK]                 = 1,
752         [TH_RST]                        = 1,
753         [TH_RST|TH_ACK]                 = 1,
754         [TH_FIN|TH_ACK]                 = 1,
755         [TH_FIN|TH_ACK|TH_URG]          = 1,
756         [TH_ACK]                        = 1,
757         [TH_ACK|TH_URG]                 = 1,
758 };
759
760 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
761 static int tcp_error(struct net *net,
762                      struct sk_buff *skb,
763                      unsigned int dataoff,
764                      enum ip_conntrack_info *ctinfo,
765                      u_int8_t pf,
766                      unsigned int hooknum)
767 {
768         const struct tcphdr *th;
769         struct tcphdr _tcph;
770         unsigned int tcplen = skb->len - dataoff;
771         u_int8_t tcpflags;
772
773         /* Smaller that minimal TCP header? */
774         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
775         if (th == NULL) {
776                 if (LOG_INVALID(net, IPPROTO_TCP))
777                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
778                                 "nf_ct_tcp: short packet ");
779                 return -NF_ACCEPT;
780         }
781
782         /* Not whole TCP header or malformed packet */
783         if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
784                 if (LOG_INVALID(net, IPPROTO_TCP))
785                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
786                                 "nf_ct_tcp: truncated/malformed packet ");
787                 return -NF_ACCEPT;
788         }
789
790         /* Checksum invalid? Ignore.
791          * We skip checking packets on the outgoing path
792          * because the checksum is assumed to be correct.
793          */
794         /* FIXME: Source route IP option packets --RR */
795         if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
796             nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
797                 if (LOG_INVALID(net, IPPROTO_TCP))
798                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
799                                   "nf_ct_tcp: bad TCP checksum ");
800                 return -NF_ACCEPT;
801         }
802
803         /* Check TCP flags. */
804         tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH));
805         if (!tcp_valid_flags[tcpflags]) {
806                 if (LOG_INVALID(net, IPPROTO_TCP))
807                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
808                                   "nf_ct_tcp: invalid TCP flag combination ");
809                 return -NF_ACCEPT;
810         }
811
812         return NF_ACCEPT;
813 }
814
815 /* Returns verdict for packet, or -1 for invalid. */
816 static int tcp_packet(struct nf_conn *ct,
817                       const struct sk_buff *skb,
818                       unsigned int dataoff,
819                       enum ip_conntrack_info ctinfo,
820                       u_int8_t pf,
821                       unsigned int hooknum)
822 {
823         struct net *net = nf_ct_net(ct);
824         struct nf_conntrack_tuple *tuple;
825         enum tcp_conntrack new_state, old_state;
826         enum ip_conntrack_dir dir;
827         const struct tcphdr *th;
828         struct tcphdr _tcph;
829         unsigned long timeout;
830         unsigned int index;
831
832         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
833         BUG_ON(th == NULL);
834
835         write_lock_bh(&tcp_lock);
836         old_state = ct->proto.tcp.state;
837         dir = CTINFO2DIR(ctinfo);
838         index = get_conntrack_index(th);
839         new_state = tcp_conntracks[dir][index][old_state];
840         tuple = &ct->tuplehash[dir].tuple;
841
842         switch (new_state) {
843         case TCP_CONNTRACK_SYN_SENT:
844                 if (old_state < TCP_CONNTRACK_TIME_WAIT)
845                         break;
846                 /* RFC 1122: "When a connection is closed actively,
847                  * it MUST linger in TIME-WAIT state for a time 2xMSL
848                  * (Maximum Segment Lifetime). However, it MAY accept
849                  * a new SYN from the remote TCP to reopen the connection
850                  * directly from TIME-WAIT state, if..."
851                  * We ignore the conditions because we are in the
852                  * TIME-WAIT state anyway.
853                  *
854                  * Handle aborted connections: we and the server
855                  * think there is an existing connection but the client
856                  * aborts it and starts a new one.
857                  */
858                 if (((ct->proto.tcp.seen[dir].flags
859                       | ct->proto.tcp.seen[!dir].flags)
860                      & IP_CT_TCP_FLAG_CLOSE_INIT)
861                     || (ct->proto.tcp.last_dir == dir
862                         && ct->proto.tcp.last_index == TCP_RST_SET)) {
863                         /* Attempt to reopen a closed/aborted connection.
864                          * Delete this connection and look up again. */
865                         write_unlock_bh(&tcp_lock);
866
867                         /* Only repeat if we can actually remove the timer.
868                          * Destruction may already be in progress in process
869                          * context and we must give it a chance to terminate.
870                          */
871                         if (nf_ct_kill(ct))
872                                 return -NF_REPEAT;
873                         return NF_DROP;
874                 }
875                 /* Fall through */
876         case TCP_CONNTRACK_IGNORE:
877                 /* Ignored packets:
878                  *
879                  * Our connection entry may be out of sync, so ignore
880                  * packets which may signal the real connection between
881                  * the client and the server.
882                  *
883                  * a) SYN in ORIGINAL
884                  * b) SYN/ACK in REPLY
885                  * c) ACK in reply direction after initial SYN in original.
886                  *
887                  * If the ignored packet is invalid, the receiver will send
888                  * a RST we'll catch below.
889                  */
890                 if (index == TCP_SYNACK_SET
891                     && ct->proto.tcp.last_index == TCP_SYN_SET
892                     && ct->proto.tcp.last_dir != dir
893                     && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
894                         /* b) This SYN/ACK acknowledges a SYN that we earlier
895                          * ignored as invalid. This means that the client and
896                          * the server are both in sync, while the firewall is
897                          * not. We kill this session and block the SYN/ACK so
898                          * that the client cannot but retransmit its SYN and
899                          * thus initiate a clean new session.
900                          */
901                         write_unlock_bh(&tcp_lock);
902                         if (LOG_INVALID(net, IPPROTO_TCP))
903                                 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
904                                           "nf_ct_tcp: killing out of sync session ");
905                         nf_ct_kill(ct);
906                         return NF_DROP;
907                 }
908                 ct->proto.tcp.last_index = index;
909                 ct->proto.tcp.last_dir = dir;
910                 ct->proto.tcp.last_seq = ntohl(th->seq);
911                 ct->proto.tcp.last_end =
912                     segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
913
914                 write_unlock_bh(&tcp_lock);
915                 if (LOG_INVALID(net, IPPROTO_TCP))
916                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
917                                   "nf_ct_tcp: invalid packet ignored ");
918                 return NF_ACCEPT;
919         case TCP_CONNTRACK_MAX:
920                 /* Invalid packet */
921                 pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
922                          dir, get_conntrack_index(th), old_state);
923                 write_unlock_bh(&tcp_lock);
924                 if (LOG_INVALID(net, IPPROTO_TCP))
925                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
926                                   "nf_ct_tcp: invalid state ");
927                 return -NF_ACCEPT;
928         case TCP_CONNTRACK_CLOSE:
929                 if (index == TCP_RST_SET
930                     && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
931                     && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
932                         /* Invalid RST  */
933                         write_unlock_bh(&tcp_lock);
934                         if (LOG_INVALID(net, IPPROTO_TCP))
935                                 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
936                                           "nf_ct_tcp: invalid RST ");
937                         return -NF_ACCEPT;
938                 }
939                 if (index == TCP_RST_SET
940                     && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
941                          && ct->proto.tcp.last_index == TCP_SYN_SET)
942                         || (!test_bit(IPS_ASSURED_BIT, &ct->status)
943                             && ct->proto.tcp.last_index == TCP_ACK_SET))
944                     && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
945                         /* RST sent to invalid SYN or ACK we had let through
946                          * at a) and c) above:
947                          *
948                          * a) SYN was in window then
949                          * c) we hold a half-open connection.
950                          *
951                          * Delete our connection entry.
952                          * We skip window checking, because packet might ACK
953                          * segments we ignored. */
954                         goto in_window;
955                 }
956                 /* Just fall through */
957         default:
958                 /* Keep compilers happy. */
959                 break;
960         }
961
962         if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
963                            skb, dataoff, th, pf)) {
964                 write_unlock_bh(&tcp_lock);
965                 return -NF_ACCEPT;
966         }
967      in_window:
968         /* From now on we have got in-window packets */
969         ct->proto.tcp.last_index = index;
970         ct->proto.tcp.last_dir = dir;
971
972         pr_debug("tcp_conntracks: ");
973         nf_ct_dump_tuple(tuple);
974         pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
975                  (th->syn ? 1 : 0), (th->ack ? 1 : 0),
976                  (th->fin ? 1 : 0), (th->rst ? 1 : 0),
977                  old_state, new_state);
978
979         ct->proto.tcp.state = new_state;
980         if (old_state != new_state
981             && new_state == TCP_CONNTRACK_FIN_WAIT)
982                 ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
983
984         if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans &&
985             tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans)
986                 timeout = nf_ct_tcp_timeout_max_retrans;
987         else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
988                  IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
989                  tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged)
990                 timeout = nf_ct_tcp_timeout_unacknowledged;
991         else
992                 timeout = tcp_timeouts[new_state];
993         write_unlock_bh(&tcp_lock);
994
995         nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
996         if (new_state != old_state)
997                 nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
998
999         if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1000                 /* If only reply is a RST, we can consider ourselves not to
1001                    have an established connection: this is a fairly common
1002                    problem case, so we can delete the conntrack
1003                    immediately.  --RR */
1004                 if (th->rst) {
1005                         nf_ct_kill_acct(ct, ctinfo, skb);
1006                         return NF_ACCEPT;
1007                 }
1008         } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1009                    && (old_state == TCP_CONNTRACK_SYN_RECV
1010                        || old_state == TCP_CONNTRACK_ESTABLISHED)
1011                    && new_state == TCP_CONNTRACK_ESTABLISHED) {
1012                 /* Set ASSURED if we see see valid ack in ESTABLISHED
1013                    after SYN_RECV or a valid answer for a picked up
1014                    connection. */
1015                 set_bit(IPS_ASSURED_BIT, &ct->status);
1016                 nf_conntrack_event_cache(IPCT_STATUS, ct);
1017         }
1018         nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1019
1020         return NF_ACCEPT;
1021 }
1022
1023 /* Called when a new connection for this protocol found. */
1024 static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1025                     unsigned int dataoff)
1026 {
1027         enum tcp_conntrack new_state;
1028         const struct tcphdr *th;
1029         struct tcphdr _tcph;
1030         const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
1031         const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
1032
1033         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
1034         BUG_ON(th == NULL);
1035
1036         /* Don't need lock here: this conntrack not in circulation yet */
1037         new_state
1038                 = tcp_conntracks[0][get_conntrack_index(th)]
1039                 [TCP_CONNTRACK_NONE];
1040
1041         /* Invalid: delete conntrack */
1042         if (new_state >= TCP_CONNTRACK_MAX) {
1043                 pr_debug("nf_ct_tcp: invalid new deleting.\n");
1044                 return false;
1045         }
1046
1047         if (new_state == TCP_CONNTRACK_SYN_SENT) {
1048                 /* SYN packet */
1049                 ct->proto.tcp.seen[0].td_end =
1050                         segment_seq_plus_len(ntohl(th->seq), skb->len,
1051                                              dataoff, th);
1052                 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1053                 if (ct->proto.tcp.seen[0].td_maxwin == 0)
1054                         ct->proto.tcp.seen[0].td_maxwin = 1;
1055                 ct->proto.tcp.seen[0].td_maxend =
1056                         ct->proto.tcp.seen[0].td_end;
1057
1058                 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
1059                 ct->proto.tcp.seen[1].flags = 0;
1060         } else if (nf_ct_tcp_loose == 0) {
1061                 /* Don't try to pick up connections. */
1062                 return false;
1063         } else {
1064                 /*
1065                  * We are in the middle of a connection,
1066                  * its history is lost for us.
1067                  * Let's try to use the data from the packet.
1068                  */
1069                 ct->proto.tcp.seen[0].td_end =
1070                         segment_seq_plus_len(ntohl(th->seq), skb->len,
1071                                              dataoff, th);
1072                 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1073                 if (ct->proto.tcp.seen[0].td_maxwin == 0)
1074                         ct->proto.tcp.seen[0].td_maxwin = 1;
1075                 ct->proto.tcp.seen[0].td_maxend =
1076                         ct->proto.tcp.seen[0].td_end +
1077                         ct->proto.tcp.seen[0].td_maxwin;
1078                 ct->proto.tcp.seen[0].td_scale = 0;
1079
1080                 /* We assume SACK and liberal window checking to handle
1081                  * window scaling */
1082                 ct->proto.tcp.seen[0].flags =
1083                 ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
1084                                               IP_CT_TCP_FLAG_BE_LIBERAL;
1085         }
1086
1087         ct->proto.tcp.seen[1].td_end = 0;
1088         ct->proto.tcp.seen[1].td_maxend = 0;
1089         ct->proto.tcp.seen[1].td_maxwin = 1;
1090         ct->proto.tcp.seen[1].td_scale = 0;
1091
1092         /* tcp_packet will set them */
1093         ct->proto.tcp.state = TCP_CONNTRACK_NONE;
1094         ct->proto.tcp.last_index = TCP_NONE_SET;
1095
1096         pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1097                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1098                  sender->td_end, sender->td_maxend, sender->td_maxwin,
1099                  sender->td_scale,
1100                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1101                  receiver->td_scale);
1102         return true;
1103 }
1104
1105 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
1106
1107 #include <linux/netfilter/nfnetlink.h>
1108 #include <linux/netfilter/nfnetlink_conntrack.h>
1109
1110 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1111                          const struct nf_conn *ct)
1112 {
1113         struct nlattr *nest_parms;
1114         struct nf_ct_tcp_flags tmp = {};
1115
1116         read_lock_bh(&tcp_lock);
1117         nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
1118         if (!nest_parms)
1119                 goto nla_put_failure;
1120
1121         NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state);
1122
1123         NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1124                    ct->proto.tcp.seen[0].td_scale);
1125
1126         NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1127                    ct->proto.tcp.seen[1].td_scale);
1128
1129         tmp.flags = ct->proto.tcp.seen[0].flags;
1130         NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
1131                 sizeof(struct nf_ct_tcp_flags), &tmp);
1132
1133         tmp.flags = ct->proto.tcp.seen[1].flags;
1134         NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
1135                 sizeof(struct nf_ct_tcp_flags), &tmp);
1136         read_unlock_bh(&tcp_lock);
1137
1138         nla_nest_end(skb, nest_parms);
1139
1140         return 0;
1141
1142 nla_put_failure:
1143         read_unlock_bh(&tcp_lock);
1144         return -1;
1145 }
1146
1147 static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
1148         [CTA_PROTOINFO_TCP_STATE]           = { .type = NLA_U8 },
1149         [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
1150         [CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
1151         [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
1152         [CTA_PROTOINFO_TCP_FLAGS_REPLY]     = { .len =  sizeof(struct nf_ct_tcp_flags) },
1153 };
1154
1155 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1156 {
1157         struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1158         struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1159         int err;
1160
1161         /* updates could not contain anything about the private
1162          * protocol info, in that case skip the parsing */
1163         if (!pattr)
1164                 return 0;
1165
1166         err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy);
1167         if (err < 0)
1168                 return err;
1169
1170         if (tb[CTA_PROTOINFO_TCP_STATE] &&
1171             nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1172                 return -EINVAL;
1173
1174         write_lock_bh(&tcp_lock);
1175         if (tb[CTA_PROTOINFO_TCP_STATE])
1176                 ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1177
1178         if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1179                 struct nf_ct_tcp_flags *attr =
1180                         nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1181                 ct->proto.tcp.seen[0].flags &= ~attr->mask;
1182                 ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1183         }
1184
1185         if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1186                 struct nf_ct_tcp_flags *attr =
1187                         nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1188                 ct->proto.tcp.seen[1].flags &= ~attr->mask;
1189                 ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1190         }
1191
1192         if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1193             tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1194             ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1195             ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1196                 ct->proto.tcp.seen[0].td_scale =
1197                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1198                 ct->proto.tcp.seen[1].td_scale =
1199                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1200         }
1201         write_unlock_bh(&tcp_lock);
1202
1203         return 0;
1204 }
1205
1206 static int tcp_nlattr_size(void)
1207 {
1208         return nla_total_size(0)           /* CTA_PROTOINFO_TCP */
1209                 + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1);
1210 }
1211
1212 static int tcp_nlattr_tuple_size(void)
1213 {
1214         return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1215 }
1216 #endif
1217
1218 #ifdef CONFIG_SYSCTL
1219 static unsigned int tcp_sysctl_table_users;
1220 static struct ctl_table_header *tcp_sysctl_header;
1221 static struct ctl_table tcp_sysctl_table[] = {
1222         {
1223                 .procname       = "nf_conntrack_tcp_timeout_syn_sent",
1224                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],
1225                 .maxlen         = sizeof(unsigned int),
1226                 .mode           = 0644,
1227                 .proc_handler   = proc_dointvec_jiffies,
1228         },
1229         {
1230                 .procname       = "nf_conntrack_tcp_timeout_syn_recv",
1231                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],
1232                 .maxlen         = sizeof(unsigned int),
1233                 .mode           = 0644,
1234                 .proc_handler   = proc_dointvec_jiffies,
1235         },
1236         {
1237                 .procname       = "nf_conntrack_tcp_timeout_established",
1238                 .data           = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],
1239                 .maxlen         = sizeof(unsigned int),
1240                 .mode           = 0644,
1241                 .proc_handler   = proc_dointvec_jiffies,
1242         },
1243         {
1244                 .procname       = "nf_conntrack_tcp_timeout_fin_wait",
1245                 .data           = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],
1246                 .maxlen         = sizeof(unsigned int),
1247                 .mode           = 0644,
1248                 .proc_handler   = proc_dointvec_jiffies,
1249         },
1250         {
1251                 .procname       = "nf_conntrack_tcp_timeout_close_wait",
1252                 .data           = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],
1253                 .maxlen         = sizeof(unsigned int),
1254                 .mode           = 0644,
1255                 .proc_handler   = proc_dointvec_jiffies,
1256         },
1257         {
1258                 .procname       = "nf_conntrack_tcp_timeout_last_ack",
1259                 .data           = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],
1260                 .maxlen         = sizeof(unsigned int),
1261                 .mode           = 0644,
1262                 .proc_handler   = proc_dointvec_jiffies,
1263         },
1264         {
1265                 .procname       = "nf_conntrack_tcp_timeout_time_wait",
1266                 .data           = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],
1267                 .maxlen         = sizeof(unsigned int),
1268                 .mode           = 0644,
1269                 .proc_handler   = proc_dointvec_jiffies,
1270         },
1271         {
1272                 .procname       = "nf_conntrack_tcp_timeout_close",
1273                 .data           = &tcp_timeouts[TCP_CONNTRACK_CLOSE],
1274                 .maxlen         = sizeof(unsigned int),
1275                 .mode           = 0644,
1276                 .proc_handler   = proc_dointvec_jiffies,
1277         },
1278         {
1279                 .procname       = "nf_conntrack_tcp_timeout_max_retrans",
1280                 .data           = &nf_ct_tcp_timeout_max_retrans,
1281                 .maxlen         = sizeof(unsigned int),
1282                 .mode           = 0644,
1283                 .proc_handler   = proc_dointvec_jiffies,
1284         },
1285         {
1286                 .procname       = "nf_conntrack_tcp_timeout_unacknowledged",
1287                 .data           = &nf_ct_tcp_timeout_unacknowledged,
1288                 .maxlen         = sizeof(unsigned int),
1289                 .mode           = 0644,
1290                 .proc_handler   = proc_dointvec_jiffies,
1291         },
1292         {
1293                 .ctl_name       = NET_NF_CONNTRACK_TCP_LOOSE,
1294                 .procname       = "nf_conntrack_tcp_loose",
1295                 .data           = &nf_ct_tcp_loose,
1296                 .maxlen         = sizeof(unsigned int),
1297                 .mode           = 0644,
1298                 .proc_handler   = proc_dointvec,
1299         },
1300         {
1301                 .ctl_name       = NET_NF_CONNTRACK_TCP_BE_LIBERAL,
1302                 .procname       = "nf_conntrack_tcp_be_liberal",
1303                 .data           = &nf_ct_tcp_be_liberal,
1304                 .maxlen         = sizeof(unsigned int),
1305                 .mode           = 0644,
1306                 .proc_handler   = proc_dointvec,
1307         },
1308         {
1309                 .ctl_name       = NET_NF_CONNTRACK_TCP_MAX_RETRANS,
1310                 .procname       = "nf_conntrack_tcp_max_retrans",
1311                 .data           = &nf_ct_tcp_max_retrans,
1312                 .maxlen         = sizeof(unsigned int),
1313                 .mode           = 0644,
1314                 .proc_handler   = proc_dointvec,
1315         },
1316         {
1317                 .ctl_name       = 0
1318         }
1319 };
1320
1321 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
1322 static struct ctl_table tcp_compat_sysctl_table[] = {
1323         {
1324                 .procname       = "ip_conntrack_tcp_timeout_syn_sent",
1325                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],
1326                 .maxlen         = sizeof(unsigned int),
1327                 .mode           = 0644,
1328                 .proc_handler   = proc_dointvec_jiffies,
1329         },
1330         {
1331                 .procname       = "ip_conntrack_tcp_timeout_syn_recv",
1332                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],
1333                 .maxlen         = sizeof(unsigned int),
1334                 .mode           = 0644,
1335                 .proc_handler   = proc_dointvec_jiffies,
1336         },
1337         {
1338                 .procname       = "ip_conntrack_tcp_timeout_established",
1339                 .data           = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],
1340                 .maxlen         = sizeof(unsigned int),
1341                 .mode           = 0644,
1342                 .proc_handler   = proc_dointvec_jiffies,
1343         },
1344         {
1345                 .procname       = "ip_conntrack_tcp_timeout_fin_wait",
1346                 .data           = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],
1347                 .maxlen         = sizeof(unsigned int),
1348                 .mode           = 0644,
1349                 .proc_handler   = proc_dointvec_jiffies,
1350         },
1351         {
1352                 .procname       = "ip_conntrack_tcp_timeout_close_wait",
1353                 .data           = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],
1354                 .maxlen         = sizeof(unsigned int),
1355                 .mode           = 0644,
1356                 .proc_handler   = proc_dointvec_jiffies,
1357         },
1358         {
1359                 .procname       = "ip_conntrack_tcp_timeout_last_ack",
1360                 .data           = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],
1361                 .maxlen         = sizeof(unsigned int),
1362                 .mode           = 0644,
1363                 .proc_handler   = proc_dointvec_jiffies,
1364         },
1365         {
1366                 .procname       = "ip_conntrack_tcp_timeout_time_wait",
1367                 .data           = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],
1368                 .maxlen         = sizeof(unsigned int),
1369                 .mode           = 0644,
1370                 .proc_handler   = proc_dointvec_jiffies,
1371         },
1372         {
1373                 .procname       = "ip_conntrack_tcp_timeout_close",
1374                 .data           = &tcp_timeouts[TCP_CONNTRACK_CLOSE],
1375                 .maxlen         = sizeof(unsigned int),
1376                 .mode           = 0644,
1377                 .proc_handler   = proc_dointvec_jiffies,
1378         },
1379         {
1380                 .procname       = "ip_conntrack_tcp_timeout_max_retrans",
1381                 .data           = &nf_ct_tcp_timeout_max_retrans,
1382                 .maxlen         = sizeof(unsigned int),
1383                 .mode           = 0644,
1384                 .proc_handler   = proc_dointvec_jiffies,
1385         },
1386         {
1387                 .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
1388                 .procname       = "ip_conntrack_tcp_loose",
1389                 .data           = &nf_ct_tcp_loose,
1390                 .maxlen         = sizeof(unsigned int),
1391                 .mode           = 0644,
1392                 .proc_handler   = proc_dointvec,
1393         },
1394         {
1395                 .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
1396                 .procname       = "ip_conntrack_tcp_be_liberal",
1397                 .data           = &nf_ct_tcp_be_liberal,
1398                 .maxlen         = sizeof(unsigned int),
1399                 .mode           = 0644,
1400                 .proc_handler   = proc_dointvec,
1401         },
1402         {
1403                 .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
1404                 .procname       = "ip_conntrack_tcp_max_retrans",
1405                 .data           = &nf_ct_tcp_max_retrans,
1406                 .maxlen         = sizeof(unsigned int),
1407                 .mode           = 0644,
1408                 .proc_handler   = proc_dointvec,
1409         },
1410         {
1411                 .ctl_name       = 0
1412         }
1413 };
1414 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
1415 #endif /* CONFIG_SYSCTL */
1416
1417 struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
1418 {
1419         .l3proto                = PF_INET,
1420         .l4proto                = IPPROTO_TCP,
1421         .name                   = "tcp",
1422         .pkt_to_tuple           = tcp_pkt_to_tuple,
1423         .invert_tuple           = tcp_invert_tuple,
1424         .print_tuple            = tcp_print_tuple,
1425         .print_conntrack        = tcp_print_conntrack,
1426         .packet                 = tcp_packet,
1427         .new                    = tcp_new,
1428         .error                  = tcp_error,
1429 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
1430         .to_nlattr              = tcp_to_nlattr,
1431         .nlattr_size            = tcp_nlattr_size,
1432         .from_nlattr            = nlattr_to_tcp,
1433         .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
1434         .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
1435         .nlattr_tuple_size      = tcp_nlattr_tuple_size,
1436         .nla_policy             = nf_ct_port_nla_policy,
1437 #endif
1438 #ifdef CONFIG_SYSCTL
1439         .ctl_table_users        = &tcp_sysctl_table_users,
1440         .ctl_table_header       = &tcp_sysctl_header,
1441         .ctl_table              = tcp_sysctl_table,
1442 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
1443         .ctl_compat_table       = tcp_compat_sysctl_table,
1444 #endif
1445 #endif
1446 };
1447 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
1448
1449 struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
1450 {
1451         .l3proto                = PF_INET6,
1452         .l4proto                = IPPROTO_TCP,
1453         .name                   = "tcp",
1454         .pkt_to_tuple           = tcp_pkt_to_tuple,
1455         .invert_tuple           = tcp_invert_tuple,
1456         .print_tuple            = tcp_print_tuple,
1457         .print_conntrack        = tcp_print_conntrack,
1458         .packet                 = tcp_packet,
1459         .new                    = tcp_new,
1460         .error                  = tcp_error,
1461 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
1462         .to_nlattr              = tcp_to_nlattr,
1463         .nlattr_size            = tcp_nlattr_size,
1464         .from_nlattr            = nlattr_to_tcp,
1465         .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
1466         .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
1467         .nlattr_tuple_size      = tcp_nlattr_tuple_size,
1468         .nla_policy             = nf_ct_port_nla_policy,
1469 #endif
1470 #ifdef CONFIG_SYSCTL
1471         .ctl_table_users        = &tcp_sysctl_table_users,
1472         .ctl_table_header       = &tcp_sysctl_header,
1473         .ctl_table              = tcp_sysctl_table,
1474 #endif
1475 };
1476 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);