[NETFILTER]: Fix multiple problems with the conntrack event cache
[pandora-kernel.git] / net / ipv4 / netfilter / ip_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43    registrations and conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep;
74 static kmem_cache_t *ip_conntrack_expect_cachep;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
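/* Per-cpu event cache: events for the conntrack attached to the skb that is
 * currently being processed on this CPU are accumulated here and delivered in
 * a single notifier call, either when delivery is requested explicitly
 * (ip_ct_deliver_cached_events) or when the cache is handed over to a
 * different conntrack (__ip_ct_event_cache_init). */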
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89  * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95                 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96                                     ecache->ct);
97         ecache->events = 0;
98         ip_conntrack_put(ecache->ct);
99         ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103  * before the skb is handed off for async handling or freed. */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106         struct ip_conntrack_ecache *ecache;
107         
108         local_bh_disable();
109         ecache = &__get_cpu_var(ip_conntrack_ecache);
110         if (ecache->ct == ct)
111                 __ip_ct_deliver_cached_events(ecache);
112         local_bh_enable();
113 }
114
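/* Take over the per-cpu cache for a new conntrack: deliver whatever is still
 * pending for the previous owner, then grab a reference to the new one.  The
 * reference is dropped again in __ip_ct_deliver_cached_events(). */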
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117         struct ip_conntrack_ecache *ecache;
118
119         /* take care of delivering potentially old events */
120         ecache = &__get_cpu_var(ip_conntrack_ecache);
121         BUG_ON(ecache->ct == ct);
122         if (ecache->ct)
123                 __ip_ct_deliver_cached_events(ecache);
124         /* initialize for this conntrack/packet */
125         ecache->ct = ct;
126         nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called while
130  * packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133         struct ip_conntrack_ecache *ecache;
134         int cpu;
135
136         for_each_cpu(cpu) {
137                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138                 if (ecache->ct)
139                         ip_conntrack_put(ecache->ct);
140         }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
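/* Hash a tuple into the conntrack table: jhash over the source address, the
 * destination address xor'ed with the protocol number, and the combined
 * src/dst port words, keyed with a boot-time random value and reduced modulo
 * the table size. */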
151 static u_int32_t
152 hash_conntrack(const struct ip_conntrack_tuple *tuple)
153 {
154 #if 0
155         dump_tuple(tuple);
156 #endif
157         return (jhash_3words(tuple->src.ip,
158                              (tuple->dst.ip ^ tuple->dst.protonum),
159                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
160                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
161 }
162
163 int
164 ip_ct_get_tuple(const struct iphdr *iph,
165                 const struct sk_buff *skb,
166                 unsigned int dataoff,
167                 struct ip_conntrack_tuple *tuple,
168                 const struct ip_conntrack_protocol *protocol)
169 {
170         /* Never happens: packets are defragmented before conntrack sees them. */
171         if (iph->frag_off & htons(IP_OFFSET)) {
172                 printk("ip_conntrack_core: Frag of proto %u.\n",
173                        iph->protocol);
174                 return 0;
175         }
176
177         tuple->src.ip = iph->saddr;
178         tuple->dst.ip = iph->daddr;
179         tuple->dst.protonum = iph->protocol;
180         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181
182         return protocol->pkt_to_tuple(skb, dataoff, tuple);
183 }
184
185 int
186 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
187                    const struct ip_conntrack_tuple *orig,
188                    const struct ip_conntrack_protocol *protocol)
189 {
190         inverse->src.ip = orig->dst.ip;
191         inverse->dst.ip = orig->src.ip;
192         inverse->dst.protonum = orig->dst.protonum;
193         inverse->dst.dir = !orig->dst.dir;
194
195         return protocol->invert_tuple(inverse, orig);
196 }
197
198
199 /* ip_conntrack_expect helper functions */
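/* Expectations are set up by helpers, placed on ip_conntrack_expect_list with
 * a timeout taken from the helper, and are either claimed by find_expectation()
 * when the first packet of the expected connection arrives or torn down when
 * the timer expires or the master conntrack goes away. */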
200 static void unlink_expect(struct ip_conntrack_expect *exp)
201 {
202         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
203         IP_NF_ASSERT(!timer_pending(&exp->timeout));
204         list_del(&exp->list);
205         CONNTRACK_STAT_INC(expect_delete);
206         exp->master->expecting--;
207 }
208
209 void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
210 {
211         unlink_expect(exp);
212         ip_conntrack_expect_put(exp);
213 }
214
215 static void expectation_timed_out(unsigned long ul_expect)
216 {
217         struct ip_conntrack_expect *exp = (void *)ul_expect;
218
219         write_lock_bh(&ip_conntrack_lock);
220         unlink_expect(exp);
221         write_unlock_bh(&ip_conntrack_lock);
222         ip_conntrack_expect_put(exp);
223 }
224
225 struct ip_conntrack_expect *
226 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
227 {
228         struct ip_conntrack_expect *i;
229         
230         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
231                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
232                         atomic_inc(&i->use);
233                         return i;
234                 }
235         }
236         return NULL;
237 }
238
239 /* Just find an expectation corresponding to a tuple. */
240 struct ip_conntrack_expect *
241 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
242 {
243         struct ip_conntrack_expect *i;
244         
245         read_lock_bh(&ip_conntrack_lock);
246         i = __ip_conntrack_expect_find(tuple);
247         read_unlock_bh(&ip_conntrack_lock);
248
249         return i;
250 }
251
252 /* If an expectation matching this tuple is found, it is removed from the
253  * global list and returned. */
254 static struct ip_conntrack_expect *
255 find_expectation(const struct ip_conntrack_tuple *tuple)
256 {
257         struct ip_conntrack_expect *i;
258
259         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
260                 /* If master is not in hash table yet (ie. packet hasn't left
261                    this machine yet), how can other end know about expected?
262                    Hence these are not the droids you are looking for (if
263                    master ct never got confirmed, we'd hold a reference to it
264                    and weird things would happen to future packets). */
265                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
266                     && is_confirmed(i->master)
267                     && del_timer(&i->timeout)) {
268                         unlink_expect(i);
269                         return i;
270                 }
271         }
272         return NULL;
273 }
274
275 /* delete all expectations for this conntrack */
276 void ip_ct_remove_expectations(struct ip_conntrack *ct)
277 {
278         struct ip_conntrack_expect *i, *tmp;
279
280         /* Optimization: most connections never expect any others. */
281         if (ct->expecting == 0)
282                 return;
283
284         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
285                 if (i->master == ct && del_timer(&i->timeout)) {
286                         unlink_expect(i);
287                         ip_conntrack_expect_put(i);
288                 }
289         }
290 }
291
292 static void
293 clean_from_lists(struct ip_conntrack *ct)
294 {
295         unsigned int ho, hr;
296         
297         DEBUGP("clean_from_lists(%p)\n", ct);
298         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
299
300         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
301         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
302         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
303         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
304
305         /* Destroy all pending expectations */
306         ip_ct_remove_expectations(ct);
307 }
308
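/* Final destructor, invoked via nf_conntrack_put() once the last reference is
 * gone.  A confirmed conntrack has already been unhashed by death_by_timeout()/
 * clean_from_lists() at this point; an unconfirmed one is still on the
 * unconfirmed list and is unlinked here. */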
309 static void
310 destroy_conntrack(struct nf_conntrack *nfct)
311 {
312         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
313         struct ip_conntrack_protocol *proto;
314
315         DEBUGP("destroy_conntrack(%p)\n", ct);
316         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
317         IP_NF_ASSERT(!timer_pending(&ct->timeout));
318
319         set_bit(IPS_DYING_BIT, &ct->status);
320
321         /* To make sure we don't get any weird locking issues here:
322          * destroy_conntrack() MUST NOT be called with a write lock
323          * to ip_conntrack_lock!!! -HW */
324         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
325         if (proto && proto->destroy)
326                 proto->destroy(ct);
327
328         if (ip_conntrack_destroyed)
329                 ip_conntrack_destroyed(ct);
330
331         write_lock_bh(&ip_conntrack_lock);
332         /* Expectations will have been removed in clean_from_lists,
333          * except TFTP can create an expectation on the first packet,
334          * before connection is in the list, so we need to clean here,
335          * too. */
336         ip_ct_remove_expectations(ct);
337
338         /* We overload first tuple to link into unconfirmed list. */
339         if (!is_confirmed(ct)) {
340                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
341                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
342         }
343
344         CONNTRACK_STAT_INC(delete);
345         write_unlock_bh(&ip_conntrack_lock);
346
347         if (ct->master)
348                 ip_conntrack_put(ct->master);
349
350         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
351         ip_conntrack_free(ct);
352 }
353
354 static void death_by_timeout(unsigned long ul_conntrack)
355 {
356         struct ip_conntrack *ct = (void *)ul_conntrack;
357
358         ip_conntrack_event(IPCT_DESTROY, ct);
359         write_lock_bh(&ip_conntrack_lock);
360         /* Inside lock so preempt is disabled on module removal path.
361          * Otherwise we can get spurious warnings. */
362         CONNTRACK_STAT_INC(delete_list);
363         clean_from_lists(ct);
364         write_unlock_bh(&ip_conntrack_lock);
365         ip_conntrack_put(ct);
366 }
367
368 static inline int
369 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
370                     const struct ip_conntrack_tuple *tuple,
371                     const struct ip_conntrack *ignored_conntrack)
372 {
373         ASSERT_READ_LOCK(&ip_conntrack_lock);
374         return tuplehash_to_ctrack(i) != ignored_conntrack
375                 && ip_ct_tuple_equal(tuple, &i->tuple);
376 }
377
378 struct ip_conntrack_tuple_hash *
379 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
380                     const struct ip_conntrack *ignored_conntrack)
381 {
382         struct ip_conntrack_tuple_hash *h;
383         unsigned int hash = hash_conntrack(tuple);
384
385         ASSERT_READ_LOCK(&ip_conntrack_lock);
386         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
387                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
388                         CONNTRACK_STAT_INC(found);
389                         return h;
390                 }
391                 CONNTRACK_STAT_INC(searched);
392         }
393
394         return NULL;
395 }
396
397 /* Find a connection corresponding to a tuple. */
398 struct ip_conntrack_tuple_hash *
399 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
400                       const struct ip_conntrack *ignored_conntrack)
401 {
402         struct ip_conntrack_tuple_hash *h;
403
404         read_lock_bh(&ip_conntrack_lock);
405         h = __ip_conntrack_find(tuple, ignored_conntrack);
406         if (h)
407                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
408         read_unlock_bh(&ip_conntrack_lock);
409
410         return h;
411 }
412
413 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
414                                         unsigned int hash,
415                                         unsigned int repl_hash) 
416 {
417         ct->id = ++ip_conntrack_next_id;
418         list_prepend(&ip_conntrack_hash[hash],
419                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
420         list_prepend(&ip_conntrack_hash[repl_hash],
421                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
422 }
423
424 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
425 {
426         unsigned int hash, repl_hash;
427
428         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
429         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
430
431         write_lock_bh(&ip_conntrack_lock);
432         __ip_conntrack_hash_insert(ct, hash, repl_hash);
433         write_unlock_bh(&ip_conntrack_lock);
434 }
435
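/* A freshly allocated conntrack sits on the unconfirmed list (linked through
 * its ORIGINAL tuplehash) and has no timer running.  Once the first packet has
 * made it through all hooks, __ip_conntrack_confirm() moves it into the hash
 * table, arms the timeout, takes the hash table's reference and caches the
 * NEW/RELATED event. */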
436 /* Confirm a connection given skb; places it in hash table */
437 int
438 __ip_conntrack_confirm(struct sk_buff **pskb)
439 {
440         unsigned int hash, repl_hash;
441         struct ip_conntrack *ct;
442         enum ip_conntrack_info ctinfo;
443
444         ct = ip_conntrack_get(*pskb, &ctinfo);
445
446         /* ipt_REJECT uses ip_conntrack_attach to attach related
447            ICMP/TCP RST packets in other direction.  Actual packet
448            which created connection will be IP_CT_NEW or for an
449            expected connection, IP_CT_RELATED. */
450         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
451                 return NF_ACCEPT;
452
453         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
454         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
455
456         /* We're not in hash table, and we refuse to set up related
457            connections for unconfirmed conns.  But packet copies and
458            REJECT will give spurious warnings here. */
459         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
460
461         /* No external references means no one else could have
462            confirmed us. */
463         IP_NF_ASSERT(!is_confirmed(ct));
464         DEBUGP("Confirming conntrack %p\n", ct);
465
466         write_lock_bh(&ip_conntrack_lock);
467
468         /* See if there's one in the list already, including reverse:
469            NAT could have grabbed it without realizing, since we're
470            not in the hash.  If there is, we lost race. */
471         if (!LIST_FIND(&ip_conntrack_hash[hash],
472                        conntrack_tuple_cmp,
473                        struct ip_conntrack_tuple_hash *,
474                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
475             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
476                           conntrack_tuple_cmp,
477                           struct ip_conntrack_tuple_hash *,
478                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
479                 /* Remove from unconfirmed list */
480                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
481
482                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
483                 /* Timer relative to confirmation time, not original
484                    setting time, otherwise we'd get timer wrap in
485                    weird delay cases. */
486                 ct->timeout.expires += jiffies;
487                 add_timer(&ct->timeout);
488                 atomic_inc(&ct->ct_general.use);
489                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
490                 CONNTRACK_STAT_INC(insert);
491                 write_unlock_bh(&ip_conntrack_lock);
492                 if (ct->helper)
493                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
494 #ifdef CONFIG_IP_NF_NAT_NEEDED
495                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
496                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
497                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
498 #endif
499                 ip_conntrack_event_cache(master_ct(ct) ?
500                                          IPCT_RELATED : IPCT_NEW, *pskb);
501
502                 return NF_ACCEPT;
503         }
504
505         CONNTRACK_STAT_INC(insert_failed);
506         write_unlock_bh(&ip_conntrack_lock);
507
508         return NF_DROP;
509 }
510
511 /* Returns true if a connection corresponds to the tuple (required
512    for NAT). */
513 int
514 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
515                          const struct ip_conntrack *ignored_conntrack)
516 {
517         struct ip_conntrack_tuple_hash *h;
518
519         read_lock_bh(&ip_conntrack_lock);
520         h = __ip_conntrack_find(tuple, ignored_conntrack);
521         read_unlock_bh(&ip_conntrack_lock);
522
523         return h != NULL;
524 }
525
526 /* There's a small race here where we may free a just-assured
527    connection.  Too bad: we're in trouble anyway. */
528 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
529 {
530         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
531 }
532
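/* Table is full: try to make room by evicting an entry from the hash chain
 * the new tuple would go into.  We pick the last entry that never reached
 * ASSURED state (roughly the least recently used one) and kill it as if its
 * timer had expired. */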
533 static int early_drop(struct list_head *chain)
534 {
535         /* Traverse backwards: gives us oldest, which is roughly LRU */
536         struct ip_conntrack_tuple_hash *h;
537         struct ip_conntrack *ct = NULL;
538         int dropped = 0;
539
540         read_lock_bh(&ip_conntrack_lock);
541         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
542         if (h) {
543                 ct = tuplehash_to_ctrack(h);
544                 atomic_inc(&ct->ct_general.use);
545         }
546         read_unlock_bh(&ip_conntrack_lock);
547
548         if (!ct)
549                 return dropped;
550
551         if (del_timer(&ct->timeout)) {
552                 death_by_timeout((unsigned long)ct);
553                 dropped = 1;
554                 CONNTRACK_STAT_INC(early_drop);
555         }
556         ip_conntrack_put(ct);
557         return dropped;
558 }
559
560 static inline int helper_cmp(const struct ip_conntrack_helper *i,
561                              const struct ip_conntrack_tuple *rtuple)
562 {
563         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
564 }
565
566 static struct ip_conntrack_helper *
567 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
568 {
569         return LIST_FIND(&helpers, helper_cmp,
570                          struct ip_conntrack_helper *,
571                          tuple);
572 }
573
574 struct ip_conntrack_helper *
575 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
576 {
577         struct ip_conntrack_helper *helper;
578
579         /* need ip_conntrack_lock to ensure that the helper exists until
580          * try_module_get() is called */
581         read_lock_bh(&ip_conntrack_lock);
582
583         helper = __ip_conntrack_helper_find(tuple);
584         if (helper) {
585                 /* need to increase module usage count to ensure the helper will
586                  * not go away while the caller is e.g. busy putting a
587                  * conntrack in the hash that uses the helper */
588                 if (!try_module_get(helper->me))
589                         helper = NULL;
590         }
591
592         read_unlock_bh(&ip_conntrack_lock);
593
594         return helper;
595 }
596
597 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
598 {
599         module_put(helper->me);
600 }
601
602 struct ip_conntrack_protocol *
603 __ip_conntrack_proto_find(u_int8_t protocol)
604 {
605         return ip_ct_protos[protocol];
606 }
607
608 /* this is guaranteed to always return a valid protocol helper, since
609  * it falls back to generic_protocol */
610 struct ip_conntrack_protocol *
611 ip_conntrack_proto_find_get(u_int8_t protocol)
612 {
613         struct ip_conntrack_protocol *p;
614
615         preempt_disable();
616         p = __ip_conntrack_proto_find(protocol);
617         if (p) {
618                 if (!try_module_get(p->me))
619                         p = &ip_conntrack_generic_protocol;
620         }
621         preempt_enable();
622         
623         return p;
624 }
625
626 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
627 {
628         module_put(p->me);
629 }
630
631 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
632                                         struct ip_conntrack_tuple *repl)
633 {
634         struct ip_conntrack *conntrack;
635
636         if (!ip_conntrack_hash_rnd_initted) {
637                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
638                 ip_conntrack_hash_rnd_initted = 1;
639         }
640
641         if (ip_conntrack_max
642             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
643                 unsigned int hash = hash_conntrack(orig);
644                 /* Try dropping from this hash chain. */
645                 if (!early_drop(&ip_conntrack_hash[hash])) {
646                         if (net_ratelimit())
647                                 printk(KERN_WARNING
648                                        "ip_conntrack: table full, dropping"
649                                        " packet.\n");
650                         return ERR_PTR(-ENOMEM);
651                 }
652         }
653
654         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
655         if (!conntrack) {
656                 DEBUGP("Can't allocate conntrack.\n");
657                 return NULL;
658         }
659
660         memset(conntrack, 0, sizeof(*conntrack));
661         atomic_set(&conntrack->ct_general.use, 1);
662         conntrack->ct_general.destroy = destroy_conntrack;
663         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
664         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
665         /* Don't set timer yet: wait for confirmation */
666         init_timer(&conntrack->timeout);
667         conntrack->timeout.data = (unsigned long)conntrack;
668         conntrack->timeout.function = death_by_timeout;
669
670         atomic_inc(&ip_conntrack_count);
671
672         return conntrack;
673 }
674
675 void
676 ip_conntrack_free(struct ip_conntrack *conntrack)
677 {
678         atomic_dec(&ip_conntrack_count);
679         kmem_cache_free(ip_conntrack_cachep, conntrack);
680 }
681
682 /* Allocate a new conntrack: we return -ENOMEM if classification
683  * failed due to stress.   Otherwise it really is unclassifiable */
684 static struct ip_conntrack_tuple_hash *
685 init_conntrack(struct ip_conntrack_tuple *tuple,
686                struct ip_conntrack_protocol *protocol,
687                struct sk_buff *skb)
688 {
689         struct ip_conntrack *conntrack;
690         struct ip_conntrack_tuple repl_tuple;
691         struct ip_conntrack_expect *exp;
692
693         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
694                 DEBUGP("Can't invert tuple.\n");
695                 return NULL;
696         }
697
698         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
699         if (!conntrack || IS_ERR(conntrack))
                    return (struct ip_conntrack_tuple_hash *)conntrack;
700
701         if (!protocol->new(conntrack, skb)) {
702                 ip_conntrack_free(conntrack);
703                 return NULL;
704         }
705
706         write_lock_bh(&ip_conntrack_lock);
707         exp = find_expectation(tuple);
708
709         if (exp) {
710                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
711                         conntrack, exp);
712                 /* Welcome, Mr. Bond.  We've been expecting you... */
713                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
714                 conntrack->master = exp->master;
715 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
716                 conntrack->mark = exp->master->mark;
717 #endif
718 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
719     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
720                 /* this is ugly, but there is no other place where to put it */
721                 conntrack->nat.masq_index = exp->master->nat.masq_index;
722 #endif
723                 nf_conntrack_get(&conntrack->master->ct_general);
724                 CONNTRACK_STAT_INC(expect_new);
725         } else {
726                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
727
728                 CONNTRACK_STAT_INC(new);
729         }
730
731         /* Overload tuple linked list to put us in unconfirmed list. */
732         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
733
734         write_unlock_bh(&ip_conntrack_lock);
735
736         if (exp) {
737                 if (exp->expectfn)
738                         exp->expectfn(conntrack, exp);
739                 ip_conntrack_expect_put(exp);
740         }
741
742         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
743 }
744
745 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo; NULL means invalid, ERR_PTR means drop. */
746 static inline struct ip_conntrack *
747 resolve_normal_ct(struct sk_buff *skb,
748                   struct ip_conntrack_protocol *proto,
749                   int *set_reply,
750                   unsigned int hooknum,
751                   enum ip_conntrack_info *ctinfo)
752 {
753         struct ip_conntrack_tuple tuple;
754         struct ip_conntrack_tuple_hash *h;
755         struct ip_conntrack *ct;
756
757         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
758
759         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
760                                 &tuple, proto))
761                 return NULL;
762
763         /* look for tuple match */
764         h = ip_conntrack_find_get(&tuple, NULL);
765         if (!h) {
766                 h = init_conntrack(&tuple, proto, skb);
767                 if (!h)
768                         return NULL;
769                 if (IS_ERR(h))
770                         return (void *)h;
771         }
772         ct = tuplehash_to_ctrack(h);
773
774         /* It exists; we have (non-exclusive) reference. */
775         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
776                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
777                 /* Please set reply bit if this packet OK */
778                 *set_reply = 1;
779         } else {
780                 /* Once we've had two way comms, always ESTABLISHED. */
781                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
782                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
783                                ct);
784                         *ctinfo = IP_CT_ESTABLISHED;
785                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
786                         DEBUGP("ip_conntrack_in: related packet for %p\n",
787                                ct);
788                         *ctinfo = IP_CT_RELATED;
789                 } else {
790                         DEBUGP("ip_conntrack_in: new packet for %p\n",
791                                ct);
792                         *ctinfo = IP_CT_NEW;
793                 }
794                 *set_reply = 0;
795         }
796         skb->nfct = &ct->ct_general;
797         skb->nfctinfo = *ctinfo;
798         return ct;
799 }
800
801 /* Netfilter hook itself. */
802 unsigned int ip_conntrack_in(unsigned int hooknum,
803                              struct sk_buff **pskb,
804                              const struct net_device *in,
805                              const struct net_device *out,
806                              int (*okfn)(struct sk_buff *))
807 {
808         struct ip_conntrack *ct;
809         enum ip_conntrack_info ctinfo;
810         struct ip_conntrack_protocol *proto;
811         int set_reply = 0;
812         int ret;
813
814         /* Previously seen (loopback or untracked)?  Ignore. */
815         if ((*pskb)->nfct) {
816                 CONNTRACK_STAT_INC(ignore);
817                 return NF_ACCEPT;
818         }
819
820         /* Never happens: packets are defragmented before conntrack sees them. */
821         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
822                 if (net_ratelimit()) {
823                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
824                                (*pskb)->nh.iph->protocol, hooknum);
825                 }
826                 return NF_DROP;
827         }
828
829 /* Doesn't cover locally-generated broadcast, so not worth it. */
830 #if 0
831         /* Ignore broadcast: no `connection'. */
832         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
833                 printk("Broadcast packet!\n");
834                 return NF_ACCEPT;
835         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
836                    == htonl(0x000000FF)) {
837                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
838                        NIPQUAD((*pskb)->nh.iph->saddr),
839                        NIPQUAD((*pskb)->nh.iph->daddr),
840                        (*pskb)->sk, (*pskb)->pkt_type);
841         }
842 #endif
843
844         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
845
846         /* It may be a special packet (error, unclean, ...); the inverse
847          * of the return code tells the netfilter core what to do with
848          * the packet. */
849         if (proto->error != NULL 
850             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
851                 CONNTRACK_STAT_INC(error);
852                 CONNTRACK_STAT_INC(invalid);
853                 return -ret;
854         }
855
856         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
857                 /* Not valid part of a connection */
858                 CONNTRACK_STAT_INC(invalid);
859                 return NF_ACCEPT;
860         }
861
862         if (IS_ERR(ct)) {
863                 /* Too stressed to deal. */
864                 CONNTRACK_STAT_INC(drop);
865                 return NF_DROP;
866         }
867
868         IP_NF_ASSERT((*pskb)->nfct);
869
870         ret = proto->packet(ct, *pskb, ctinfo);
871         if (ret < 0) {
872                 /* Invalid: inverse of the return code tells
873                  * the netfilter core what to do */
874                 nf_conntrack_put((*pskb)->nfct);
875                 (*pskb)->nfct = NULL;
876                 CONNTRACK_STAT_INC(invalid);
877                 return -ret;
878         }
879
880         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
881                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
882
883         return ret;
884 }
885
886 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
887                    const struct ip_conntrack_tuple *orig)
888 {
889         return ip_ct_invert_tuple(inverse, orig, 
890                                   __ip_conntrack_proto_find(orig->dst.protonum));
891 }
892
893 /* Would two expected things clash? */
894 static inline int expect_clash(const struct ip_conntrack_expect *a,
895                                const struct ip_conntrack_expect *b)
896 {
897         /* Part covered by intersection of masks must be unequal,
898            otherwise they clash */
899         struct ip_conntrack_tuple intersect_mask
900                 = { { a->mask.src.ip & b->mask.src.ip,
901                       { a->mask.src.u.all & b->mask.src.u.all } },
902                     { a->mask.dst.ip & b->mask.dst.ip,
903                       { a->mask.dst.u.all & b->mask.dst.u.all },
904                       a->mask.dst.protonum & b->mask.dst.protonum } };
905
906         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
907 }
908
909 static inline int expect_matches(const struct ip_conntrack_expect *a,
910                                  const struct ip_conntrack_expect *b)
911 {
912         return a->master == b->master
913                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
914                 && ip_ct_tuple_equal(&a->mask, &b->mask);
915 }
916
917 /* Generally a bad idea to call this: could have matched already. */
918 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
919 {
920         struct ip_conntrack_expect *i;
921
922         write_lock_bh(&ip_conntrack_lock);
923         /* choose the oldest expectation to evict */
924         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
925                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
926                         unlink_expect(i);
927                         write_unlock_bh(&ip_conntrack_lock);
928                         ip_conntrack_expect_put(i);
929                         return;
930                 }
931         }
932         write_unlock_bh(&ip_conntrack_lock);
933 }
934
935 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
936 {
937         struct ip_conntrack_expect *new;
938
939         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
940         if (!new) {
941                 DEBUGP("expect_related: OOM allocating expect\n");
942                 return NULL;
943         }
944         new->master = me;
945         atomic_inc(&new->master->ct_general.use);
946         atomic_set(&new->use, 1);
947         return new;
948 }
949
950 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
951 {
952         if (atomic_dec_and_test(&exp->use)) {
953                 ip_conntrack_put(exp->master);
954                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
955         }
956 }
957
958 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
959 {
960         atomic_inc(&exp->use);
961         exp->master->expecting++;
962         list_add(&exp->list, &ip_conntrack_expect_list);
963
964         init_timer(&exp->timeout);
965         exp->timeout.data = (unsigned long)exp;
966         exp->timeout.function = expectation_timed_out;
967         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
968         add_timer(&exp->timeout);
969
970         exp->id = ++ip_conntrack_expect_next_id;
971         atomic_inc(&exp->use);
972         CONNTRACK_STAT_INC(expect_create);
973 }
974
975 /* Race with expectations being used means we could have none to find; OK. */
976 static void evict_oldest_expect(struct ip_conntrack *master)
977 {
978         struct ip_conntrack_expect *i;
979
980         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
981                 if (i->master == master) {
982                         if (del_timer(&i->timeout)) {
983                                 unlink_expect(i);
984                                 ip_conntrack_expect_put(i);
985                         }
986                         break;
987                 }
988         }
989 }
990
991 static inline int refresh_timer(struct ip_conntrack_expect *i)
992 {
993         if (!del_timer(&i->timeout))
994                 return 0;
995
996         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
997         add_timer(&i->timeout);
998         return 1;
999 }
1000
1001 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1002 {
1003         struct ip_conntrack_expect *i;
1004         int ret;
1005
1006         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1007         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1008         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1009
1010         write_lock_bh(&ip_conntrack_lock);
1011         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1012                 if (expect_matches(i, expect)) {
1013                         /* Refresh timer: if it's dying, ignore.. */
1014                         if (refresh_timer(i)) {
1015                                 ret = 0;
1016                                 goto out;
1017                         }
1018                 } else if (expect_clash(i, expect)) {
1019                         ret = -EBUSY;
1020                         goto out;
1021                 }
1022         }
1023
1024         /* Will be over limit? */
1025         if (expect->master->helper->max_expected && 
1026             expect->master->expecting >= expect->master->helper->max_expected)
1027                 evict_oldest_expect(expect->master);
1028
1029         ip_conntrack_expect_insert(expect);
1030         ip_conntrack_expect_event(IPEXP_NEW, expect);
1031         ret = 0;
1032 out:
1033         write_unlock_bh(&ip_conntrack_lock);
1034         return ret;
1035 }
1036
1037 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1038    implicitly racy: see __ip_conntrack_confirm */
1039 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1040                               const struct ip_conntrack_tuple *newreply)
1041 {
1042         write_lock_bh(&ip_conntrack_lock);
1043         /* Should be unconfirmed, so not in hash table yet */
1044         IP_NF_ASSERT(!is_confirmed(conntrack));
1045
1046         DEBUGP("Altering reply tuple of %p to ", conntrack);
1047         DUMP_TUPLE(newreply);
1048
1049         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1050         if (!conntrack->master && conntrack->expecting == 0)
1051                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1052         write_unlock_bh(&ip_conntrack_lock);
1053 }
1054
1055 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1056 {
1057         BUG_ON(me->timeout == 0);
1058         write_lock_bh(&ip_conntrack_lock);
1059         list_prepend(&helpers, me);
1060         write_unlock_bh(&ip_conntrack_lock);
1061
1062         return 0;
1063 }
1064
1065 struct ip_conntrack_helper *
1066 __ip_conntrack_helper_find_byname(const char *name)
1067 {
1068         struct ip_conntrack_helper *h;
1069
1070         list_for_each_entry(h, &helpers, list) {
1071                 if (!strcmp(h->name, name))
1072                         return h;
1073         }
1074
1075         return NULL;
1076 }
1077
1078 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1079                          const struct ip_conntrack_helper *me)
1080 {
1081         if (tuplehash_to_ctrack(i)->helper == me) {
1082                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1083                 tuplehash_to_ctrack(i)->helper = NULL;
1084         }
1085         return 0;
1086 }
1087
1088 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1089 {
1090         unsigned int i;
1091         struct ip_conntrack_expect *exp, *tmp;
1092
1093         /* Need write lock here, to delete helper. */
1094         write_lock_bh(&ip_conntrack_lock);
1095         LIST_DELETE(&helpers, me);
1096
1097         /* Get rid of expectations */
1098         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1099                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1100                         unlink_expect(exp);
1101                         ip_conntrack_expect_put(exp);
1102                 }
1103         }
1104         /* Clear the helper pointer on any conntracks still using it. */
1105         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1106         for (i = 0; i < ip_conntrack_htable_size; i++)
1107                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1108                             struct ip_conntrack_tuple_hash *, me);
1109         write_unlock_bh(&ip_conntrack_lock);
1110
1111         /* Someone could be still looking at the helper in a bh. */
1112         synchronize_net();
1113 }
1114
1115 static inline void ct_add_counters(struct ip_conntrack *ct,
1116                                    enum ip_conntrack_info ctinfo,
1117                                    const struct sk_buff *skb)
1118 {
1119 #ifdef CONFIG_IP_NF_CT_ACCT
1120         if (skb) {
1121                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1122                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1123                                         ntohs(skb->nh.iph->tot_len);
1124         }
1125 #endif
1126 }
1127
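/* Timeouts are not decremented as time passes; instead every accepted packet
 * restarts the timer with the timeout the protocol currently deems right.
 * For an unconfirmed conntrack only the pending expires value is updated,
 * since the timer is not armed until __ip_conntrack_confirm(). */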
1128 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1129 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1130                         enum ip_conntrack_info ctinfo,
1131                         const struct sk_buff *skb,
1132                         unsigned long extra_jiffies)
1133 {
1134         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1135
1136         /* If not in hash table, timer will not be active yet */
1137         if (!is_confirmed(ct)) {
1138                 ct->timeout.expires = extra_jiffies;
1139                 ct_add_counters(ct, ctinfo, skb);
1140         } else {
1141                 write_lock_bh(&ip_conntrack_lock);
1142                 /* Need del_timer for race avoidance (may already be dying). */
1143                 if (del_timer(&ct->timeout)) {
1144                         ct->timeout.expires = jiffies + extra_jiffies;
1145                         add_timer(&ct->timeout);
1146                         ip_conntrack_event_cache(IPCT_REFRESH, skb);
1147                 }
1148                 ct_add_counters(ct, ctinfo, skb);
1149                 write_unlock_bh(&ip_conntrack_lock);
1150         }
1151 }
1152
1153 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1154     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1155 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1156  * in ip_conntrack_core, since we don't want the protocols to autoload
1157  * or depend on ctnetlink */
1158 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1159                                const struct ip_conntrack_tuple *tuple)
1160 {
1161         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1162                 &tuple->src.u.tcp.port);
1163         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1164                 &tuple->dst.u.tcp.port);
1165         return 0;
1166
1167 nfattr_failure:
1168         return -1;
1169 }
1170
1171 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1172                                struct ip_conntrack_tuple *t)
1173 {
1174         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1175                 return -EINVAL;
1176
1177         t->src.u.tcp.port =
1178                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1179         t->dst.u.tcp.port =
1180                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1181
1182         return 0;
1183 }
1184 #endif
1185
1186 /* Returns new sk_buff, or NULL */
1187 struct sk_buff *
1188 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1189 {
1190         skb_orphan(skb);
1191
1192         local_bh_disable(); 
1193         skb = ip_defrag(skb, user);
1194         local_bh_enable();
1195
1196         if (skb)
1197                 ip_send_check(skb->nh.iph);
1198         return skb;
1199 }
1200
1201 /* Used by ipt_REJECT. */
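/* Attach the conntrack of the packet that triggered the reject to the locally
 * generated reply (TCP RST or ICMP error), marked as RELATED in the opposite
 * direction, so that later netfilter code (e.g. NAT) treats it as belonging
 * to the original connection. */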
1202 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1203 {
1204         struct ip_conntrack *ct;
1205         enum ip_conntrack_info ctinfo;
1206
1207         /* This ICMP is in reverse direction to the packet which caused it */
1208         ct = ip_conntrack_get(skb, &ctinfo);
1209         
1210         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1211                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1212         else
1213                 ctinfo = IP_CT_RELATED;
1214
1215         /* Attach to new skbuff, and increment count */
1216         nskb->nfct = &ct->ct_general;
1217         nskb->nfctinfo = ctinfo;
1218         nf_conntrack_get(nskb->nfct);
1219 }
1220
1221 static inline int
1222 do_iter(const struct ip_conntrack_tuple_hash *i,
1223         int (*iter)(struct ip_conntrack *i, void *data),
1224         void *data)
1225 {
1226         return iter(tuplehash_to_ctrack(i), data);
1227 }
1228
1229 /* Bring out ya dead! */
1230 static struct ip_conntrack_tuple_hash *
1231 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1232                 void *data, unsigned int *bucket)
1233 {
1234         struct ip_conntrack_tuple_hash *h = NULL;
1235
1236         write_lock_bh(&ip_conntrack_lock);
1237         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1238                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1239                                 struct ip_conntrack_tuple_hash *, iter, data);
1240                 if (h)
1241                         break;
1242         }
1243         if (!h)
1244                 h = LIST_FIND_W(&unconfirmed, do_iter,
1245                                 struct ip_conntrack_tuple_hash *, iter, data);
1246         if (h)
1247                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1248         write_unlock_bh(&ip_conntrack_lock);
1249
1250         return h;
1251 }
1252
1253 void
1254 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1255 {
1256         struct ip_conntrack_tuple_hash *h;
1257         unsigned int bucket = 0;
1258
1259         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1260                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1261                 /* Time to push up daisies... */
1262                 if (del_timer(&ct->timeout))
1263                         death_by_timeout((unsigned long)ct);
1264                 /* ... else the timer will get him soon. */
1265
1266                 ip_conntrack_put(ct);
1267         }
1268 }
1269
1270 /* Fast function for those who don't want to parse /proc (and I don't
1271    blame them). */
1272 /* Reversing the socket's dst/src point of view gives us the reply
1273    mapping. */
1274 static int
1275 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1276 {
1277         struct inet_sock *inet = inet_sk(sk);
1278         struct ip_conntrack_tuple_hash *h;
1279         struct ip_conntrack_tuple tuple;
1280         
1281         IP_CT_TUPLE_U_BLANK(&tuple);
1282         tuple.src.ip = inet->rcv_saddr;
1283         tuple.src.u.tcp.port = inet->sport;
1284         tuple.dst.ip = inet->daddr;
1285         tuple.dst.u.tcp.port = inet->dport;
1286         tuple.dst.protonum = IPPROTO_TCP;
1287
1288         /* We only do TCP at the moment: is there a better way? */
1289         if (strcmp(sk->sk_prot->name, "TCP")) {
1290                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1291                 return -ENOPROTOOPT;
1292         }
1293
1294         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1295                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1296                        *len, sizeof(struct sockaddr_in));
1297                 return -EINVAL;
1298         }
1299
1300         h = ip_conntrack_find_get(&tuple, NULL);
1301         if (h) {
1302                 struct sockaddr_in sin;
1303                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1304
1305                 sin.sin_family = AF_INET;
1306                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1307                         .tuple.dst.u.tcp.port;
1308                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1309                         .tuple.dst.ip;
1310
1311                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1312                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1313                 ip_conntrack_put(ct);
1314                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1315                         return -EFAULT;
1316                 else
1317                         return 0;
1318         }
1319         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1320                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1321                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1322         return -ENOENT;
1323 }
1324
1325 static struct nf_sockopt_ops so_getorigdst = {
1326         .pf             = PF_INET,
1327         .get_optmin     = SO_ORIGINAL_DST,
1328         .get_optmax     = SO_ORIGINAL_DST+1,
1329         .get            = &getorigdst,
1330 };
1331
1332 static int kill_all(struct ip_conntrack *i, void *data)
1333 {
1334         return 1;
1335 }
1336
1337 static void free_conntrack_hash(void)
1338 {
1339         if (ip_conntrack_vmalloc)
1340                 vfree(ip_conntrack_hash);
1341         else
1342                 free_pages((unsigned long)ip_conntrack_hash, 
1343                            get_order(sizeof(struct list_head)
1344                                      * ip_conntrack_htable_size));
1345 }
1346
1347 void ip_conntrack_flush(void)
1348 {
1349         /* This makes sure all current packets have passed through
1350            netfilter framework.  Roll on, two-stage module
1351            delete... */
1352         synchronize_net();
1353
1354         ip_ct_event_cache_flush();
1355  i_see_dead_people:
1356         ip_ct_iterate_cleanup(kill_all, NULL);
1357         if (atomic_read(&ip_conntrack_count) != 0) {
1358                 schedule();
1359                 goto i_see_dead_people;
1360         }
1361         /* wait until all references to ip_conntrack_untracked are dropped */
1362         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1363                 schedule();
1364 }
1365
1366 /* Mishearing the voices in his head, our hero wonders how he's
1367    supposed to kill the mall. */
1368 void ip_conntrack_cleanup(void)
1369 {
1370         ip_ct_attach = NULL;
1371         ip_conntrack_flush();
1372         kmem_cache_destroy(ip_conntrack_cachep);
1373         kmem_cache_destroy(ip_conntrack_expect_cachep);
1374         free_conntrack_hash();
1375         nf_unregister_sockopt(&so_getorigdst);
1376 }
1377
1378 static int hashsize;
1379 module_param(hashsize, int, 0400);
1380
1381 int __init ip_conntrack_init(void)
1382 {
1383         unsigned int i;
1384         int ret;
1385
1386         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1387          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
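        /* E.g. on a 32-bit box (8-byte struct list_head), 256MB of memory
         * gives 268435456 / 16384 / 8 = 2048 buckets and thus
         * ip_conntrack_max = 8 * 2048 = 16384 conntrack entries. */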
1388         if (hashsize) {
1389                 ip_conntrack_htable_size = hashsize;
1390         } else {
1391                 ip_conntrack_htable_size
1392                         = (((num_physpages << PAGE_SHIFT) / 16384)
1393                            / sizeof(struct list_head));
1394                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1395                         ip_conntrack_htable_size = 8192;
1396                 if (ip_conntrack_htable_size < 16)
1397                         ip_conntrack_htable_size = 16;
1398         }
1399         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1400
1401         printk("ip_conntrack version %s (%u buckets, %d max)"
1402                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1403                ip_conntrack_htable_size, ip_conntrack_max,
1404                sizeof(struct ip_conntrack));
1405
1406         ret = nf_register_sockopt(&so_getorigdst);
1407         if (ret != 0) {
1408                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1409                 return ret;
1410         }
1411
1412         /* AK: the hash table is twice as big as needed because it
1413            uses list_head.  It would be much nicer for the caches to use
1414            a single-pointer list head here. */
1415         ip_conntrack_vmalloc = 0; 
1416         ip_conntrack_hash 
1417                 =(void*)__get_free_pages(GFP_KERNEL, 
1418                                          get_order(sizeof(struct list_head)
1419                                                    *ip_conntrack_htable_size));
1420         if (!ip_conntrack_hash) { 
1421                 ip_conntrack_vmalloc = 1;
1422                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1423                 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1424                                             * ip_conntrack_htable_size);
1425         }
1426         if (!ip_conntrack_hash) {
1427                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1428                 goto err_unreg_sockopt;
1429         }
1430
1431         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1432                                                 sizeof(struct ip_conntrack), 0,
1433                                                 0, NULL, NULL);
1434         if (!ip_conntrack_cachep) {
1435                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1436                 goto err_free_hash;
1437         }
1438
1439         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1440                                         sizeof(struct ip_conntrack_expect),
1441                                         0, 0, NULL, NULL);
1442         if (!ip_conntrack_expect_cachep) {
1443                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1444                 goto err_free_conntrack_slab;
1445         }
1446
1447         /* Don't NEED lock here, but good form anyway. */
1448         write_lock_bh(&ip_conntrack_lock);
1449         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1450                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1451         /* Sew in builtin protocols. */
1452         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1453         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1454         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1455         write_unlock_bh(&ip_conntrack_lock);
1456
1457         for (i = 0; i < ip_conntrack_htable_size; i++)
1458                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1459
1460         /* For use by ipt_REJECT */
1461         ip_ct_attach = ip_conntrack_attach;
1462
1463         /* Set up fake conntrack:
1464             - to never be deleted, not in any hashes */
1465         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1466         /*  - and to make it look like a confirmed connection */
1467         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1468
1469         return ret;
1470
1471 err_free_conntrack_slab:
1472         kmem_cache_destroy(ip_conntrack_cachep);
1473 err_free_hash:
1474         free_conntrack_hash();
1475 err_unreg_sockopt:
1476         nf_unregister_sockopt(&so_getorigdst);
1477
1478         return -ENOMEM;
1479 }