[NETFILTER]: Decrease number of pointer derefs in nf_conntrack_core.c
net/netfilter/nf_conntrack_core.c (pandora-kernel.git)
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize L3 protocol dependent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support for various sizes of conntrack structures.
23  *
24  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
25  */
26
27 #include <linux/config.h>
28 #include <linux/types.h>
29 #include <linux/netfilter.h>
30 #include <linux/module.h>
31 #include <linux/skbuff.h>
32 #include <linux/proc_fs.h>
33 #include <linux/vmalloc.h>
34 #include <linux/stddef.h>
35 #include <linux/slab.h>
36 #include <linux/random.h>
37 #include <linux/jhash.h>
38 #include <linux/err.h>
39 #include <linux/percpu.h>
40 #include <linux/moduleparam.h>
41 #include <linux/notifier.h>
42 #include <linux/kernel.h>
43 #include <linux/netdevice.h>
44 #include <linux/socket.h>
45
46 /* This rwlock protects the main hash table, protocol/helper/expected
47    registrations, conntrack timers */
48 #define ASSERT_READ_LOCK(x)
49 #define ASSERT_WRITE_LOCK(x)
50
51 #include <net/netfilter/nf_conntrack.h>
52 #include <net/netfilter/nf_conntrack_l3proto.h>
53 #include <net/netfilter/nf_conntrack_protocol.h>
54 #include <net/netfilter/nf_conntrack_helper.h>
55 #include <net/netfilter/nf_conntrack_core.h>
56 #include <linux/netfilter_ipv4/listhelp.h>
57
58 #define NF_CONNTRACK_VERSION    "0.4.1"
59
60 #if 0
61 #define DEBUGP printk
62 #else
63 #define DEBUGP(format, args...)
64 #endif
65
66 DEFINE_RWLOCK(nf_conntrack_lock);
67
68 /* nf_conntrack_standalone needs this */
69 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
70
71 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
72 LIST_HEAD(nf_conntrack_expect_list);
73 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
74 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
75 static LIST_HEAD(helpers);
76 unsigned int nf_conntrack_htable_size = 0;
77 int nf_conntrack_max;
78 struct list_head *nf_conntrack_hash;
79 static kmem_cache_t *nf_conntrack_expect_cachep;
80 struct nf_conn nf_conntrack_untracked;
81 unsigned int nf_ct_log_invalid;
82 static LIST_HEAD(unconfirmed);
83 static int nf_conntrack_vmalloc;
84
85 #ifdef CONFIG_NF_CONNTRACK_EVENTS
86 struct notifier_block *nf_conntrack_chain;
87 struct notifier_block *nf_conntrack_expect_chain;
88
89 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
90
91 /* deliver cached events and clear cache entry - must be called with locally
92  * disabled softirqs */
93 static inline void
94 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
95 {
96         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
97         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
98             && ecache->events)
99                 notifier_call_chain(&nf_conntrack_chain, ecache->events,
100                                     ecache->ct);
101
102         ecache->events = 0;
103         nf_ct_put(ecache->ct);
104         ecache->ct = NULL;
105 }
106
107 /* Deliver all cached events for a particular conntrack. This is called
108  * by code prior to async packet handling for freeing the skb */
109 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
110 {
111         struct nf_conntrack_ecache *ecache;
112
113         local_bh_disable();
114         ecache = &__get_cpu_var(nf_conntrack_ecache);
115         if (ecache->ct == ct)
116                 __nf_ct_deliver_cached_events(ecache);
117         local_bh_enable();
118 }
119
120 /* Deliver any pending cached events for the old conntrack if the current conntrack != old */
121 void __nf_ct_event_cache_init(struct nf_conn *ct)
122 {
123         struct nf_conntrack_ecache *ecache;
124         
125         /* take care of delivering potentially old events */
126         ecache = &__get_cpu_var(nf_conntrack_ecache);
127         BUG_ON(ecache->ct == ct);
128         if (ecache->ct)
129                 __nf_ct_deliver_cached_events(ecache);
130         /* initialize for this conntrack/packet */
131         ecache->ct = ct;
132         nf_conntrack_get(&ct->ct_general);
133 }
134
135 /* flush the event cache - touches other CPU's data and must not be called
136  * while packets are still passing through the code */
137 static void nf_ct_event_cache_flush(void)
138 {
139         struct nf_conntrack_ecache *ecache;
140         int cpu;
141
142         for_each_cpu(cpu) {
143                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
144                 if (ecache->ct)
145                         nf_ct_put(ecache->ct);
146         }
147 }
148 #else
149 static inline void nf_ct_event_cache_flush(void) {}
150 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
151
152 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
153 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
154
155 /*
156  * This scheme offers various sizes of "struct nf_conn" depending on
157  * the features required (helper, nat, ...)
158  */
159
160 #define NF_CT_FEATURES_NAMELEN  256
161 static struct {
162         /* name of slab cache. printed in /proc/slabinfo */
163         char *name;
164
165         /* size of slab cache */
166         size_t size;
167
168         /* slab cache pointer */
169         kmem_cache_t *cachep;
170
171         /* allocated slab cache + modules which use this slab cache */
172         int use;
173
174         /* Initialization */
175         int (*init_conntrack)(struct nf_conn *, u_int32_t);
176
177 } nf_ct_cache[NF_CT_F_NUM];
178
179 /* protects members of nf_ct_cache except "use" */
180 DEFINE_RWLOCK(nf_ct_cache_lock);
181
182 /* This avoids calling kmem_cache_create() with the same name simultaneously */
183 DECLARE_MUTEX(nf_ct_cache_mutex);
184
185 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
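/* Look up the L4 protocol tracker for (l3proto, protocol); falls back to
 * the generic protocol if nothing is registered for this L3 family. */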
186 struct nf_conntrack_protocol *
187 nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
188 {
189         if (unlikely(nf_ct_protos[l3proto] == NULL))
190                 return &nf_conntrack_generic_protocol;
191
192         return nf_ct_protos[l3proto][protocol];
193 }
194
195 static int nf_conntrack_hash_rnd_initted;
196 static unsigned int nf_conntrack_hash_rnd;
197
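/* Hash a tuple into the conntrack table: jhash the L3 addresses (keyed by
 * l3num/protonum and the L4 ids), mix in the random seed and reduce modulo
 * the table size. */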
198 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
199                                   unsigned int size, unsigned int rnd)
200 {
201         unsigned int a, b;
202         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
203                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
204         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
205                         (tuple->src.u.all << 16) | tuple->dst.u.all);
206
207         return jhash_2words(a, b, rnd) % size;
208 }
209
210 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
211 {
212         return __hash_conntrack(tuple, nf_conntrack_htable_size,
213                                 nf_conntrack_hash_rnd);
214 }
215
216 /* Initialize "struct nf_conn" which has spaces for helper */
217 static int
218 init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
219 {
220
221         conntrack->help = (union nf_conntrack_help *)
222                 (((unsigned long)conntrack->data
223                   + (__alignof__(union nf_conntrack_help) - 1))
224                  & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1))));
225         return 0;
226 }
227
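/* Register (or reuse) the slab cache backing conntracks with the given
 * feature set.  Callers such as nf_conntrack_helper_register() below pass
 * the enlarged object size and an init callback; returns 0 on success,
 * -EBUSY if the slot is already registered with different parameters. */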
228 int nf_conntrack_register_cache(u_int32_t features, const char *name,
229                                 size_t size,
230                                 int (*init)(struct nf_conn *, u_int32_t))
231 {
232         int ret = 0;
233         char *cache_name;
234         kmem_cache_t *cachep;
235
236         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
237                features, name, size);
238
239         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
240                 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
241                         features);
242                 return -EINVAL;
243         }
244
245         down(&nf_ct_cache_mutex);
246
247         write_lock_bh(&nf_ct_cache_lock);
248         /* e.g.: multiple helpers are loaded */
249         if (nf_ct_cache[features].use > 0) {
250                 DEBUGP("nf_conntrack_register_cache: already registered.\n");
251                 if ((!strncmp(nf_ct_cache[features].name, name,
252                               NF_CT_FEATURES_NAMELEN))
253                     && nf_ct_cache[features].size == size
254                     && nf_ct_cache[features].init_conntrack == init) {
255                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
256                         nf_ct_cache[features].use++;
257                         ret = 0;
258                 } else
259                         ret = -EBUSY;
260
261                 write_unlock_bh(&nf_ct_cache_lock);
262                 up(&nf_ct_cache_mutex);
263                 return ret;
264         }
265         write_unlock_bh(&nf_ct_cache_lock);
266
267         /*
268          * The memory space holding the slab cache name must stay alive
269          * until the cache is destroyed.
270          */
271         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
272         if (cache_name == NULL) {
273                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
274                 ret = -ENOMEM;
275                 goto out_up_mutex;
276         }
277
278         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
279                                                 >= NF_CT_FEATURES_NAMELEN) {
280                 printk("nf_conntrack_register_cache: name too long\n");
281                 ret = -EINVAL;
282                 goto out_free_name;
283         }
284
285         cachep = kmem_cache_create(cache_name, size, 0, 0,
286                                    NULL, NULL);
287         if (!cachep) {
288                 printk("nf_conntrack_register_cache: Can't create slab cache "
289                        "for the features = 0x%x\n", features);
290                 ret = -ENOMEM;
291                 goto out_free_name;
292         }
293
294         write_lock_bh(&nf_ct_cache_lock);
295         nf_ct_cache[features].use = 1;
296         nf_ct_cache[features].size = size;
297         nf_ct_cache[features].init_conntrack = init;
298         nf_ct_cache[features].cachep = cachep;
299         nf_ct_cache[features].name = cache_name;
300         write_unlock_bh(&nf_ct_cache_lock);
301
302         goto out_up_mutex;
303
304 out_free_name:
305         kfree(cache_name);
306 out_up_mutex:
307         up(&nf_ct_cache_mutex);
308         return ret;
309 }
310
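/* Drop one user of the feature's slab cache; the last user destroys the
 * cache and frees its name. */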
311 /* FIXME: At present, only nf_conntrack_cleanup() can call this function. */
312 void nf_conntrack_unregister_cache(u_int32_t features)
313 {
314         kmem_cache_t *cachep;
315         char *name;
316
317         /*
318          * This ensures that kmem_cache_create() isn't called before the
319          * slab cache is destroyed.
320          */
321         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
322         down(&nf_ct_cache_mutex);
323
324         write_lock_bh(&nf_ct_cache_lock);
325         if (--nf_ct_cache[features].use > 0) {
326                 write_unlock_bh(&nf_ct_cache_lock);
327                 up(&nf_ct_cache_mutex);
328                 return;
329         }
330         cachep = nf_ct_cache[features].cachep;
331         name = nf_ct_cache[features].name;
332         nf_ct_cache[features].cachep = NULL;
333         nf_ct_cache[features].name = NULL;
334         nf_ct_cache[features].init_conntrack = NULL;
335         nf_ct_cache[features].size = 0;
336         write_unlock_bh(&nf_ct_cache_lock);
337
338         synchronize_net();
339
340         kmem_cache_destroy(cachep);
341         kfree(name);
342
343         up(&nf_ct_cache_mutex);
344 }
345
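/* Extract a tuple from a packet: the L3 part via l3proto->pkt_to_tuple()
 * at nhoff, the L4 part via protocol->pkt_to_tuple() at dataoff.
 * Returns 0 if the packet could not be parsed. */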
346 int
347 nf_ct_get_tuple(const struct sk_buff *skb,
348                 unsigned int nhoff,
349                 unsigned int dataoff,
350                 u_int16_t l3num,
351                 u_int8_t protonum,
352                 struct nf_conntrack_tuple *tuple,
353                 const struct nf_conntrack_l3proto *l3proto,
354                 const struct nf_conntrack_protocol *protocol)
355 {
356         NF_CT_TUPLE_U_BLANK(tuple);
357
358         tuple->src.l3num = l3num;
359         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
360                 return 0;
361
362         tuple->dst.protonum = protonum;
363         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
364
365         return protocol->pkt_to_tuple(skb, dataoff, tuple);
366 }
367
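/* Build the reply-direction tuple for orig using the L3 and L4 protocol
 * invert handlers.  Returns 0 on failure. */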
368 int
369 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
370                    const struct nf_conntrack_tuple *orig,
371                    const struct nf_conntrack_l3proto *l3proto,
372                    const struct nf_conntrack_protocol *protocol)
373 {
374         NF_CT_TUPLE_U_BLANK(inverse);
375
376         inverse->src.l3num = orig->src.l3num;
377         if (l3proto->invert_tuple(inverse, orig) == 0)
378                 return 0;
379
380         inverse->dst.dir = !orig->dst.dir;
381
382         inverse->dst.protonum = orig->dst.protonum;
383         return protocol->invert_tuple(inverse, orig);
384 }
385
386 /* nf_conntrack_expect helper functions */
387 static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
388 {
389         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
390         NF_CT_ASSERT(!timer_pending(&exp->timeout));
391         list_del(&exp->list);
392         NF_CT_STAT_INC(expect_delete);
393         exp->master->expecting--;
394         nf_conntrack_expect_put(exp);
395 }
396
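/* Timer callback: the expectation expired before being matched, so unlink
 * it and drop the timer's reference. */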
397 static void expectation_timed_out(unsigned long ul_expect)
398 {
399         struct nf_conntrack_expect *exp = (void *)ul_expect;
400
401         write_lock_bh(&nf_conntrack_lock);
402         nf_ct_unlink_expect(exp);
403         write_unlock_bh(&nf_conntrack_lock);
404         nf_conntrack_expect_put(exp);
405 }
406
407 /* If an expectation for this connection is found, it is deleted from
408  * the global list and then returned. */
409 static struct nf_conntrack_expect *
410 find_expectation(const struct nf_conntrack_tuple *tuple)
411 {
412         struct nf_conntrack_expect *i;
413
414         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
415         /* If master is not in hash table yet (ie. packet hasn't left
416            this machine yet), how can other end know about expected?
417            Hence these are not the droids you are looking for (if
418            master ct never got confirmed, we'd hold a reference to it
419            and weird things would happen to future packets). */
420                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
421                     && nf_ct_is_confirmed(i->master)) {
422                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
423                                 atomic_inc(&i->use);
424                                 return i;
425                         } else if (del_timer(&i->timeout)) {
426                                 nf_ct_unlink_expect(i);
427                                 return i;
428                         }
429                 }
430         }
431         return NULL;
432 }
433
434 /* delete all expectations for this conntrack */
435 static void remove_expectations(struct nf_conn *ct)
436 {
437         struct nf_conntrack_expect *i, *tmp;
438
439         /* Optimization: most connections never expect any others. */
440         if (ct->expecting == 0)
441                 return;
442
443         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
444                 if (i->master == ct && del_timer(&i->timeout)) {
445                         nf_ct_unlink_expect(i);
446                         nf_conntrack_expect_put(i);
447                 }
448         }
449 }
450
451 static void
452 clean_from_lists(struct nf_conn *ct)
453 {
454         unsigned int ho, hr;
455         
456         DEBUGP("clean_from_lists(%p)\n", ct);
457         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
458
459         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
460         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
461         LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
462         LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
463
464         /* Destroy all pending expectations */
465         remove_expectations(ct);
466 }
467
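/* Final destructor, run when the last reference to a conntrack is dropped:
 * let the L3/L4 protocols clean up, remove any remaining expectations and
 * return the object to its slab cache. */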
468 static void
469 destroy_conntrack(struct nf_conntrack *nfct)
470 {
471         struct nf_conn *ct = (struct nf_conn *)nfct;
472         struct nf_conntrack_l3proto *l3proto;
473         struct nf_conntrack_protocol *proto;
474
475         DEBUGP("destroy_conntrack(%p)\n", ct);
476         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
477         NF_CT_ASSERT(!timer_pending(&ct->timeout));
478
479         nf_conntrack_event(IPCT_DESTROY, ct);
480         set_bit(IPS_DYING_BIT, &ct->status);
481
482         /* To make sure we don't get any weird locking issues here:
483          * destroy_conntrack() MUST NOT be called with a write lock
484          * to nf_conntrack_lock!!! -HW */
485         l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
486         if (l3proto && l3proto->destroy)
487                 l3proto->destroy(ct);
488
489         proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
490                                  ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
491         if (proto && proto->destroy)
492                 proto->destroy(ct);
493
494         if (nf_conntrack_destroyed)
495                 nf_conntrack_destroyed(ct);
496
497         write_lock_bh(&nf_conntrack_lock);
498         /* Expectations will have been removed in clean_from_lists,
499          * except TFTP can create an expectation on the first packet,
500          * before connection is in the list, so we need to clean here,
501          * too. */
502         remove_expectations(ct);
503
504         /* We overload first tuple to link into unconfirmed list. */
505         if (!nf_ct_is_confirmed(ct)) {
506                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
507                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
508         }
509
510         NF_CT_STAT_INC(delete);
511         write_unlock_bh(&nf_conntrack_lock);
512
513         if (ct->master)
514                 nf_ct_put(ct->master);
515
516         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
517         nf_conntrack_free(ct);
518 }
519
520 static void death_by_timeout(unsigned long ul_conntrack)
521 {
522         struct nf_conn *ct = (void *)ul_conntrack;
523
524         write_lock_bh(&nf_conntrack_lock);
525         /* Inside lock so preempt is disabled on module removal path.
526          * Otherwise we can get spurious warnings. */
527         NF_CT_STAT_INC(delete_list);
528         clean_from_lists(ct);
529         write_unlock_bh(&nf_conntrack_lock);
530         nf_ct_put(ct);
531 }
532
533 static inline int
534 conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
535                     const struct nf_conntrack_tuple *tuple,
536                     const struct nf_conn *ignored_conntrack)
537 {
538         ASSERT_READ_LOCK(&nf_conntrack_lock);
539         return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
540                 && nf_ct_tuple_equal(tuple, &i->tuple);
541 }
542
543 static struct nf_conntrack_tuple_hash *
544 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
545                     const struct nf_conn *ignored_conntrack)
546 {
547         struct nf_conntrack_tuple_hash *h;
548         unsigned int hash = hash_conntrack(tuple);
549
550         ASSERT_READ_LOCK(&nf_conntrack_lock);
551         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
552                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
553                         NF_CT_STAT_INC(found);
554                         return h;
555                 }
556                 NF_CT_STAT_INC(searched);
557         }
558
559         return NULL;
560 }
561
562 /* Find a connection corresponding to a tuple. */
563 struct nf_conntrack_tuple_hash *
564 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
565                       const struct nf_conn *ignored_conntrack)
566 {
567         struct nf_conntrack_tuple_hash *h;
568
569         read_lock_bh(&nf_conntrack_lock);
570         h = __nf_conntrack_find(tuple, ignored_conntrack);
571         if (h)
572                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
573         read_unlock_bh(&nf_conntrack_lock);
574
575         return h;
576 }
577
578 /* Confirm a connection given skb; places it in hash table */
579 int
580 __nf_conntrack_confirm(struct sk_buff **pskb)
581 {
582         unsigned int hash, repl_hash;
583         struct nf_conn *ct;
584         enum ip_conntrack_info ctinfo;
585
586         ct = nf_ct_get(*pskb, &ctinfo);
587
588         /* ipt_REJECT uses nf_conntrack_attach to attach related
589            ICMP/TCP RST packets in other direction.  Actual packet
590            which created connection will be IP_CT_NEW or for an
591            expected connection, IP_CT_RELATED. */
592         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
593                 return NF_ACCEPT;
594
595         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
596         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
597
598         /* We're not in hash table, and we refuse to set up related
599            connections for unconfirmed conns.  But packet copies and
600            REJECT will give spurious warnings here. */
601         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
602
603         /* No external references means no one else could have
604            confirmed us. */
605         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
606         DEBUGP("Confirming conntrack %p\n", ct);
607
608         write_lock_bh(&nf_conntrack_lock);
609
610         /* See if there's one in the list already, including reverse:
611            NAT could have grabbed it without realizing, since we're
612            not in the hash.  If there is, we lost the race. */
613         if (!LIST_FIND(&nf_conntrack_hash[hash],
614                        conntrack_tuple_cmp,
615                        struct nf_conntrack_tuple_hash *,
616                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
617             && !LIST_FIND(&nf_conntrack_hash[repl_hash],
618                           conntrack_tuple_cmp,
619                           struct nf_conntrack_tuple_hash *,
620                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
621                 /* Remove from unconfirmed list */
622                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
623
624                 list_prepend(&nf_conntrack_hash[hash],
625                              &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
626                 list_prepend(&nf_conntrack_hash[repl_hash],
627                              &ct->tuplehash[IP_CT_DIR_REPLY]);
628                 /* Timer relative to confirmation time, not original
629                    setting time, otherwise we'd get timer wrap in
630                    weird delay cases. */
631                 ct->timeout.expires += jiffies;
632                 add_timer(&ct->timeout);
633                 atomic_inc(&ct->ct_general.use);
634                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
635                 NF_CT_STAT_INC(insert);
636                 write_unlock_bh(&nf_conntrack_lock);
637                 if (ct->helper)
638                         nf_conntrack_event_cache(IPCT_HELPER, *pskb);
639 #ifdef CONFIG_NF_NAT_NEEDED
640                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
641                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
642                         nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
643 #endif
644                 nf_conntrack_event_cache(master_ct(ct) ?
645                                          IPCT_RELATED : IPCT_NEW, *pskb);
646                 return NF_ACCEPT;
647         }
648
649         NF_CT_STAT_INC(insert_failed);
650         write_unlock_bh(&nf_conntrack_lock);
651         return NF_DROP;
652 }
653
654 /* Returns true if a connection corresponding to the tuple exists
655    (required for NAT). */
656 int
657 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
658                          const struct nf_conn *ignored_conntrack)
659 {
660         struct nf_conntrack_tuple_hash *h;
661
662         read_lock_bh(&nf_conntrack_lock);
663         h = __nf_conntrack_find(tuple, ignored_conntrack);
664         read_unlock_bh(&nf_conntrack_lock);
665
666         return h != NULL;
667 }
668
669 /* There's a small race here where we may free a just-assured
670    connection.  Too bad: we're in trouble anyway. */
671 static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
672 {
673         return !(test_bit(IPS_ASSURED_BIT,
674                           &nf_ct_tuplehash_to_ctrack(i)->status));
675 }
676
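/* The table is full: try to evict the oldest not-yet-assured entry from
 * this hash chain.  Returns 1 if a conntrack was dropped. */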
677 static int early_drop(struct list_head *chain)
678 {
679         /* Traverse backwards: gives us oldest, which is roughly LRU */
680         struct nf_conntrack_tuple_hash *h;
681         struct nf_conn *ct = NULL;
682         int dropped = 0;
683
684         read_lock_bh(&nf_conntrack_lock);
685         h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
686         if (h) {
687                 ct = nf_ct_tuplehash_to_ctrack(h);
688                 atomic_inc(&ct->ct_general.use);
689         }
690         read_unlock_bh(&nf_conntrack_lock);
691
692         if (!ct)
693                 return dropped;
694
695         if (del_timer(&ct->timeout)) {
696                 death_by_timeout((unsigned long)ct);
697                 dropped = 1;
698                 NF_CT_STAT_INC(early_drop);
699         }
700         nf_ct_put(ct);
701         return dropped;
702 }
703
704 static inline int helper_cmp(const struct nf_conntrack_helper *i,
705                              const struct nf_conntrack_tuple *rtuple)
706 {
707         return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
708 }
709
710 static struct nf_conntrack_helper *
711 nf_ct_find_helper(const struct nf_conntrack_tuple *tuple)
712 {
713         return LIST_FIND(&helpers, helper_cmp,
714                          struct nf_conntrack_helper *,
715                          tuple);
716 }
717
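/* Allocate a conntrack from the slab cache matching the features needed
 * (helper support etc.), applying the early-drop policy when the table is
 * full.  Returns NULL or ERR_PTR(-ENOMEM) on failure. */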
718 static struct nf_conn *
719 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
720                      const struct nf_conntrack_tuple *repl,
721                      const struct nf_conntrack_l3proto *l3proto)
722 {
723         struct nf_conn *conntrack = NULL;
724         u_int32_t features = 0;
725
726         if (!nf_conntrack_hash_rnd_initted) {
727                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
728                 nf_conntrack_hash_rnd_initted = 1;
729         }
730
731         if (nf_conntrack_max
732             && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
733                 unsigned int hash = hash_conntrack(orig);
734                 /* Try dropping from this hash chain. */
735                 if (!early_drop(&nf_conntrack_hash[hash])) {
736                         if (net_ratelimit())
737                                 printk(KERN_WARNING
738                                        "nf_conntrack: table full, dropping"
739                                        " packet.\n");
740                         return ERR_PTR(-ENOMEM);
741                 }
742         }
743
744         /* Find features needed by this conntrack. */
745         features = l3proto->get_features(orig);
746         read_lock_bh(&nf_conntrack_lock);
747         if (nf_ct_find_helper(repl) != NULL)
748                 features |= NF_CT_F_HELP;
749         read_unlock_bh(&nf_conntrack_lock);
750
751         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
752
753         read_lock_bh(&nf_ct_cache_lock);
754
755         if (!nf_ct_cache[features].use) {
756                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
757                         features);
758                 goto out;
759         }
760
761         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
762         if (conntrack == NULL) {
763                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
764                 goto out;
765         }
766
767         memset(conntrack, 0, nf_ct_cache[features].size);
768         conntrack->features = features;
769         if (nf_ct_cache[features].init_conntrack &&
770             nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
771                 DEBUGP("nf_conntrack_alloc: failed to init\n");
772                 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
773                 conntrack = NULL;
774                 goto out;
775         }
776
777         atomic_set(&conntrack->ct_general.use, 1);
778         conntrack->ct_general.destroy = destroy_conntrack;
779         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
780         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
781         /* Don't set timer yet: wait for confirmation */
782         init_timer(&conntrack->timeout);
783         conntrack->timeout.data = (unsigned long)conntrack;
784         conntrack->timeout.function = death_by_timeout;
785
786         atomic_inc(&nf_conntrack_count);
787 out:
788         read_unlock_bh(&nf_ct_cache_lock);
789         return conntrack;
790 }
791
792 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
793                                    const struct nf_conntrack_tuple *repl)
794 {
795         struct nf_conntrack_l3proto *l3proto;
796
797         l3proto = nf_ct_find_l3proto(orig->src.l3num);
798         return __nf_conntrack_alloc(orig, repl, l3proto);
799 }
800
801 void nf_conntrack_free(struct nf_conn *conntrack)
802 {
803         u_int32_t features = conntrack->features;
804         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
805         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
806                conntrack);
807         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
808         atomic_dec(&nf_conntrack_count);
809 }
810
811 /* Allocate a new conntrack: we return -ENOMEM if classification
812    failed due to stress.  Otherwise it really is unclassifiable. */
813 static struct nf_conntrack_tuple_hash *
814 init_conntrack(const struct nf_conntrack_tuple *tuple,
815                struct nf_conntrack_l3proto *l3proto,
816                struct nf_conntrack_protocol *protocol,
817                struct sk_buff *skb,
818                unsigned int dataoff)
819 {
820         struct nf_conn *conntrack;
821         struct nf_conntrack_tuple repl_tuple;
822         struct nf_conntrack_expect *exp;
823
824         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
825                 DEBUGP("Can't invert tuple.\n");
826                 return NULL;
827         }
828
829         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
830         if (conntrack == NULL || IS_ERR(conntrack)) {
831                 DEBUGP("Can't allocate conntrack.\n");
832                 return (struct nf_conntrack_tuple_hash *)conntrack;
833         }
834
835         if (!protocol->new(conntrack, skb, dataoff)) {
836                 nf_conntrack_free(conntrack);
837                 DEBUGP("init conntrack: can't track with proto module\n");
838                 return NULL;
839         }
840
841         write_lock_bh(&nf_conntrack_lock);
842         exp = find_expectation(tuple);
843
844         if (exp) {
845                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
846                         conntrack, exp);
847                 /* Welcome, Mr. Bond.  We've been expecting you... */
848                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
849                 conntrack->master = exp->master;
850 #ifdef CONFIG_NF_CONNTRACK_MARK
851                 conntrack->mark = exp->master->mark;
852 #endif
853                 nf_conntrack_get(&conntrack->master->ct_general);
854                 NF_CT_STAT_INC(expect_new);
855         } else {
856                 conntrack->helper = nf_ct_find_helper(&repl_tuple);
857
858                 NF_CT_STAT_INC(new);
859         }
860
861         /* Overload tuple linked list to put us in unconfirmed list. */
862         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
863
864         write_unlock_bh(&nf_conntrack_lock);
865
866         if (exp) {
867                 if (exp->expectfn)
868                         exp->expectfn(conntrack, exp);
869                 nf_conntrack_expect_put(exp);
870         }
871
872         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
873 }
874
875 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
876 static inline struct nf_conn *
877 resolve_normal_ct(struct sk_buff *skb,
878                   unsigned int dataoff,
879                   u_int16_t l3num,
880                   u_int8_t protonum,
881                   struct nf_conntrack_l3proto *l3proto,
882                   struct nf_conntrack_protocol *proto,
883                   int *set_reply,
884                   enum ip_conntrack_info *ctinfo)
885 {
886         struct nf_conntrack_tuple tuple;
887         struct nf_conntrack_tuple_hash *h;
888         struct nf_conn *ct;
889
890         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
891                              dataoff, l3num, protonum, &tuple, l3proto,
892                              proto)) {
893                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
894                 return NULL;
895         }
896
897         /* look for tuple match */
898         h = nf_conntrack_find_get(&tuple, NULL);
899         if (!h) {
900                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
901                 if (!h)
902                         return NULL;
903                 if (IS_ERR(h))
904                         return (void *)h;
905         }
906         ct = nf_ct_tuplehash_to_ctrack(h);
907
908         /* It exists; we have (non-exclusive) reference. */
909         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
910                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
911                 /* Please set reply bit if this packet OK */
912                 *set_reply = 1;
913         } else {
914                 /* Once we've had two way comms, always ESTABLISHED. */
915                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
916                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
917                         *ctinfo = IP_CT_ESTABLISHED;
918                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
919                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
920                         *ctinfo = IP_CT_RELATED;
921                 } else {
922                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
923                         *ctinfo = IP_CT_NEW;
924                 }
925                 *set_reply = 0;
926         }
927         skb->nfct = &ct->ct_general;
928         skb->nfctinfo = *ctinfo;
929         return ct;
930 }
931
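/* Main conntrack hook: let the L3/L4 protocols parse the packet, look up
 * or create the conntrack, run the L4 state machine and return a netfilter
 * verdict. */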
932 unsigned int
933 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
934 {
935         struct nf_conn *ct;
936         enum ip_conntrack_info ctinfo;
937         struct nf_conntrack_l3proto *l3proto;
938         struct nf_conntrack_protocol *proto;
939         unsigned int dataoff;
940         u_int8_t protonum;
941         int set_reply = 0;
942         int ret;
943
944         /* Previously seen (loopback or untracked)?  Ignore. */
945         if ((*pskb)->nfct) {
946                 NF_CT_STAT_INC(ignore);
947                 return NF_ACCEPT;
948         }
949
950         l3proto = nf_ct_find_l3proto((u_int16_t)pf);
951         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
952                 DEBUGP("not prepared to track yet or error occurred\n");
953                 return -ret;
954         }
955
956         proto = nf_ct_find_proto((u_int16_t)pf, protonum);
957
958         /* It may be a special packet, error, unclean...
959          * the inverse of the return code tells the netfilter
960          * core what to do with the packet. */
961         if (proto->error != NULL &&
962             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
963                 NF_CT_STAT_INC(error);
964                 NF_CT_STAT_INC(invalid);
965                 return -ret;
966         }
967
968         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
969                                &set_reply, &ctinfo);
970         if (!ct) {
971                 /* Not valid part of a connection */
972                 NF_CT_STAT_INC(invalid);
973                 return NF_ACCEPT;
974         }
975
976         if (IS_ERR(ct)) {
977                 /* Too stressed to deal. */
978                 NF_CT_STAT_INC(drop);
979                 return NF_DROP;
980         }
981
982         NF_CT_ASSERT((*pskb)->nfct);
983
984         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
985         if (ret < 0) {
986                 /* Invalid: inverse of the return code tells
987                  * the netfilter core what to do */
988                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
989                 nf_conntrack_put((*pskb)->nfct);
990                 (*pskb)->nfct = NULL;
991                 NF_CT_STAT_INC(invalid);
992                 return -ret;
993         }
994
995         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
996                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
997
998         return ret;
999 }
1000
1001 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1002                          const struct nf_conntrack_tuple *orig)
1003 {
1004         return nf_ct_invert_tuple(inverse, orig,
1005                                   nf_ct_find_l3proto(orig->src.l3num),
1006                                   nf_ct_find_proto(orig->src.l3num,
1007                                                    orig->dst.protonum));
1008 }
1009
1010 /* Would two expected things clash? */
1011 static inline int expect_clash(const struct nf_conntrack_expect *a,
1012                                const struct nf_conntrack_expect *b)
1013 {
1014         /* Part covered by intersection of masks must be unequal,
1015            otherwise they clash */
1016         struct nf_conntrack_tuple intersect_mask;
1017         int count;
1018
1019         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1020         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1021         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1022         intersect_mask.dst.protonum = a->mask.dst.protonum
1023                                         & b->mask.dst.protonum;
1024
1025         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1026                 intersect_mask.src.u3.all[count] =
1027                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1028         }
1029
1030         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1031                 intersect_mask.dst.u3.all[count] =
1032                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1033         }
1034
1035         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1036 }
1037
1038 static inline int expect_matches(const struct nf_conntrack_expect *a,
1039                                  const struct nf_conntrack_expect *b)
1040 {
1041         return a->master == b->master
1042                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1043                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1044 }
1045
1046 /* Generally a bad idea to call this: could have matched already. */
1047 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1048 {
1049         struct nf_conntrack_expect *i;
1050
1051         write_lock_bh(&nf_conntrack_lock);
1052         /* choose the oldest expectation to evict */
1053         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1054                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1055                         nf_ct_unlink_expect(i);
1056                         write_unlock_bh(&nf_conntrack_lock);
1057                         nf_conntrack_expect_put(i);
1058                         return;
1059                 }
1060         }
1061         write_unlock_bh(&nf_conntrack_lock);
1062 }
1063
1064 /* We don't increase the master conntrack refcount for non-fulfilled
1065  * conntracks. During the conntrack destruction, the expectations are
1066  * always killed before the conntrack itself */
1067 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1068 {
1069         struct nf_conntrack_expect *new;
1070
1071         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1072         if (!new) {
1073                 DEBUGP("expect_related: OOM allocating expect\n");
1074                 return NULL;
1075         }
1076         new->master = me;
1077         atomic_set(&new->use, 1);
1078         return new;
1079 }
1080
1081 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1082 {
1083         if (atomic_dec_and_test(&exp->use))
1084                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1085 }
1086
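/* Link a new expectation into the global list, account it on the master
 * conntrack and start its timeout based on the master's helper.  Called
 * with nf_conntrack_lock held for writing. */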
1087 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1088 {
1089         atomic_inc(&exp->use);
1090         exp->master->expecting++;
1091         list_add(&exp->list, &nf_conntrack_expect_list);
1092
1093         init_timer(&exp->timeout);
1094         exp->timeout.data = (unsigned long)exp;
1095         exp->timeout.function = expectation_timed_out;
1096         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
1097         add_timer(&exp->timeout);
1098
1099         atomic_inc(&exp->use);
1100         NF_CT_STAT_INC(expect_create);
1101 }
1102
1103 /* Race with expectations being used means we could have none to find; OK. */
1104 static void evict_oldest_expect(struct nf_conn *master)
1105 {
1106         struct nf_conntrack_expect *i;
1107
1108         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1109                 if (i->master == master) {
1110                         if (del_timer(&i->timeout)) {
1111                                 nf_ct_unlink_expect(i);
1112                                 nf_conntrack_expect_put(i);
1113                         }
1114                         break;
1115                 }
1116         }
1117 }
1118
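/* Restart an existing expectation's timeout.  Returns 0 if the timer had
 * already fired, i.e. the expectation is dying. */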
1119 static inline int refresh_timer(struct nf_conntrack_expect *i)
1120 {
1121         if (!del_timer(&i->timeout))
1122                 return 0;
1123
1124         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1125         add_timer(&i->timeout);
1126         return 1;
1127 }
1128
1129 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1130 {
1131         struct nf_conntrack_expect *i;
1132         struct nf_conn *master = expect->master;
1133         int ret;
1134
1135         DEBUGP("nf_conntrack_expect_related %p\n", expect);
1136         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1137         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1138
1139         write_lock_bh(&nf_conntrack_lock);
1140         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1141                 if (expect_matches(i, expect)) {
1142                         /* Refresh timer: if it's dying, ignore.. */
1143                         if (refresh_timer(i)) {
1144                                 ret = 0;
1145                                 goto out;
1146                         }
1147                 } else if (expect_clash(i, expect)) {
1148                         ret = -EBUSY;
1149                         goto out;
1150                 }
1151         }
1152         /* Will be over limit? */
1153         if (master->helper->max_expected && 
1154             master->expecting >= master->helper->max_expected)
1155                 evict_oldest_expect(master);
1156
1157         nf_conntrack_expect_insert(expect);
1158         nf_conntrack_expect_event(IPEXP_NEW, expect);
1159         ret = 0;
1160 out:
1161         write_unlock_bh(&nf_conntrack_lock);
1162         return ret;
1163 }
1164
1165 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1166    implicitly racy: see __nf_conntrack_confirm */
1167 void nf_conntrack_alter_reply(struct nf_conn *conntrack,
1168                               const struct nf_conntrack_tuple *newreply)
1169 {
1170         write_lock_bh(&nf_conntrack_lock);
1171         /* Should be unconfirmed, so not in hash table yet */
1172         NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));
1173
1174         DEBUGP("Altering reply tuple of %p to ", conntrack);
1175         NF_CT_DUMP_TUPLE(newreply);
1176
1177         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1178         if (!conntrack->master && conntrack->expecting == 0)
1179                 conntrack->helper = nf_ct_find_helper(newreply);
1180         write_unlock_bh(&nf_conntrack_lock);
1181 }
1182
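/* Register a connection tracking helper: make sure the helper-capable
 * conntrack slab cache exists, then add the helper to the global list. */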
1183 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1184 {
1185         int ret;
1186         BUG_ON(me->timeout == 0);
1187
1188         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1189                                           sizeof(struct nf_conn)
1190                                           + sizeof(union nf_conntrack_help)
1191                                           + __alignof__(union nf_conntrack_help),
1192                                           init_conntrack_for_helper);
1193         if (ret < 0) {
1194                 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1195                 return ret;
1196         }
1197         write_lock_bh(&nf_conntrack_lock);
1198         list_prepend(&helpers, me);
1199         write_unlock_bh(&nf_conntrack_lock);
1200
1201         return 0;
1202 }
1203
1204 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1205                          const struct nf_conntrack_helper *me)
1206 {
1207         if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
1208                 nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
1209                 nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
1210         }
1211         return 0;
1212 }
1213
1214 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1215 {
1216         unsigned int i;
1217         struct nf_conntrack_expect *exp, *tmp;
1218
1219         /* Need write lock here, to delete helper. */
1220         write_lock_bh(&nf_conntrack_lock);
1221         LIST_DELETE(&helpers, me);
1222
1223         /* Get rid of expectations */
1224         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1225                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1226                         nf_ct_unlink_expect(exp);
1227                         nf_conntrack_expect_put(exp);
1228                 }
1229         }
1230
1231         /* Get rid of expecteds, set helpers to NULL. */
1232         LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1233         for (i = 0; i < nf_conntrack_htable_size; i++)
1234                 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1235                             struct nf_conntrack_tuple_hash *, me);
1236         write_unlock_bh(&nf_conntrack_lock);
1237
1238         /* Someone could be still looking at the helper in a bh. */
1239         synchronize_net();
1240 }
1241
1242 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1243 void __nf_ct_refresh_acct(struct nf_conn *ct,
1244                           enum ip_conntrack_info ctinfo,
1245                           const struct sk_buff *skb,
1246                           unsigned long extra_jiffies,
1247                           int do_acct)
1248 {
1249         int event = 0;
1250
1251         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1252         NF_CT_ASSERT(skb);
1253
1254         write_lock_bh(&nf_conntrack_lock);
1255
1256         /* If not in hash table, timer will not be active yet */
1257         if (!nf_ct_is_confirmed(ct)) {
1258                 ct->timeout.expires = extra_jiffies;
1259                 event = IPCT_REFRESH;
1260         } else {
1261                 /* Need del_timer for race avoidance (may already be dying). */
1262                 if (del_timer(&ct->timeout)) {
1263                         ct->timeout.expires = jiffies + extra_jiffies;
1264                         add_timer(&ct->timeout);
1265                         event = IPCT_REFRESH;
1266                 }
1267         }
1268
1269 #ifdef CONFIG_NF_CT_ACCT
1270         if (do_acct) {
1271                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1272                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1273                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
1274                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1275                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1276                         event |= IPCT_COUNTER_FILLING;
1277         }
1278 #endif
1279
1280         write_unlock_bh(&nf_conntrack_lock);
1281
1282         /* must be unlocked when calling event cache */
1283         if (event)
1284                 nf_conntrack_event_cache(event, skb);
1285 }
1286
1287 /* Used by ipt_REJECT and ip6t_REJECT. */
1288 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1289 {
1290         struct nf_conn *ct;
1291         enum ip_conntrack_info ctinfo;
1292
1293         /* This ICMP is in reverse direction to the packet which caused it */
1294         ct = nf_ct_get(skb, &ctinfo);
1295         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1296                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1297         else
1298                 ctinfo = IP_CT_RELATED;
1299
1300         /* Attach to new skbuff, and increment count */
1301         nskb->nfct = &ct->ct_general;
1302         nskb->nfctinfo = ctinfo;
1303         nf_conntrack_get(nskb->nfct);
1304 }
1305
1306 static inline int
1307 do_iter(const struct nf_conntrack_tuple_hash *i,
1308         int (*iter)(struct nf_conn *i, void *data),
1309         void *data)
1310 {
1311         return iter(nf_ct_tuplehash_to_ctrack(i), data);
1312 }
1313
1314 /* Bring out ya dead! */
1315 static struct nf_conntrack_tuple_hash *
1316 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1317                 void *data, unsigned int *bucket)
1318 {
1319         struct nf_conntrack_tuple_hash *h = NULL;
1320
1321         write_lock_bh(&nf_conntrack_lock);
1322         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1323                 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1324                                 struct nf_conntrack_tuple_hash *, iter, data);
1325                 if (h)
1326                         break;
1327         }
1328         if (!h)
1329                 h = LIST_FIND_W(&unconfirmed, do_iter,
1330                                 struct nf_conntrack_tuple_hash *, iter, data);
1331         if (h)
1332                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1333         write_unlock_bh(&nf_conntrack_lock);
1334
1335         return h;
1336 }
1337
1338 void
1339 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1340 {
1341         struct nf_conntrack_tuple_hash *h;
1342         unsigned int bucket = 0;
1343
1344         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1345                 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1346                 /* Time to push up daisies... */
1347                 if (del_timer(&ct->timeout))
1348                         death_by_timeout((unsigned long)ct);
1349                 /* ... else the timer will get him soon. */
1350
1351                 nf_ct_put(ct);
1352         }
1353 }
1354
1355 static int kill_all(struct nf_conn *i, void *data)
1356 {
1357         return 1;
1358 }
1359
1360 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1361 {
1362         if (vmalloced)
1363                 vfree(hash);
1364         else
1365                 free_pages((unsigned long)hash, 
1366                            get_order(sizeof(struct list_head) * size));
1367 }
1368
1369 /* Mishearing the voices in his head, our hero wonders how he's
1370    supposed to kill the mall. */
1371 void nf_conntrack_cleanup(void)
1372 {
1373         int i;
1374
1375         /* This makes sure all current packets have passed through
1376            the netfilter framework.  Roll on, two-stage module
1377            delete... */
1378         synchronize_net();
1379
1380         nf_ct_event_cache_flush();
1381  i_see_dead_people:
1382         nf_ct_iterate_cleanup(kill_all, NULL);
1383         if (atomic_read(&nf_conntrack_count) != 0) {
1384                 schedule();
1385                 goto i_see_dead_people;
1386         }
1387         /* wait until all references to nf_conntrack_untracked are dropped */
1388         while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1389                 schedule();
1390
1391         for (i = 0; i < NF_CT_F_NUM; i++) {
1392                 if (nf_ct_cache[i].use == 0)
1393                         continue;
1394
1395                 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1396                 nf_ct_cache[i].use = 1;
1397                 nf_conntrack_unregister_cache(i);
1398         }
1399         kmem_cache_destroy(nf_conntrack_expect_cachep);
1400         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1401                             nf_conntrack_htable_size);
1402
1403         /* free l3proto protocol tables */
1404         for (i = 0; i < PF_MAX; i++)
1405                 if (nf_ct_protos[i]) {
1406                         kfree(nf_ct_protos[i]);
1407                         nf_ct_protos[i] = NULL;
1408                 }
1409 }
1410
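/* Allocate the conntrack hash table, preferring contiguous pages and
 * falling back to vmalloc(); *vmalloced records which method was used so
 * the table can be freed correctly later. */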
1411 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1412 {
1413         struct list_head *hash;
1414         unsigned int i;
1415
1416         *vmalloced = 0; 
1417         hash = (void*)__get_free_pages(GFP_KERNEL, 
1418                                        get_order(sizeof(struct list_head)
1419                                                  * size));
1420         if (!hash) { 
1421                 *vmalloced = 1;
1422                 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1423                 hash = vmalloc(sizeof(struct list_head) * size);
1424         }
1425
1426         if (hash)
1427                 for (i = 0; i < size; i++) 
1428                         INIT_LIST_HEAD(&hash[i]);
1429
1430         return hash;
1431 }
1432
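/* Module parameter handler for "hashsize": resize the hash table at
 * runtime by allocating a new table, rehashing every entry with a fresh
 * random seed under the write lock, and freeing the old table. */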
1433 int set_hashsize(const char *val, struct kernel_param *kp)
1434 {
1435         int i, bucket, hashsize, vmalloced;
1436         int old_vmalloced, old_size;
1437         int rnd;
1438         struct list_head *hash, *old_hash;
1439         struct nf_conntrack_tuple_hash *h;
1440
1441         /* On boot, we can set this without any fancy locking. */
1442         if (!nf_conntrack_htable_size)
1443                 return param_set_uint(val, kp);
1444
1445         hashsize = simple_strtol(val, NULL, 0);
1446         if (!hashsize)
1447                 return -EINVAL;
1448
1449         hash = alloc_hashtable(hashsize, &vmalloced);
1450         if (!hash)
1451                 return -ENOMEM;
1452
1453         /* We have to rehash for the new table anyway, so we can also
1454          * use a new random seed */
1455         get_random_bytes(&rnd, 4);
1456
1457         write_lock_bh(&nf_conntrack_lock);
1458         for (i = 0; i < nf_conntrack_htable_size; i++) {
1459                 while (!list_empty(&nf_conntrack_hash[i])) {
1460                         h = list_entry(nf_conntrack_hash[i].next,
1461                                        struct nf_conntrack_tuple_hash, list);
1462                         list_del(&h->list);
1463                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1464                         list_add_tail(&h->list, &hash[bucket]);
1465                 }
1466         }
1467         old_size = nf_conntrack_htable_size;
1468         old_vmalloced = nf_conntrack_vmalloc;
1469         old_hash = nf_conntrack_hash;
1470
1471         nf_conntrack_htable_size = hashsize;
1472         nf_conntrack_vmalloc = vmalloced;
1473         nf_conntrack_hash = hash;
1474         nf_conntrack_hash_rnd = rnd;
1475         write_unlock_bh(&nf_conntrack_lock);
1476
1477         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1478         return 0;
1479 }
1480
1481 module_param_call(hashsize, set_hashsize, param_get_uint,
1482                   &nf_conntrack_htable_size, 0600);
1483
1484 int __init nf_conntrack_init(void)
1485 {
1486         unsigned int i;
1487         int ret;
1488
1489         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1490          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1491         if (!nf_conntrack_htable_size) {
1492                 nf_conntrack_htable_size
1493                         = (((num_physpages << PAGE_SHIFT) / 16384)
1494                            / sizeof(struct list_head));
1495                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1496                         nf_conntrack_htable_size = 8192;
1497                 if (nf_conntrack_htable_size < 16)
1498                         nf_conntrack_htable_size = 16;
1499         }
1500         nf_conntrack_max = 8 * nf_conntrack_htable_size;
1501
1502         printk("nf_conntrack version %s (%u buckets, %d max)\n",
1503                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1504                nf_conntrack_max);
1505
1506         nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1507                                             &nf_conntrack_vmalloc);
1508         if (!nf_conntrack_hash) {
1509                 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1510                 goto err_out;
1511         }
1512
1513         ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1514                                           sizeof(struct nf_conn), NULL);
1515         if (ret < 0) {
1516                 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1517                 goto err_free_hash;
1518         }
1519
1520         nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1521                                         sizeof(struct nf_conntrack_expect),
1522                                         0, 0, NULL, NULL);
1523         if (!nf_conntrack_expect_cachep) {
1524                 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1525                 goto err_free_conntrack_slab;
1526         }
1527
1528         /* Don't NEED lock here, but good form anyway. */
1529         write_lock_bh(&nf_conntrack_lock);
1530         for (i = 0; i < PF_MAX; i++)
1531                 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1532         write_unlock_bh(&nf_conntrack_lock);
1533
1534         /* Set up fake conntrack:
1535             - to never be deleted, not in any hashes */
1536         atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1537         /*  - and make it look like a confirmed connection */
1538         set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1539
1540         return ret;
1541
1542 err_free_conntrack_slab:
1543         nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1544 err_free_hash:
1545         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1546                             nf_conntrack_htable_size);
1547 err_out:
1548         return -ENOMEM;
1549 }