1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expectation
43    registrations, and the conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89  * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95                 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96                                     ecache->ct);
97         ecache->events = 0;
98         ip_conntrack_put(ecache->ct);
99         ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103  * by code prior to async packet handling or freeing the skb */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106         struct ip_conntrack_ecache *ecache;
107         
108         local_bh_disable();
109         ecache = &__get_cpu_var(ip_conntrack_ecache);
110         if (ecache->ct == ct)
111                 __ip_ct_deliver_cached_events(ecache);
112         local_bh_enable();
113 }
114
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117         struct ip_conntrack_ecache *ecache;
118
119         /* take care of delivering potentially old events */
120         ecache = &__get_cpu_var(ip_conntrack_ecache);
121         BUG_ON(ecache->ct == ct);
122         if (ecache->ct)
123                 __ip_ct_deliver_cached_events(ecache);
124         /* initialize for this conntrack/packet */
125         ecache->ct = ct;
126         nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called
130  * while packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133         struct ip_conntrack_ecache *ecache;
134         int cpu;
135
136         for_each_cpu(cpu) {
137                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138                 if (ecache->ct)
139                         ip_conntrack_put(ecache->ct);
140         }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
151 static u_int32_t
152 hash_conntrack(const struct ip_conntrack_tuple *tuple)
153 {
154 #if 0
155         dump_tuple(tuple);
156 #endif
157         return (jhash_3words(tuple->src.ip,
158                              (tuple->dst.ip ^ tuple->dst.protonum),
159                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
160                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
161 }
162
163 int
164 ip_ct_get_tuple(const struct iphdr *iph,
165                 const struct sk_buff *skb,
166                 unsigned int dataoff,
167                 struct ip_conntrack_tuple *tuple,
168                 const struct ip_conntrack_protocol *protocol)
169 {
170         /* This should never happen. */
171         if (iph->frag_off & htons(IP_OFFSET)) {
172                 printk("ip_conntrack_core: Frag of proto %u.\n",
173                        iph->protocol);
174                 return 0;
175         }
176
177         tuple->src.ip = iph->saddr;
178         tuple->dst.ip = iph->daddr;
179         tuple->dst.protonum = iph->protocol;
180         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181
182         return protocol->pkt_to_tuple(skb, dataoff, tuple);
183 }
184
185 int
186 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
187                    const struct ip_conntrack_tuple *orig,
188                    const struct ip_conntrack_protocol *protocol)
189 {
190         inverse->src.ip = orig->dst.ip;
191         inverse->dst.ip = orig->src.ip;
192         inverse->dst.protonum = orig->dst.protonum;
193         inverse->dst.dir = !orig->dst.dir;
194
195         return protocol->invert_tuple(inverse, orig);
196 }
197
198
199 /* ip_conntrack_expect helper functions */
200 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
201 {
202         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
203         IP_NF_ASSERT(!timer_pending(&exp->timeout));
204         list_del(&exp->list);
205         CONNTRACK_STAT_INC(expect_delete);
206         exp->master->expecting--;
207         ip_conntrack_expect_put(exp);
208 }
209
210 static void expectation_timed_out(unsigned long ul_expect)
211 {
212         struct ip_conntrack_expect *exp = (void *)ul_expect;
213
214         write_lock_bh(&ip_conntrack_lock);
215         ip_ct_unlink_expect(exp);
216         write_unlock_bh(&ip_conntrack_lock);
217         ip_conntrack_expect_put(exp);
218 }
219
220 struct ip_conntrack_expect *
221 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
222 {
223         struct ip_conntrack_expect *i;
224         
225         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
226                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
227                         atomic_inc(&i->use);
228                         return i;
229                 }
230         }
231         return NULL;
232 }
233
234 /* Just find an expectation corresponding to a tuple. */
235 struct ip_conntrack_expect *
236 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
237 {
238         struct ip_conntrack_expect *i;
239         
240         read_lock_bh(&ip_conntrack_lock);
241         i = __ip_conntrack_expect_find(tuple);
242         read_unlock_bh(&ip_conntrack_lock);
243
244         return i;
245 }
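
/* Usage sketch (not built): ip_conntrack_expect_find() returns the
 * expectation with an extra reference held, so every successful lookup
 * must be paired with ip_conntrack_expect_put().  The caller-supplied
 * tuple below is a hypothetical example, not taken from a real user. */
#if 0
static void example_expect_lookup(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_expect *exp;

        exp = ip_conntrack_expect_find(tuple);
        if (!exp)
                return;
        /* ... inspect exp->tuple, exp->master, ... */
        ip_conntrack_expect_put(exp);
}
#endif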
246
247 /* If an expectation for this connection is found, it is deleted from the
248  * global list and returned. */
249 static struct ip_conntrack_expect *
250 find_expectation(const struct ip_conntrack_tuple *tuple)
251 {
252         struct ip_conntrack_expect *i;
253
254         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
255                 /* If master is not in hash table yet (ie. packet hasn't left
256                    this machine yet), how can other end know about expected?
257                    Hence these are not the droids you are looking for (if
258                    master ct never got confirmed, we'd hold a reference to it
259                    and weird things would happen to future packets). */
260                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
261                     && is_confirmed(i->master)) {
262                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
263                                 atomic_inc(&i->use);
264                                 return i;
265                         } else if (del_timer(&i->timeout)) {
266                                 ip_ct_unlink_expect(i);
267                                 return i;
268                         }
269                 }
270         }
271         return NULL;
272 }
273
274 /* delete all expectations for this conntrack */
275 void ip_ct_remove_expectations(struct ip_conntrack *ct)
276 {
277         struct ip_conntrack_expect *i, *tmp;
278
279         /* Optimization: most connections never expect any others. */
280         if (ct->expecting == 0)
281                 return;
282
283         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
284                 if (i->master == ct && del_timer(&i->timeout)) {
285                         ip_ct_unlink_expect(i);
286                         ip_conntrack_expect_put(i);
287                 }
288         }
289 }
290
291 static void
292 clean_from_lists(struct ip_conntrack *ct)
293 {
294         unsigned int ho, hr;
295         
296         DEBUGP("clean_from_lists(%p)\n", ct);
297         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
298
299         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
300         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
301         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
302         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
303
304         /* Destroy all pending expectations */
305         ip_ct_remove_expectations(ct);
306 }
307
308 static void
309 destroy_conntrack(struct nf_conntrack *nfct)
310 {
311         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
312         struct ip_conntrack_protocol *proto;
313
314         DEBUGP("destroy_conntrack(%p)\n", ct);
315         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
316         IP_NF_ASSERT(!timer_pending(&ct->timeout));
317
318         ip_conntrack_event(IPCT_DESTROY, ct);
319         set_bit(IPS_DYING_BIT, &ct->status);
320
321         /* To make sure we don't get any weird locking issues here:
322          * destroy_conntrack() MUST NOT be called with a write lock
323          * to ip_conntrack_lock!!! -HW */
324         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
325         if (proto && proto->destroy)
326                 proto->destroy(ct);
327
328         if (ip_conntrack_destroyed)
329                 ip_conntrack_destroyed(ct);
330
331         write_lock_bh(&ip_conntrack_lock);
332         /* Expectations will have been removed in clean_from_lists,
333          * except that TFTP can create an expectation on the first packet,
334          * before the connection is in the list, so we need to clean here,
335          * too. */
336         ip_ct_remove_expectations(ct);
337
338         /* We overload first tuple to link into unconfirmed list. */
339         if (!is_confirmed(ct)) {
340                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
341                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
342         }
343
344         CONNTRACK_STAT_INC(delete);
345         write_unlock_bh(&ip_conntrack_lock);
346
347         if (ct->master)
348                 ip_conntrack_put(ct->master);
349
350         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
351         ip_conntrack_free(ct);
352 }
353
354 static void death_by_timeout(unsigned long ul_conntrack)
355 {
356         struct ip_conntrack *ct = (void *)ul_conntrack;
357
358         write_lock_bh(&ip_conntrack_lock);
359         /* Inside lock so preempt is disabled on module removal path.
360          * Otherwise we can get spurious warnings. */
361         CONNTRACK_STAT_INC(delete_list);
362         clean_from_lists(ct);
363         write_unlock_bh(&ip_conntrack_lock);
364         ip_conntrack_put(ct);
365 }
366
367 static inline int
368 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
369                     const struct ip_conntrack_tuple *tuple,
370                     const struct ip_conntrack *ignored_conntrack)
371 {
372         ASSERT_READ_LOCK(&ip_conntrack_lock);
373         return tuplehash_to_ctrack(i) != ignored_conntrack
374                 && ip_ct_tuple_equal(tuple, &i->tuple);
375 }
376
377 struct ip_conntrack_tuple_hash *
378 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
379                     const struct ip_conntrack *ignored_conntrack)
380 {
381         struct ip_conntrack_tuple_hash *h;
382         unsigned int hash = hash_conntrack(tuple);
383
384         ASSERT_READ_LOCK(&ip_conntrack_lock);
385         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
386                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
387                         CONNTRACK_STAT_INC(found);
388                         return h;
389                 }
390                 CONNTRACK_STAT_INC(searched);
391         }
392
393         return NULL;
394 }
395
396 /* Find a connection corresponding to a tuple. */
397 struct ip_conntrack_tuple_hash *
398 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
399                       const struct ip_conntrack *ignored_conntrack)
400 {
401         struct ip_conntrack_tuple_hash *h;
402
403         read_lock_bh(&ip_conntrack_lock);
404         h = __ip_conntrack_find(tuple, ignored_conntrack);
405         if (h)
406                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
407         read_unlock_bh(&ip_conntrack_lock);
408
409         return h;
410 }
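
/* Usage sketch (not built): the find_get/put pattern for conntrack
 * lookups.  ip_conntrack_find_get() takes a reference on the found
 * entry; tuplehash_to_ctrack() recovers the conntrack, and the
 * reference must be dropped again with ip_conntrack_put().  The tuple
 * is assumed to have been filled in by the caller. */
#if 0
static void example_conntrack_lookup(const struct ip_conntrack_tuple *tuple)
{
        struct ip_conntrack_tuple_hash *h;
        struct ip_conntrack *ct;

        h = ip_conntrack_find_get(tuple, NULL);
        if (!h)
                return;
        ct = tuplehash_to_ctrack(h);
        /* ... read ct->status, ct->timeout, counters, ... */
        ip_conntrack_put(ct);
}
#endif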
411
412 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
413                                         unsigned int hash,
414                                         unsigned int repl_hash) 
415 {
416         ct->id = ++ip_conntrack_next_id;
417         list_prepend(&ip_conntrack_hash[hash],
418                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
419         list_prepend(&ip_conntrack_hash[repl_hash],
420                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
421 }
422
423 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
424 {
425         unsigned int hash, repl_hash;
426
427         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
428         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
429
430         write_lock_bh(&ip_conntrack_lock);
431         __ip_conntrack_hash_insert(ct, hash, repl_hash);
432         write_unlock_bh(&ip_conntrack_lock);
433 }
434
435 /* Confirm a connection given skb; places it in hash table */
436 int
437 __ip_conntrack_confirm(struct sk_buff **pskb)
438 {
439         unsigned int hash, repl_hash;
440         struct ip_conntrack *ct;
441         enum ip_conntrack_info ctinfo;
442
443         ct = ip_conntrack_get(*pskb, &ctinfo);
444
445         /* ipt_REJECT uses ip_conntrack_attach to attach related
446            ICMP/TCP RST packets in other direction.  Actual packet
447            which created connection will be IP_CT_NEW or for an
448            expected connection, IP_CT_RELATED. */
449         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
450                 return NF_ACCEPT;
451
452         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
453         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
454
455         /* We're not in hash table, and we refuse to set up related
456            connections for unconfirmed conns.  But packet copies and
457            REJECT will give spurious warnings here. */
458         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
459
460         /* No external references means no one else could have
461            confirmed us. */
462         IP_NF_ASSERT(!is_confirmed(ct));
463         DEBUGP("Confirming conntrack %p\n", ct);
464
465         write_lock_bh(&ip_conntrack_lock);
466
467         /* See if there's one in the list already, including reverse:
468            NAT could have grabbed it without realizing, since we're
469            not in the hash.  If there is, we lost the race. */
470         if (!LIST_FIND(&ip_conntrack_hash[hash],
471                        conntrack_tuple_cmp,
472                        struct ip_conntrack_tuple_hash *,
473                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
474             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
475                           conntrack_tuple_cmp,
476                           struct ip_conntrack_tuple_hash *,
477                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
478                 /* Remove from unconfirmed list */
479                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
480
481                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
482                 /* Timer relative to confirmation time, not original
483                    setting time, otherwise we'd get timer wrap in
484                    weird delay cases. */
485                 ct->timeout.expires += jiffies;
486                 add_timer(&ct->timeout);
487                 atomic_inc(&ct->ct_general.use);
488                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
489                 CONNTRACK_STAT_INC(insert);
490                 write_unlock_bh(&ip_conntrack_lock);
491                 if (ct->helper)
492                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
493 #ifdef CONFIG_IP_NF_NAT_NEEDED
494                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
495                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
496                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
497 #endif
498                 ip_conntrack_event_cache(master_ct(ct) ?
499                                          IPCT_RELATED : IPCT_NEW, *pskb);
500
501                 return NF_ACCEPT;
502         }
503
504         CONNTRACK_STAT_INC(insert_failed);
505         write_unlock_bh(&ip_conntrack_lock);
506
507         return NF_DROP;
508 }
509
510 /* Returns true if a connection corresponds to the tuple (required
511    for NAT). */
512 int
513 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
514                          const struct ip_conntrack *ignored_conntrack)
515 {
516         struct ip_conntrack_tuple_hash *h;
517
518         read_lock_bh(&ip_conntrack_lock);
519         h = __ip_conntrack_find(tuple, ignored_conntrack);
520         read_unlock_bh(&ip_conntrack_lock);
521
522         return h != NULL;
523 }
524
525 /* There's a small race here where we may free a just-assured
526    connection.  Too bad: we're in trouble anyway. */
527 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
528 {
529         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
530 }
531
532 static int early_drop(struct list_head *chain)
533 {
534         /* Traverse backwards: gives us oldest, which is roughly LRU */
535         struct ip_conntrack_tuple_hash *h;
536         struct ip_conntrack *ct = NULL;
537         int dropped = 0;
538
539         read_lock_bh(&ip_conntrack_lock);
540         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
541         if (h) {
542                 ct = tuplehash_to_ctrack(h);
543                 atomic_inc(&ct->ct_general.use);
544         }
545         read_unlock_bh(&ip_conntrack_lock);
546
547         if (!ct)
548                 return dropped;
549
550         if (del_timer(&ct->timeout)) {
551                 death_by_timeout((unsigned long)ct);
552                 dropped = 1;
553                 CONNTRACK_STAT_INC(early_drop);
554         }
555         ip_conntrack_put(ct);
556         return dropped;
557 }
558
559 static inline int helper_cmp(const struct ip_conntrack_helper *i,
560                              const struct ip_conntrack_tuple *rtuple)
561 {
562         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
563 }
564
565 static struct ip_conntrack_helper *
566 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
567 {
568         return LIST_FIND(&helpers, helper_cmp,
569                          struct ip_conntrack_helper *,
570                          tuple);
571 }
572
573 struct ip_conntrack_helper *
574 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
575 {
576         struct ip_conntrack_helper *helper;
577
578         /* need ip_conntrack_lock to ensure that the helper exists until
579          * try_module_get() is called */
580         read_lock_bh(&ip_conntrack_lock);
581
582         helper = __ip_conntrack_helper_find(tuple);
583         if (helper) {
584                 /* need to increase the module usage count to ensure the
585                  * helper will not go away while the caller is e.g. busy
586                  * putting a conntrack in the hash that uses the helper */
587                 if (!try_module_get(helper->me))
588                         helper = NULL;
589         }
590
591         read_unlock_bh(&ip_conntrack_lock);
592
593         return helper;
594 }
595
596 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
597 {
598         module_put(helper->me);
599 }
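
/* Usage sketch (not built): helper lookups follow the same get/put
 * discipline.  A caller matches a reply tuple against the registered
 * helpers and must drop the module reference with
 * ip_conntrack_helper_put() when done.  Purely illustrative. */
#if 0
static void example_helper_lookup(const struct ip_conntrack_tuple *repl_tuple)
{
        struct ip_conntrack_helper *helper;

        helper = ip_conntrack_helper_find_get(repl_tuple);
        if (!helper)
                return;
        /* ... e.g. attach the helper to a conntrack ... */
        ip_conntrack_helper_put(helper);
}
#endif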
600
601 struct ip_conntrack_protocol *
602 __ip_conntrack_proto_find(u_int8_t protocol)
603 {
604         return ip_ct_protos[protocol];
605 }
606
607 /* this is guaranteed to always return a valid protocol helper, since
608  * it falls back to generic_protocol */
609 struct ip_conntrack_protocol *
610 ip_conntrack_proto_find_get(u_int8_t protocol)
611 {
612         struct ip_conntrack_protocol *p;
613
614         preempt_disable();
615         p = __ip_conntrack_proto_find(protocol);
616         if (p) {
617                 if (!try_module_get(p->me))
618                         p = &ip_conntrack_generic_protocol;
619         }
620         preempt_enable();
621         
622         return p;
623 }
624
625 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
626 {
627         module_put(p->me);
628 }
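
/* Usage sketch (not built): because ip_conntrack_proto_find_get() falls
 * back to ip_conntrack_generic_protocol, the caller never needs a NULL
 * check, only the matching ip_conntrack_proto_put().  Illustrative only. */
#if 0
static void example_proto_lookup(u_int8_t protonum)
{
        struct ip_conntrack_protocol *proto;

        proto = ip_conntrack_proto_find_get(protonum);
        /* proto is never NULL: worst case it is the generic protocol */
        /* ... use proto->packet(), proto->error(), ... */
        ip_conntrack_proto_put(proto);
}
#endif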
629
630 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
631                                         struct ip_conntrack_tuple *repl)
632 {
633         struct ip_conntrack *conntrack;
634
635         if (!ip_conntrack_hash_rnd_initted) {
636                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
637                 ip_conntrack_hash_rnd_initted = 1;
638         }
639
640         if (ip_conntrack_max
641             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
642                 unsigned int hash = hash_conntrack(orig);
643                 /* Try dropping from this hash chain. */
644                 if (!early_drop(&ip_conntrack_hash[hash])) {
645                         if (net_ratelimit())
646                                 printk(KERN_WARNING
647                                        "ip_conntrack: table full, dropping"
648                                        " packet.\n");
649                         return ERR_PTR(-ENOMEM);
650                 }
651         }
652
653         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
654         if (!conntrack) {
655                 DEBUGP("Can't allocate conntrack.\n");
656                 return ERR_PTR(-ENOMEM);
657         }
658
659         memset(conntrack, 0, sizeof(*conntrack));
660         atomic_set(&conntrack->ct_general.use, 1);
661         conntrack->ct_general.destroy = destroy_conntrack;
662         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
663         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
664         /* Don't set timer yet: wait for confirmation */
665         init_timer(&conntrack->timeout);
666         conntrack->timeout.data = (unsigned long)conntrack;
667         conntrack->timeout.function = death_by_timeout;
668
669         atomic_inc(&ip_conntrack_count);
670
671         return conntrack;
672 }
673
674 void
675 ip_conntrack_free(struct ip_conntrack *conntrack)
676 {
677         atomic_dec(&ip_conntrack_count);
678         kmem_cache_free(ip_conntrack_cachep, conntrack);
679 }
680
681 /* Allocate a new conntrack: we return -ENOMEM if classification
682  * failed due to stress.   Otherwise it really is unclassifiable */
683 static struct ip_conntrack_tuple_hash *
684 init_conntrack(struct ip_conntrack_tuple *tuple,
685                struct ip_conntrack_protocol *protocol,
686                struct sk_buff *skb)
687 {
688         struct ip_conntrack *conntrack;
689         struct ip_conntrack_tuple repl_tuple;
690         struct ip_conntrack_expect *exp;
691
692         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
693                 DEBUGP("Can't invert tuple.\n");
694                 return NULL;
695         }
696
697         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
698         if (conntrack == NULL || IS_ERR(conntrack))
699                 return (struct ip_conntrack_tuple_hash *)conntrack;
700
701         if (!protocol->new(conntrack, skb)) {
702                 ip_conntrack_free(conntrack);
703                 return NULL;
704         }
705
706         write_lock_bh(&ip_conntrack_lock);
707         exp = find_expectation(tuple);
708
709         if (exp) {
710                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
711                         conntrack, exp);
712                 /* Welcome, Mr. Bond.  We've been expecting you... */
713                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
714                 conntrack->master = exp->master;
715 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
716                 conntrack->mark = exp->master->mark;
717 #endif
718 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
719     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
720                 /* this is ugly, but there is no other place to put it */
721                 conntrack->nat.masq_index = exp->master->nat.masq_index;
722 #endif
723                 nf_conntrack_get(&conntrack->master->ct_general);
724                 CONNTRACK_STAT_INC(expect_new);
725         } else {
726                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
727
728                 CONNTRACK_STAT_INC(new);
729         }
730
731         /* Overload tuple linked list to put us in unconfirmed list. */
732         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
733
734         write_unlock_bh(&ip_conntrack_lock);
735
736         if (exp) {
737                 if (exp->expectfn)
738                         exp->expectfn(conntrack, exp);
739                 ip_conntrack_expect_put(exp);
740         }
741
742         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
743 }
744
745 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
746 static inline struct ip_conntrack *
747 resolve_normal_ct(struct sk_buff *skb,
748                   struct ip_conntrack_protocol *proto,
749                   int *set_reply,
750                   unsigned int hooknum,
751                   enum ip_conntrack_info *ctinfo)
752 {
753         struct ip_conntrack_tuple tuple;
754         struct ip_conntrack_tuple_hash *h;
755         struct ip_conntrack *ct;
756
757         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
758
759         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
760                                 &tuple,proto))
761                 return NULL;
762
763         /* look for tuple match */
764         h = ip_conntrack_find_get(&tuple, NULL);
765         if (!h) {
766                 h = init_conntrack(&tuple, proto, skb);
767                 if (!h)
768                         return NULL;
769                 if (IS_ERR(h))
770                         return (void *)h;
771         }
772         ct = tuplehash_to_ctrack(h);
773
774         /* It exists; we have (non-exclusive) reference. */
775         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
776                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
777                 /* Please set the reply bit if this packet is OK */
778                 *set_reply = 1;
779         } else {
780                 /* Once we've had two way comms, always ESTABLISHED. */
781                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
782                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
783                                ct);
784                         *ctinfo = IP_CT_ESTABLISHED;
785                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
786                         DEBUGP("ip_conntrack_in: related packet for %p\n",
787                                ct);
788                         *ctinfo = IP_CT_RELATED;
789                 } else {
790                         DEBUGP("ip_conntrack_in: new packet for %p\n",
791                                ct);
792                         *ctinfo = IP_CT_NEW;
793                 }
794                 *set_reply = 0;
795         }
796         skb->nfct = &ct->ct_general;
797         skb->nfctinfo = *ctinfo;
798         return ct;
799 }
800
801 /* Netfilter hook itself. */
802 unsigned int ip_conntrack_in(unsigned int hooknum,
803                              struct sk_buff **pskb,
804                              const struct net_device *in,
805                              const struct net_device *out,
806                              int (*okfn)(struct sk_buff *))
807 {
808         struct ip_conntrack *ct;
809         enum ip_conntrack_info ctinfo;
810         struct ip_conntrack_protocol *proto;
811         int set_reply = 0;
812         int ret;
813
814         /* Previously seen (loopback or untracked)?  Ignore. */
815         if ((*pskb)->nfct) {
816                 CONNTRACK_STAT_INC(ignore);
817                 return NF_ACCEPT;
818         }
819
820         /* This should never happen. */
821         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
822                 if (net_ratelimit()) {
823                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
824                                (*pskb)->nh.iph->protocol, hooknum);
825                 }
826                 return NF_DROP;
827         }
828
829 /* Doesn't cover locally-generated broadcast, so not worth it. */
830 #if 0
831         /* Ignore broadcast: no `connection'. */
832         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
833                 printk("Broadcast packet!\n");
834                 return NF_ACCEPT;
835         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
836                    == htonl(0x000000FF)) {
837                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
838                        NIPQUAD((*pskb)->nh.iph->saddr),
839                        NIPQUAD((*pskb)->nh.iph->daddr),
840                        (*pskb)->sk, (*pskb)->pkt_type);
841         }
842 #endif
843
844         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
845
846         /* It may be a special packet: error, unclean, etc.  The
847          * inverse of the return code tells the netfilter
848          * core what to do with the packet. */
849         if (proto->error != NULL 
850             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
851                 CONNTRACK_STAT_INC(error);
852                 CONNTRACK_STAT_INC(invalid);
853                 return -ret;
854         }
855
856         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
857                 /* Not valid part of a connection */
858                 CONNTRACK_STAT_INC(invalid);
859                 return NF_ACCEPT;
860         }
861
862         if (IS_ERR(ct)) {
863                 /* Too stressed to deal. */
864                 CONNTRACK_STAT_INC(drop);
865                 return NF_DROP;
866         }
867
868         IP_NF_ASSERT((*pskb)->nfct);
869
870         ret = proto->packet(ct, *pskb, ctinfo);
871         if (ret < 0) {
872                 /* Invalid: inverse of the return code tells
873                  * the netfilter core what to do */
874                 nf_conntrack_put((*pskb)->nfct);
875                 (*pskb)->nfct = NULL;
876                 CONNTRACK_STAT_INC(invalid);
877                 return -ret;
878         }
879
880         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
881                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
882
883         return ret;
884 }
885
886 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
887                    const struct ip_conntrack_tuple *orig)
888 {
889         return ip_ct_invert_tuple(inverse, orig, 
890                                   __ip_conntrack_proto_find(orig->dst.protonum));
891 }
892
893 /* Would two expected things clash? */
894 static inline int expect_clash(const struct ip_conntrack_expect *a,
895                                const struct ip_conntrack_expect *b)
896 {
897         /* Part covered by intersection of masks must be unequal,
898            otherwise they clash */
899         struct ip_conntrack_tuple intersect_mask
900                 = { { a->mask.src.ip & b->mask.src.ip,
901                       { a->mask.src.u.all & b->mask.src.u.all } },
902                     { a->mask.dst.ip & b->mask.dst.ip,
903                       { a->mask.dst.u.all & b->mask.dst.u.all },
904                       a->mask.dst.protonum & b->mask.dst.protonum } };
905
906         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
907 }
908
909 static inline int expect_matches(const struct ip_conntrack_expect *a,
910                                  const struct ip_conntrack_expect *b)
911 {
912         return a->master == b->master
913                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
914                 && ip_ct_tuple_equal(&a->mask, &b->mask);
915 }
916
917 /* Generally a bad idea to call this: could have matched already. */
918 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
919 {
920         struct ip_conntrack_expect *i;
921
922         write_lock_bh(&ip_conntrack_lock);
923         /* choose the oldest expectation to evict */
924         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
925                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
926                         ip_ct_unlink_expect(i);
927                         write_unlock_bh(&ip_conntrack_lock);
928                         ip_conntrack_expect_put(i);
929                         return;
930                 }
931         }
932         write_unlock_bh(&ip_conntrack_lock);
933 }
934
935 /* We don't increase the master conntrack refcount for non-fulfilled
936  * conntracks. During conntrack destruction, the expectations are
937  * always killed before the conntrack itself. */
938 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
939 {
940         struct ip_conntrack_expect *new;
941
942         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
943         if (!new) {
944                 DEBUGP("expect_related: OOM allocating expect\n");
945                 return NULL;
946         }
947         new->master = me;
948         atomic_set(&new->use, 1);
949         return new;
950 }
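
/* Registration sketch (not built): the expectation lifecycle a helper
 * module would follow, inferred from the API in this file: allocate,
 * fill in tuple and mask, hand the expectation to
 * ip_conntrack_expect_related(), then drop the allocation reference
 * with ip_conntrack_expect_put().  The peer address/port and the TCP
 * choice are placeholders, not taken from a real helper. */
#if 0
static int example_expect_related(struct ip_conntrack *ct,
                                  u_int32_t peer_ip, u_int16_t peer_port)
{
        struct ip_conntrack_expect *exp;
        int ret;

        exp = ip_conntrack_expect_alloc(ct);
        if (!exp)
                return -ENOMEM;

        /* match a TCP connection from peer_ip (any source port) to the
           master's original source address, destination port peer_port */
        exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
        exp->tuple.src.ip = peer_ip;
        exp->tuple.src.u.tcp.port = 0;
        exp->tuple.dst.u.tcp.port = htons(peer_port);
        exp->tuple.dst.protonum = IPPROTO_TCP;

        memset(&exp->mask, 0, sizeof(exp->mask));
        exp->mask.src.ip = 0xFFFFFFFF;
        exp->mask.dst.ip = 0xFFFFFFFF;
        exp->mask.dst.u.tcp.port = 0xFFFF;
        exp->mask.dst.protonum = 0xFF;

        exp->expectfn = NULL;
        exp->flags = 0;

        ret = ip_conntrack_expect_related(exp);
        ip_conntrack_expect_put(exp);
        return ret;
}
#endif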
951
952 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
953 {
954         if (atomic_dec_and_test(&exp->use))
955                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
956 }
957
958 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
959 {
960         atomic_inc(&exp->use);
961         exp->master->expecting++;
962         list_add(&exp->list, &ip_conntrack_expect_list);
963
964         init_timer(&exp->timeout);
965         exp->timeout.data = (unsigned long)exp;
966         exp->timeout.function = expectation_timed_out;
967         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
968         add_timer(&exp->timeout);
969
970         exp->id = ++ip_conntrack_expect_next_id;
971         atomic_inc(&exp->use);
972         CONNTRACK_STAT_INC(expect_create);
973 }
974
975 /* Race with expectations being used means we could have none to find; OK. */
976 static void evict_oldest_expect(struct ip_conntrack *master)
977 {
978         struct ip_conntrack_expect *i;
979
980         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
981                 if (i->master == master) {
982                         if (del_timer(&i->timeout)) {
983                                 ip_ct_unlink_expect(i);
984                                 ip_conntrack_expect_put(i);
985                         }
986                         break;
987                 }
988         }
989 }
990
991 static inline int refresh_timer(struct ip_conntrack_expect *i)
992 {
993         if (!del_timer(&i->timeout))
994                 return 0;
995
996         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
997         add_timer(&i->timeout);
998         return 1;
999 }
1000
1001 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1002 {
1003         struct ip_conntrack_expect *i;
1004         int ret;
1005
1006         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1007         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1008         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1009
1010         write_lock_bh(&ip_conntrack_lock);
1011         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1012                 if (expect_matches(i, expect)) {
1013                         /* Refresh timer: if it's dying, ignore.. */
1014                         if (refresh_timer(i)) {
1015                                 ret = 0;
1016                                 goto out;
1017                         }
1018                 } else if (expect_clash(i, expect)) {
1019                         ret = -EBUSY;
1020                         goto out;
1021                 }
1022         }
1023
1024         /* Will be over limit? */
1025         if (expect->master->helper->max_expected && 
1026             expect->master->expecting >= expect->master->helper->max_expected)
1027                 evict_oldest_expect(expect->master);
1028
1029         ip_conntrack_expect_insert(expect);
1030         ip_conntrack_expect_event(IPEXP_NEW, expect);
1031         ret = 0;
1032 out:
1033         write_unlock_bh(&ip_conntrack_lock);
1034         return ret;
1035 }
1036
1037 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1038    implicitly racy: see __ip_conntrack_confirm */
1039 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1040                               const struct ip_conntrack_tuple *newreply)
1041 {
1042         write_lock_bh(&ip_conntrack_lock);
1043         /* Should be unconfirmed, so not in hash table yet */
1044         IP_NF_ASSERT(!is_confirmed(conntrack));
1045
1046         DEBUGP("Altering reply tuple of %p to ", conntrack);
1047         DUMP_TUPLE(newreply);
1048
1049         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1050         if (!conntrack->master && conntrack->expecting == 0)
1051                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1052         write_unlock_bh(&ip_conntrack_lock);
1053 }
1054
1055 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1056 {
1057         BUG_ON(me->timeout == 0);
1058         write_lock_bh(&ip_conntrack_lock);
1059         list_prepend(&helpers, me);
1060         write_unlock_bh(&ip_conntrack_lock);
1061
1062         return 0;
1063 }
1064
1065 struct ip_conntrack_helper *
1066 __ip_conntrack_helper_find_byname(const char *name)
1067 {
1068         struct ip_conntrack_helper *h;
1069
1070         list_for_each_entry(h, &helpers, list) {
1071                 if (!strcmp(h->name, name))
1072                         return h;
1073         }
1074
1075         return NULL;
1076 }
1077
1078 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1079                          const struct ip_conntrack_helper *me)
1080 {
1081         if (tuplehash_to_ctrack(i)->helper == me) {
1082                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1083                 tuplehash_to_ctrack(i)->helper = NULL;
1084         }
1085         return 0;
1086 }
1087
1088 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1089 {
1090         unsigned int i;
1091         struct ip_conntrack_expect *exp, *tmp;
1092
1093         /* Need write lock here, to delete helper. */
1094         write_lock_bh(&ip_conntrack_lock);
1095         LIST_DELETE(&helpers, me);
1096
1097         /* Get rid of expectations */
1098         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1099                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1100                         ip_ct_unlink_expect(exp);
1101                         ip_conntrack_expect_put(exp);
1102                 }
1103         }
1104         /* Clear the helper from remaining conntracks (unconfirmed and hashed). */
1105         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1106         for (i = 0; i < ip_conntrack_htable_size; i++)
1107                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1108                             struct ip_conntrack_tuple_hash *, me);
1109         write_unlock_bh(&ip_conntrack_lock);
1110
1111         /* Someone could still be looking at the helper in a bh. */
1112         synchronize_net();
1113 }
1114
1115 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1116 void __ip_ct_refresh_acct(struct ip_conntrack *ct, 
1117                         enum ip_conntrack_info ctinfo,
1118                         const struct sk_buff *skb,
1119                         unsigned long extra_jiffies,
1120                         int do_acct)
1121 {
1122         int event = 0;
1123
1124         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1125         IP_NF_ASSERT(skb);
1126
1127         write_lock_bh(&ip_conntrack_lock);
1128
1129         /* If not in hash table, timer will not be active yet */
1130         if (!is_confirmed(ct)) {
1131                 ct->timeout.expires = extra_jiffies;
1132                 event = IPCT_REFRESH;
1133         } else {
1134                 /* Need del_timer for race avoidance (may already be dying). */
1135                 if (del_timer(&ct->timeout)) {
1136                         ct->timeout.expires = jiffies + extra_jiffies;
1137                         add_timer(&ct->timeout);
1138                         event = IPCT_REFRESH;
1139                 }
1140         }
1141
1142 #ifdef CONFIG_IP_NF_CT_ACCT
1143         if (do_acct) {
1144                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1145                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1146                                                 ntohs(skb->nh.iph->tot_len);
1147                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1148                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1149                         event |= IPCT_COUNTER_FILLING;
1150         }
1151 #endif
1152
1153         write_unlock_bh(&ip_conntrack_lock);
1154
1155         /* must be unlocked when calling event cache */
1156         if (event)
1157                 ip_conntrack_event_cache(event, skb);
1158 }
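
/* Usage sketch (not built): a protocol's packet() handler would extend
 * the timeout on every valid packet, roughly like this.  The 30-second
 * timeout is an arbitrary placeholder, not a value used by any real
 * protocol module; the handler signature mirrors the proto->packet()
 * call in ip_conntrack_in() above. */
#if 0
static int example_packet(struct ip_conntrack *ct,
                          const struct sk_buff *skb,
                          enum ip_conntrack_info ctinfo)
{
        /* refresh the timer and account the packet/byte counters */
        __ip_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ, 1);
        return NF_ACCEPT;
}
#endif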
1159
1160 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1161     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1162 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1163  * in ip_conntrack_core, since we don't want the protocols to autoload
1164  * or depend on ctnetlink */
1165 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1166                                const struct ip_conntrack_tuple *tuple)
1167 {
1168         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1169                 &tuple->src.u.tcp.port);
1170         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1171                 &tuple->dst.u.tcp.port);
1172         return 0;
1173
1174 nfattr_failure:
1175         return -1;
1176 }
1177
1178 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1179                                struct ip_conntrack_tuple *t)
1180 {
1181         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1182                 return -EINVAL;
1183
1184         t->src.u.tcp.port =
1185                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1186         t->dst.u.tcp.port =
1187                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1188
1189         return 0;
1190 }
1191 #endif
1192
1193 /* Returns new sk_buff, or NULL */
1194 struct sk_buff *
1195 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1196 {
1197         skb_orphan(skb);
1198
1199         local_bh_disable(); 
1200         skb = ip_defrag(skb, user);
1201         local_bh_enable();
1202
1203         if (skb)
1204                 ip_send_check(skb->nh.iph);
1205         return skb;
1206 }
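
/* Usage sketch (not built): callers defragment before connection
 * tracking sees the packet, along these lines.  The
 * IP_DEFRAG_CONNTRACK_IN user constant is assumed to come from the
 * ip_defrag() users enum in <net/ip.h> of this kernel; the surrounding
 * hook logic is illustrative only. */
#if 0
static unsigned int example_defrag(struct sk_buff **pskb)
{
        if ((*pskb)->nh.iph->frag_off & htons(IP_MF | IP_OFFSET)) {
                *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_CONNTRACK_IN);
                if (!*pskb)
                        return NF_STOLEN;
        }
        return NF_ACCEPT;
}
#endif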
1207
1208 /* Used by ipt_REJECT. */
1209 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1210 {
1211         struct ip_conntrack *ct;
1212         enum ip_conntrack_info ctinfo;
1213
1214         /* This ICMP is in reverse direction to the packet which caused it */
1215         ct = ip_conntrack_get(skb, &ctinfo);
1216         
1217         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1218                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1219         else
1220                 ctinfo = IP_CT_RELATED;
1221
1222         /* Attach to new skbuff, and increment count */
1223         nskb->nfct = &ct->ct_general;
1224         nskb->nfctinfo = ctinfo;
1225         nf_conntrack_get(nskb->nfct);
1226 }
1227
1228 static inline int
1229 do_iter(const struct ip_conntrack_tuple_hash *i,
1230         int (*iter)(struct ip_conntrack *i, void *data),
1231         void *data)
1232 {
1233         return iter(tuplehash_to_ctrack(i), data);
1234 }
1235
1236 /* Bring out ya dead! */
1237 static struct ip_conntrack_tuple_hash *
1238 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1239                 void *data, unsigned int *bucket)
1240 {
1241         struct ip_conntrack_tuple_hash *h = NULL;
1242
1243         write_lock_bh(&ip_conntrack_lock);
1244         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1245                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1246                                 struct ip_conntrack_tuple_hash *, iter, data);
1247                 if (h)
1248                         break;
1249         }
1250         if (!h)
1251                 h = LIST_FIND_W(&unconfirmed, do_iter,
1252                                 struct ip_conntrack_tuple_hash *, iter, data);
1253         if (h)
1254                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1255         write_unlock_bh(&ip_conntrack_lock);
1256
1257         return h;
1258 }
1259
1260 void
1261 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1262 {
1263         struct ip_conntrack_tuple_hash *h;
1264         unsigned int bucket = 0;
1265
1266         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1267                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1268                 /* Time to push up daisies... */
1269                 if (del_timer(&ct->timeout))
1270                         death_by_timeout((unsigned long)ct);
1271                 /* ... else the timer will get him soon. */
1272
1273                 ip_conntrack_put(ct);
1274         }
1275 }
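
/* Usage sketch (not built): ip_ct_iterate_cleanup() walks every
 * conntrack (hashed and unconfirmed) and kills those for which the
 * callback returns nonzero, exactly as ip_conntrack_flush() does with
 * kill_all() below.  The source-address filter here is a made-up
 * example. */
#if 0
static int example_kill_by_saddr(struct ip_conntrack *ct, void *data)
{
        u_int32_t saddr = *(u_int32_t *)data;

        return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == saddr;
}

static void example_flush_by_saddr(u_int32_t saddr)
{
        ip_ct_iterate_cleanup(example_kill_by_saddr, &saddr);
}
#endif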
1276
1277 /* Fast function for those who don't want to parse /proc (and I don't
1278    blame them). */
1279 /* Reversing the socket's dst/src point of view gives us the reply
1280    mapping. */
1281 static int
1282 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1283 {
1284         struct inet_sock *inet = inet_sk(sk);
1285         struct ip_conntrack_tuple_hash *h;
1286         struct ip_conntrack_tuple tuple;
1287         
1288         IP_CT_TUPLE_U_BLANK(&tuple);
1289         tuple.src.ip = inet->rcv_saddr;
1290         tuple.src.u.tcp.port = inet->sport;
1291         tuple.dst.ip = inet->daddr;
1292         tuple.dst.u.tcp.port = inet->dport;
1293         tuple.dst.protonum = IPPROTO_TCP;
1294
1295         /* We only do TCP at the moment: is there a better way? */
1296         if (strcmp(sk->sk_prot->name, "TCP")) {
1297                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1298                 return -ENOPROTOOPT;
1299         }
1300
1301         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1302                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1303                        *len, sizeof(struct sockaddr_in));
1304                 return -EINVAL;
1305         }
1306
1307         h = ip_conntrack_find_get(&tuple, NULL);
1308         if (h) {
1309                 struct sockaddr_in sin;
1310                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1311
1312                 sin.sin_family = AF_INET;
1313                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1314                         .tuple.dst.u.tcp.port;
1315                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1316                         .tuple.dst.ip;
1317
1318                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1319                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1320                 ip_conntrack_put(ct);
1321                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1322                         return -EFAULT;
1323                 else
1324                         return 0;
1325         }
1326         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1327                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1328                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1329         return -ENOENT;
1330 }
1331
1332 static struct nf_sockopt_ops so_getorigdst = {
1333         .pf             = PF_INET,
1334         .get_optmin     = SO_ORIGINAL_DST,
1335         .get_optmax     = SO_ORIGINAL_DST+1,
1336         .get            = &getorigdst,
1337 };
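
/* Userspace usage sketch: a transparent proxy that accepted a
 * redirected TCP connection recovers the original destination with
 * getsockopt().  This is userspace C shown only as documentation; it is
 * not kernel code.  SO_ORIGINAL_DST comes from
 * <linux/netfilter_ipv4.h>, matching the sockopt registered above. */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4.h>       /* SO_ORIGINAL_DST */

static int get_original_dst(int fd, struct sockaddr_in *dst)
{
        socklen_t len = sizeof(*dst);

        return getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, dst, &len);
}
#endif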
1338
1339 static int kill_all(struct ip_conntrack *i, void *data)
1340 {
1341         return 1;
1342 }
1343
1344 static void free_conntrack_hash(void)
1345 {
1346         if (ip_conntrack_vmalloc)
1347                 vfree(ip_conntrack_hash);
1348         else
1349                 free_pages((unsigned long)ip_conntrack_hash, 
1350                            get_order(sizeof(struct list_head)
1351                                      * ip_conntrack_htable_size));
1352 }
1353
1354 void ip_conntrack_flush(void)
1355 {
1356         /* This makes sure all current packets have passed through
1357            the netfilter framework.  Roll on, two-stage module
1358            delete... */
1359         synchronize_net();
1360
1361         ip_ct_event_cache_flush();
1362  i_see_dead_people:
1363         ip_ct_iterate_cleanup(kill_all, NULL);
1364         if (atomic_read(&ip_conntrack_count) != 0) {
1365                 schedule();
1366                 goto i_see_dead_people;
1367         }
1368         /* wait until all references to ip_conntrack_untracked are dropped */
1369         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1370                 schedule();
1371 }
1372
1373 /* Mishearing the voices in his head, our hero wonders how he's
1374    supposed to kill the mall. */
1375 void ip_conntrack_cleanup(void)
1376 {
1377         ip_ct_attach = NULL;
1378         ip_conntrack_flush();
1379         kmem_cache_destroy(ip_conntrack_cachep);
1380         kmem_cache_destroy(ip_conntrack_expect_cachep);
1381         free_conntrack_hash();
1382         nf_unregister_sockopt(&so_getorigdst);
1383 }
1384
1385 static int hashsize;
1386 module_param(hashsize, int, 0400);
1387
1388 int __init ip_conntrack_init(void)
1389 {
1390         unsigned int i;
1391         int ret;
1392
1393         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1394          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1395         if (hashsize) {
1396                 ip_conntrack_htable_size = hashsize;
1397         } else {
1398                 ip_conntrack_htable_size
1399                         = (((num_physpages << PAGE_SHIFT) / 16384)
1400                            / sizeof(struct list_head));
1401                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1402                         ip_conntrack_htable_size = 8192;
1403                 if (ip_conntrack_htable_size < 16)
1404                         ip_conntrack_htable_size = 16;
1405         }
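        /* Worked example of the sizing above (a sketch, assuming i386 with
         * 4KB pages and an 8-byte struct list_head):
         *   32MB -> num_physpages = 8192;   (8192 << 12) / 16384 / 8 = 256
         *   1GB  -> num_physpages = 262144; (262144 << 12) / 16384 / 8 = 8192
         * which matches the 256- and 8192-bucket figures quoted above. */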
1406         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1407
1408         printk("ip_conntrack version %s (%u buckets, %d max)"
1409                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1410                ip_conntrack_htable_size, ip_conntrack_max,
1411                sizeof(struct ip_conntrack));
1412
1413         ret = nf_register_sockopt(&so_getorigdst);
1414         if (ret != 0) {
1415                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1416                 return ret;
1417         }
1418
1419         /* AK: the hash table is twice as big as needed because it
1420            uses list_head.  It would be much nicer for caches to use a
1421            single-pointer list head here. */
1422         ip_conntrack_vmalloc = 0; 
1423         ip_conntrack_hash 
1424                 =(void*)__get_free_pages(GFP_KERNEL, 
1425                                          get_order(sizeof(struct list_head)
1426                                                    *ip_conntrack_htable_size));
1427         if (!ip_conntrack_hash) { 
1428                 ip_conntrack_vmalloc = 1;
1429                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1430                 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1431                                             * ip_conntrack_htable_size);
1432         }
1433         if (!ip_conntrack_hash) {
1434                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1435                 goto err_unreg_sockopt;
1436         }
1437
1438         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1439                                                 sizeof(struct ip_conntrack), 0,
1440                                                 0, NULL, NULL);
1441         if (!ip_conntrack_cachep) {
1442                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1443                 goto err_free_hash;
1444         }
1445
1446         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1447                                         sizeof(struct ip_conntrack_expect),
1448                                         0, 0, NULL, NULL);
1449         if (!ip_conntrack_expect_cachep) {
1450                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1451                 goto err_free_conntrack_slab;
1452         }
1453
1454         /* Don't NEED lock here, but good form anyway. */
1455         write_lock_bh(&ip_conntrack_lock);
1456         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1457                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1458         /* Sew in builtin protocols. */
1459         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1460         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1461         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1462         write_unlock_bh(&ip_conntrack_lock);
1463
1464         for (i = 0; i < ip_conntrack_htable_size; i++)
1465                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1466
1467         /* For use by ipt_REJECT */
1468         ip_ct_attach = ip_conntrack_attach;
1469
1470         /* Set up fake conntrack:
1471             - to never be deleted, not in any hashes */
1472         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1473         /*  - and to look like a confirmed connection */
1474         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1475
1476         return ret;
1477
1478 err_free_conntrack_slab:
1479         kmem_cache_destroy(ip_conntrack_cachep);
1480 err_free_hash:
1481         free_conntrack_hash();
1482 err_unreg_sockopt:
1483         nf_unregister_sockopt(&so_getorigdst);
1484
1485         return -ENOMEM;
1486 }