pandora-kernel.git: net/ipv4/netfilter/ip_conntrack_core.c (f8cd8e42961ef1f48c516dcdf4e051685745e902)
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/config.h>
21 #include <linux/types.h>
22 #include <linux/icmp.h>
23 #include <linux/ip.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/module.h>
27 #include <linux/skbuff.h>
28 #include <linux/proc_fs.h>
29 #include <linux/vmalloc.h>
30 #include <net/checksum.h>
31 #include <net/ip.h>
32 #include <linux/stddef.h>
33 #include <linux/sysctl.h>
34 #include <linux/slab.h>
35 #include <linux/random.h>
36 #include <linux/jhash.h>
37 #include <linux/err.h>
38 #include <linux/percpu.h>
39 #include <linux/moduleparam.h>
40 #include <linux/notifier.h>
41
42 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
43    registrations, and conntrack timers. */
44 #define ASSERT_READ_LOCK(x)
45 #define ASSERT_WRITE_LOCK(x)
46
47 #include <linux/netfilter_ipv4/ip_conntrack.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
50 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
51 #include <linux/netfilter_ipv4/listhelp.h>
52
53 #define IP_CONNTRACK_VERSION    "2.3"
54
55 #if 0
56 #define DEBUGP printk
57 #else
58 #define DEBUGP(format, args...)
59 #endif
60
61 DEFINE_RWLOCK(ip_conntrack_lock);
62
63 /* ip_conntrack_standalone needs this */
64 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
65
66 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
67 LIST_HEAD(ip_conntrack_expect_list);
68 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
69 static LIST_HEAD(helpers);
70 unsigned int ip_conntrack_htable_size = 0;
71 int ip_conntrack_max;
72 struct list_head *ip_conntrack_hash;
73 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
74 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
75 struct ip_conntrack ip_conntrack_untracked;
76 unsigned int ip_ct_log_invalid;
77 static LIST_HEAD(unconfirmed);
78 static int ip_conntrack_vmalloc;
79
80 static unsigned int ip_conntrack_next_id = 1;
81 static unsigned int ip_conntrack_expect_next_id = 1;
82 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
83 struct notifier_block *ip_conntrack_chain;
84 struct notifier_block *ip_conntrack_expect_chain;
85
86 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
87
88 /* deliver cached events and clear cache entry - must be called with locally
89  * disabled softirqs */
90 static inline void
91 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
92 {
93         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
94         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
95                 notifier_call_chain(&ip_conntrack_chain, ecache->events,
96                                     ecache->ct);
97         ecache->events = 0;
98         ip_conntrack_put(ecache->ct);
99         ecache->ct = NULL;
100 }
101
102 /* Deliver all cached events for a particular conntrack. This is called
103  * by code prior to async packet handling or freeing the skb */
104 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
105 {
106         struct ip_conntrack_ecache *ecache;
107         
108         local_bh_disable();
109         ecache = &__get_cpu_var(ip_conntrack_ecache);
110         if (ecache->ct == ct)
111                 __ip_ct_deliver_cached_events(ecache);
112         local_bh_enable();
113 }
114
115 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
116 {
117         struct ip_conntrack_ecache *ecache;
118
119         /* take care of delivering potentially old events */
120         ecache = &__get_cpu_var(ip_conntrack_ecache);
121         BUG_ON(ecache->ct == ct);
122         if (ecache->ct)
123                 __ip_ct_deliver_cached_events(ecache);
124         /* initialize for this conntrack/packet */
125         ecache->ct = ct;
126         nf_conntrack_get(&ct->ct_general);
127 }
128
129 /* flush the event cache - touches other CPUs' data and must not be called while
130  * packets are still passing through the code */
131 static void ip_ct_event_cache_flush(void)
132 {
133         struct ip_conntrack_ecache *ecache;
134         int cpu;
135
136         for_each_cpu(cpu) {
137                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
138                 if (ecache->ct)
139                         ip_conntrack_put(ecache->ct);
140         }
141 }
142 #else
143 static inline void ip_ct_event_cache_flush(void) {}
144 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
145
146 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
147
148 static int ip_conntrack_hash_rnd_initted;
149 static unsigned int ip_conntrack_hash_rnd;
150
151 static u_int32_t
152 hash_conntrack(const struct ip_conntrack_tuple *tuple)
153 {
154 #if 0
155         dump_tuple(tuple);
156 #endif
157         return (jhash_3words(tuple->src.ip,
158                              (tuple->dst.ip ^ tuple->dst.protonum),
159                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
160                              ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
161 }
162
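/* Fill in the original-direction tuple for a packet: addresses and protocol
   come from the IP header, then the L4 protocol helper extracts ports/IDs. */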
163 int
164 ip_ct_get_tuple(const struct iphdr *iph,
165                 const struct sk_buff *skb,
166                 unsigned int dataoff,
167                 struct ip_conntrack_tuple *tuple,
168                 const struct ip_conntrack_protocol *protocol)
169 {
170         /* Should never happen */
171         if (iph->frag_off & htons(IP_OFFSET)) {
172                 printk("ip_conntrack_core: Frag of proto %u.\n",
173                        iph->protocol);
174                 return 0;
175         }
176
177         tuple->src.ip = iph->saddr;
178         tuple->dst.ip = iph->daddr;
179         tuple->dst.protonum = iph->protocol;
180         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
181
182         return protocol->pkt_to_tuple(skb, dataoff, tuple);
183 }
184
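/* Build the reply-direction (inverted) tuple corresponding to @orig. */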
185 int
186 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
187                    const struct ip_conntrack_tuple *orig,
188                    const struct ip_conntrack_protocol *protocol)
189 {
190         inverse->src.ip = orig->dst.ip;
191         inverse->dst.ip = orig->src.ip;
192         inverse->dst.protonum = orig->dst.protonum;
193         inverse->dst.dir = !orig->dst.dir;
194
195         return protocol->invert_tuple(inverse, orig);
196 }
197
198
199 /* ip_conntrack_expect helper functions */
200 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
201 {
202         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
203         IP_NF_ASSERT(!timer_pending(&exp->timeout));
204         list_del(&exp->list);
205         CONNTRACK_STAT_INC(expect_delete);
206         exp->master->expecting--;
207         ip_conntrack_expect_put(exp);
208 }
209
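/* Timer callback: the expectation expired without being matched, so unlink
   it from the global list and release it. */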
210 static void expectation_timed_out(unsigned long ul_expect)
211 {
212         struct ip_conntrack_expect *exp = (void *)ul_expect;
213
214         write_lock_bh(&ip_conntrack_lock);
215         ip_ct_unlink_expect(exp);
216         write_unlock_bh(&ip_conntrack_lock);
217         ip_conntrack_expect_put(exp);
218 }
219
220 struct ip_conntrack_expect *
221 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
222 {
223         struct ip_conntrack_expect *i;
224         
225         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
226                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
227                         atomic_inc(&i->use);
228                         return i;
229                 }
230         }
231         return NULL;
232 }
233
234 /* Just find an expectation corresponding to a tuple. */
235 struct ip_conntrack_expect *
236 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
237 {
238         struct ip_conntrack_expect *i;
239         
240         read_lock_bh(&ip_conntrack_lock);
241         i = __ip_conntrack_expect_find(tuple);
242         read_unlock_bh(&ip_conntrack_lock);
243
244         return i;
245 }
246
247 /* If an expectation for this connection is found, it is deleted from the
248  * global list and then returned. */
249 static struct ip_conntrack_expect *
250 find_expectation(const struct ip_conntrack_tuple *tuple)
251 {
252         struct ip_conntrack_expect *i;
253
254         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
255                 /* If master is not in hash table yet (ie. packet hasn't left
256                    this machine yet), how can other end know about expected?
257                    Hence these are not the droids you are looking for (if
258                    master ct never got confirmed, we'd hold a reference to it
259                    and weird things would happen to future packets). */
260                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
261                     && is_confirmed(i->master)) {
262                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
263                                 atomic_inc(&i->use);
264                                 return i;
265                         } else if (del_timer(&i->timeout)) {
266                                 ip_ct_unlink_expect(i);
267                                 return i;
268                         }
269                 }
270         }
271         return NULL;
272 }
273
274 /* delete all expectations for this conntrack */
275 void ip_ct_remove_expectations(struct ip_conntrack *ct)
276 {
277         struct ip_conntrack_expect *i, *tmp;
278
279         /* Optimization: most connections never expect any others. */
280         if (ct->expecting == 0)
281                 return;
282
283         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
284                 if (i->master == ct && del_timer(&i->timeout)) {
285                         ip_ct_unlink_expect(i);
286                         ip_conntrack_expect_put(i);
287                 }
288         }
289 }
290
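/* Unlink a conntrack from both hash chains and remove its pending
   expectations.  Caller holds ip_conntrack_lock for writing. */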
291 static void
292 clean_from_lists(struct ip_conntrack *ct)
293 {
294         unsigned int ho, hr;
295         
296         DEBUGP("clean_from_lists(%p)\n", ct);
297         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
298
299         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
300         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
301         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
302         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
303
304         /* Destroy all pending expectations */
305         ip_ct_remove_expectations(ct);
306 }
307
308 static void
309 destroy_conntrack(struct nf_conntrack *nfct)
310 {
311         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
312         struct ip_conntrack_protocol *proto;
313
314         DEBUGP("destroy_conntrack(%p)\n", ct);
315         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
316         IP_NF_ASSERT(!timer_pending(&ct->timeout));
317
318         ip_conntrack_event(IPCT_DESTROY, ct);
319         set_bit(IPS_DYING_BIT, &ct->status);
320
321         /* To make sure we don't get any weird locking issues here:
322          * destroy_conntrack() MUST NOT be called with a write lock
323          * to ip_conntrack_lock!!! -HW */
324         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
325         if (proto && proto->destroy)
326                 proto->destroy(ct);
327
328         if (ip_conntrack_destroyed)
329                 ip_conntrack_destroyed(ct);
330
331         write_lock_bh(&ip_conntrack_lock);
332         /* Expectations will have been removed in clean_from_lists,
333          * except TFTP can create an expectation on the first packet,
334          * before connection is in the list, so we need to clean here,
335          * too. */
336         ip_ct_remove_expectations(ct);
337
338         /* We overload first tuple to link into unconfirmed list. */
339         if (!is_confirmed(ct)) {
340                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
341                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
342         }
343
344         CONNTRACK_STAT_INC(delete);
345         write_unlock_bh(&ip_conntrack_lock);
346
347         if (ct->master)
348                 ip_conntrack_put(ct->master);
349
350         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
351         ip_conntrack_free(ct);
352 }
353
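/* Timer callback: the connection timed out; unlink it from the lists and
   drop the reference held by the hash table. */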
354 static void death_by_timeout(unsigned long ul_conntrack)
355 {
356         struct ip_conntrack *ct = (void *)ul_conntrack;
357
358         write_lock_bh(&ip_conntrack_lock);
359         /* Inside lock so preempt is disabled on module removal path.
360          * Otherwise we can get spurious warnings. */
361         CONNTRACK_STAT_INC(delete_list);
362         clean_from_lists(ct);
363         write_unlock_bh(&ip_conntrack_lock);
364         ip_conntrack_put(ct);
365 }
366
367 static inline int
368 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
369                     const struct ip_conntrack_tuple *tuple,
370                     const struct ip_conntrack *ignored_conntrack)
371 {
372         ASSERT_READ_LOCK(&ip_conntrack_lock);
373         return tuplehash_to_ctrack(i) != ignored_conntrack
374                 && ip_ct_tuple_equal(tuple, &i->tuple);
375 }
376
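/* Look up a tuple in the hash table without taking a reference; caller
   holds ip_conntrack_lock. */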
377 struct ip_conntrack_tuple_hash *
378 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
379                     const struct ip_conntrack *ignored_conntrack)
380 {
381         struct ip_conntrack_tuple_hash *h;
382         unsigned int hash = hash_conntrack(tuple);
383
384         ASSERT_READ_LOCK(&ip_conntrack_lock);
385         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
386                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
387                         CONNTRACK_STAT_INC(found);
388                         return h;
389                 }
390                 CONNTRACK_STAT_INC(searched);
391         }
392
393         return NULL;
394 }
395
396 /* Find a connection corresponding to a tuple. */
397 struct ip_conntrack_tuple_hash *
398 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
399                       const struct ip_conntrack *ignored_conntrack)
400 {
401         struct ip_conntrack_tuple_hash *h;
402
403         read_lock_bh(&ip_conntrack_lock);
404         h = __ip_conntrack_find(tuple, ignored_conntrack);
405         if (h)
406                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
407         read_unlock_bh(&ip_conntrack_lock);
408
409         return h;
410 }
411
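/* Assign an id and link both directions of a conntrack into the hash table;
   caller holds ip_conntrack_lock for writing. */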
412 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
413                                         unsigned int hash,
414                                         unsigned int repl_hash) 
415 {
416         ct->id = ++ip_conntrack_next_id;
417         list_prepend(&ip_conntrack_hash[hash],
418                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
419         list_prepend(&ip_conntrack_hash[repl_hash],
420                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
421 }
422
423 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
424 {
425         unsigned int hash, repl_hash;
426
427         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
428         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
429
430         write_lock_bh(&ip_conntrack_lock);
431         __ip_conntrack_hash_insert(ct, hash, repl_hash);
432         write_unlock_bh(&ip_conntrack_lock);
433 }
434
435 /* Confirm a connection given skb; places it in hash table */
436 int
437 __ip_conntrack_confirm(struct sk_buff **pskb)
438 {
439         unsigned int hash, repl_hash;
440         struct ip_conntrack *ct;
441         enum ip_conntrack_info ctinfo;
442
443         ct = ip_conntrack_get(*pskb, &ctinfo);
444
445         /* ipt_REJECT uses ip_conntrack_attach to attach related
446            ICMP/TCP RST packets in the other direction.  The actual packet
447            which created the connection will be IP_CT_NEW or, for an
448            expected connection, IP_CT_RELATED. */
449         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
450                 return NF_ACCEPT;
451
452         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
453         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
454
455         /* We're not in hash table, and we refuse to set up related
456            connections for unconfirmed conns.  But packet copies and
457            REJECT will give spurious warnings here. */
458         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
459
460         /* No external references means no one else could have
461            confirmed us. */
462         IP_NF_ASSERT(!is_confirmed(ct));
463         DEBUGP("Confirming conntrack %p\n", ct);
464
465         write_lock_bh(&ip_conntrack_lock);
466
467         /* See if there's one in the list already, including reverse:
468            NAT could have grabbed it without realizing, since we're
469            not in the hash.  If there is, we lost the race. */
470         if (!LIST_FIND(&ip_conntrack_hash[hash],
471                        conntrack_tuple_cmp,
472                        struct ip_conntrack_tuple_hash *,
473                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
474             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
475                           conntrack_tuple_cmp,
476                           struct ip_conntrack_tuple_hash *,
477                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
478                 /* Remove from unconfirmed list */
479                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
480
481                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
482                 /* Timer relative to confirmation time, not original
483                    setting time, otherwise we'd get timer wrap in
484                    weird delay cases. */
485                 ct->timeout.expires += jiffies;
486                 add_timer(&ct->timeout);
487                 atomic_inc(&ct->ct_general.use);
488                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
489                 CONNTRACK_STAT_INC(insert);
490                 write_unlock_bh(&ip_conntrack_lock);
491                 if (ct->helper)
492                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
493 #ifdef CONFIG_IP_NF_NAT_NEEDED
494                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
495                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
496                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
497 #endif
498                 ip_conntrack_event_cache(master_ct(ct) ?
499                                          IPCT_RELATED : IPCT_NEW, *pskb);
500
501                 return NF_ACCEPT;
502         }
503
504         CONNTRACK_STAT_INC(insert_failed);
505         write_unlock_bh(&ip_conntrack_lock);
506
507         return NF_DROP;
508 }
509
510 /* Returns true if a connection corresponds to the tuple (required
511    for NAT). */
512 int
513 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
514                          const struct ip_conntrack *ignored_conntrack)
515 {
516         struct ip_conntrack_tuple_hash *h;
517
518         read_lock_bh(&ip_conntrack_lock);
519         h = __ip_conntrack_find(tuple, ignored_conntrack);
520         read_unlock_bh(&ip_conntrack_lock);
521
522         return h != NULL;
523 }
524
525 /* There's a small race here where we may free a just-assured
526    connection.  Too bad: we're in trouble anyway. */
527 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
528 {
529         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
530 }
531
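/* Table full: try to evict an entry from this hash chain that never reached
   ASSURED state, to make room.  Returns 1 if an entry was dropped. */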
532 static int early_drop(struct list_head *chain)
533 {
534         /* Traverse backwards: gives us oldest, which is roughly LRU */
535         struct ip_conntrack_tuple_hash *h;
536         struct ip_conntrack *ct = NULL;
537         int dropped = 0;
538
539         read_lock_bh(&ip_conntrack_lock);
540         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
541         if (h) {
542                 ct = tuplehash_to_ctrack(h);
543                 atomic_inc(&ct->ct_general.use);
544         }
545         read_unlock_bh(&ip_conntrack_lock);
546
547         if (!ct)
548                 return dropped;
549
550         if (del_timer(&ct->timeout)) {
551                 death_by_timeout((unsigned long)ct);
552                 dropped = 1;
553                 CONNTRACK_STAT_INC(early_drop);
554         }
555         ip_conntrack_put(ct);
556         return dropped;
557 }
558
559 static inline int helper_cmp(const struct ip_conntrack_helper *i,
560                              const struct ip_conntrack_tuple *rtuple)
561 {
562         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
563 }
564
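/* Find a registered helper whose tuple/mask matches the reply tuple. */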
565 static struct ip_conntrack_helper *
566 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
567 {
568         return LIST_FIND(&helpers, helper_cmp,
569                          struct ip_conntrack_helper *,
570                          tuple);
571 }
572
573 struct ip_conntrack_helper *
574 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
575 {
576         struct ip_conntrack_helper *helper;
577
578         /* need ip_conntrack_lock to ensure that the helper exists until
579          * try_module_get() is called */
580         read_lock_bh(&ip_conntrack_lock);
581
582         helper = __ip_conntrack_helper_find(tuple);
583         if (helper) {
584                 /* need to increase the module usage count to ensure the helper
585                  * will not go away while the caller is e.g. busy putting a
586                  * conntrack in the hash that uses the helper */
587                 if (!try_module_get(helper->me))
588                         helper = NULL;
589         }
590
591         read_unlock_bh(&ip_conntrack_lock);
592
593         return helper;
594 }
595
596 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
597 {
598         module_put(helper->me);
599 }
600
601 struct ip_conntrack_protocol *
602 __ip_conntrack_proto_find(u_int8_t protocol)
603 {
604         return ip_ct_protos[protocol];
605 }
606
607 /* this is guaranteed to always return a valid protocol helper, since
608  * it falls back to generic_protocol */
609 struct ip_conntrack_protocol *
610 ip_conntrack_proto_find_get(u_int8_t protocol)
611 {
612         struct ip_conntrack_protocol *p;
613
614         preempt_disable();
615         p = __ip_conntrack_proto_find(protocol);
616         if (p) {
617                 if (!try_module_get(p->me))
618                         p = &ip_conntrack_generic_protocol;
619         }
620         preempt_enable();
621         
622         return p;
623 }
624
625 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
626 {
627         module_put(p->me);
628 }
629
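/* Allocate and minimally initialise a new conntrack.  May evict an old entry
   via early_drop() when the table is full; returns ERR_PTR(-ENOMEM) on
   failure.  The timeout timer is set up but not started. */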
630 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
631                                         struct ip_conntrack_tuple *repl)
632 {
633         struct ip_conntrack *conntrack;
634
635         if (!ip_conntrack_hash_rnd_initted) {
636                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
637                 ip_conntrack_hash_rnd_initted = 1;
638         }
639
640         if (ip_conntrack_max
641             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
642                 unsigned int hash = hash_conntrack(orig);
643                 /* Try dropping from this hash chain. */
644                 if (!early_drop(&ip_conntrack_hash[hash])) {
645                         if (net_ratelimit())
646                                 printk(KERN_WARNING
647                                        "ip_conntrack: table full, dropping"
648                                        " packet.\n");
649                         return ERR_PTR(-ENOMEM);
650                 }
651         }
652
653         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
654         if (!conntrack) {
655                 DEBUGP("Can't allocate conntrack.\n");
656                 return ERR_PTR(-ENOMEM);
657         }
658
659         memset(conntrack, 0, sizeof(*conntrack));
660         atomic_set(&conntrack->ct_general.use, 1);
661         conntrack->ct_general.destroy = destroy_conntrack;
662         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
663         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
664         /* Don't set timer yet: wait for confirmation */
665         init_timer(&conntrack->timeout);
666         conntrack->timeout.data = (unsigned long)conntrack;
667         conntrack->timeout.function = death_by_timeout;
668
669         atomic_inc(&ip_conntrack_count);
670
671         return conntrack;
672 }
673
674 void
675 ip_conntrack_free(struct ip_conntrack *conntrack)
676 {
677         atomic_dec(&ip_conntrack_count);
678         kmem_cache_free(ip_conntrack_cachep, conntrack);
679 }
680
681 /* Allocate a new conntrack: we return -ENOMEM if classification
682  * fails due to stress.  Otherwise it really is unclassifiable. */
683 static struct ip_conntrack_tuple_hash *
684 init_conntrack(struct ip_conntrack_tuple *tuple,
685                struct ip_conntrack_protocol *protocol,
686                struct sk_buff *skb)
687 {
688         struct ip_conntrack *conntrack;
689         struct ip_conntrack_tuple repl_tuple;
690         struct ip_conntrack_expect *exp;
691
692         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
693                 DEBUGP("Can't invert tuple.\n");
694                 return NULL;
695         }
696
697         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
698         if (conntrack == NULL || IS_ERR(conntrack))
699                 return (struct ip_conntrack_tuple_hash *)conntrack;
700
701         if (!protocol->new(conntrack, skb)) {
702                 ip_conntrack_free(conntrack);
703                 return NULL;
704         }
705
706         write_lock_bh(&ip_conntrack_lock);
707         exp = find_expectation(tuple);
708
709         if (exp) {
710                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
711                         conntrack, exp);
712                 /* Welcome, Mr. Bond.  We've been expecting you... */
713                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
714                 conntrack->master = exp->master;
715 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
716                 conntrack->mark = exp->master->mark;
717 #endif
718 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
719     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
720                 /* this is ugly, but there is no other place to put it */
721                 conntrack->nat.masq_index = exp->master->nat.masq_index;
722 #endif
723                 nf_conntrack_get(&conntrack->master->ct_general);
724                 CONNTRACK_STAT_INC(expect_new);
725         } else {
726                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
727
728                 CONNTRACK_STAT_INC(new);
729         }
730
731         /* Overload tuple linked list to put us in unconfirmed list. */
732         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
733
734         write_unlock_bh(&ip_conntrack_lock);
735
736         if (exp) {
737                 if (exp->expectfn)
738                         exp->expectfn(conntrack, exp);
739                 ip_conntrack_expect_put(exp);
740         }
741
742         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
743 }
744
745 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
746 static inline struct ip_conntrack *
747 resolve_normal_ct(struct sk_buff *skb,
748                   struct ip_conntrack_protocol *proto,
749                   int *set_reply,
750                   unsigned int hooknum,
751                   enum ip_conntrack_info *ctinfo)
752 {
753         struct ip_conntrack_tuple tuple;
754         struct ip_conntrack_tuple_hash *h;
755         struct ip_conntrack *ct;
756
757         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
758
759         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
760                                 &tuple,proto))
761                 return NULL;
762
763         /* look for tuple match */
764         h = ip_conntrack_find_get(&tuple, NULL);
765         if (!h) {
766                 h = init_conntrack(&tuple, proto, skb);
767                 if (!h)
768                         return NULL;
769                 if (IS_ERR(h))
770                         return (void *)h;
771         }
772         ct = tuplehash_to_ctrack(h);
773
774         /* It exists; we have (non-exclusive) reference. */
775         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
776                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
777                 /* Please set the reply bit if this packet is OK */
778                 *set_reply = 1;
779         } else {
780                 /* Once we've had two way comms, always ESTABLISHED. */
781                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
782                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
783                                ct);
784                         *ctinfo = IP_CT_ESTABLISHED;
785                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
786                         DEBUGP("ip_conntrack_in: related packet for %p\n",
787                                ct);
788                         *ctinfo = IP_CT_RELATED;
789                 } else {
790                         DEBUGP("ip_conntrack_in: new packet for %p\n",
791                                ct);
792                         *ctinfo = IP_CT_NEW;
793                 }
794                 *set_reply = 0;
795         }
796         skb->nfct = &ct->ct_general;
797         skb->nfctinfo = *ctinfo;
798         return ct;
799 }
800
801 /* Netfilter hook itself. */
802 unsigned int ip_conntrack_in(unsigned int hooknum,
803                              struct sk_buff **pskb,
804                              const struct net_device *in,
805                              const struct net_device *out,
806                              int (*okfn)(struct sk_buff *))
807 {
808         struct ip_conntrack *ct;
809         enum ip_conntrack_info ctinfo;
810         struct ip_conntrack_protocol *proto;
811         int set_reply = 0;
812         int ret;
813
814         /* Previously seen (loopback or untracked)?  Ignore. */
815         if ((*pskb)->nfct) {
816                 CONNTRACK_STAT_INC(ignore);
817                 return NF_ACCEPT;
818         }
819
820         /* Should never happen */
821         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
822                 if (net_ratelimit()) {
823                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
824                                (*pskb)->nh.iph->protocol, hooknum);
825                 }
826                 return NF_DROP;
827         }
828
829 /* Doesn't cover locally-generated broadcast, so not worth it. */
830 #if 0
831         /* Ignore broadcast: no `connection'. */
832         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
833                 printk("Broadcast packet!\n");
834                 return NF_ACCEPT;
835         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
836                    == htonl(0x000000FF)) {
837                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
838                        NIPQUAD((*pskb)->nh.iph->saddr),
839                        NIPQUAD((*pskb)->nh.iph->daddr),
840                        (*pskb)->sk, (*pskb)->pkt_type);
841         }
842 #endif
843
844         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
845
846         /* It may be a special packet, error, unclean...
847          * the inverse of the return code tells the netfilter
848          * core what to do with the packet. */
849         if (proto->error != NULL 
850             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
851                 CONNTRACK_STAT_INC(error);
852                 CONNTRACK_STAT_INC(invalid);
853                 return -ret;
854         }
855
856         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
857                 /* Not valid part of a connection */
858                 CONNTRACK_STAT_INC(invalid);
859                 return NF_ACCEPT;
860         }
861
862         if (IS_ERR(ct)) {
863                 /* Too stressed to deal. */
864                 CONNTRACK_STAT_INC(drop);
865                 return NF_DROP;
866         }
867
868         IP_NF_ASSERT((*pskb)->nfct);
869
870         ret = proto->packet(ct, *pskb, ctinfo);
871         if (ret < 0) {
872                 /* Invalid: inverse of the return code tells
873                  * the netfilter core what to do */
874                 nf_conntrack_put((*pskb)->nfct);
875                 (*pskb)->nfct = NULL;
876                 CONNTRACK_STAT_INC(invalid);
877                 return -ret;
878         }
879
880         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
881                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
882
883         return ret;
884 }
885
886 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
887                    const struct ip_conntrack_tuple *orig)
888 {
889         return ip_ct_invert_tuple(inverse, orig, 
890                                   __ip_conntrack_proto_find(orig->dst.protonum));
891 }
892
893 /* Would two expected things clash? */
894 static inline int expect_clash(const struct ip_conntrack_expect *a,
895                                const struct ip_conntrack_expect *b)
896 {
897         /* Part covered by intersection of masks must be unequal,
898            otherwise they clash */
899         struct ip_conntrack_tuple intersect_mask
900                 = { { a->mask.src.ip & b->mask.src.ip,
901                       { a->mask.src.u.all & b->mask.src.u.all } },
902                     { a->mask.dst.ip & b->mask.dst.ip,
903                       { a->mask.dst.u.all & b->mask.dst.u.all },
904                       a->mask.dst.protonum & b->mask.dst.protonum } };
905
906         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
907 }
908
909 static inline int expect_matches(const struct ip_conntrack_expect *a,
910                                  const struct ip_conntrack_expect *b)
911 {
912         return a->master == b->master
913                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
914                 && ip_ct_tuple_equal(&a->mask, &b->mask);
915 }
916
917 /* Generally a bad idea to call this: could have matched already. */
918 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
919 {
920         struct ip_conntrack_expect *i;
921
922         write_lock_bh(&ip_conntrack_lock);
923         /* choose the oldest expectation to evict */
924         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
925                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
926                         ip_ct_unlink_expect(i);
927                         write_unlock_bh(&ip_conntrack_lock);
928                         ip_conntrack_expect_put(i);
929                         return;
930                 }
931         }
932         write_unlock_bh(&ip_conntrack_lock);
933 }
934
935 /* We don't increase the master conntrack refcount for non-fulfilled
936  * conntracks. During conntrack destruction, the expectations are
937  * always killed before the conntrack itself */
938 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
939 {
940         struct ip_conntrack_expect *new;
941
942         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
943         if (!new) {
944                 DEBUGP("expect_related: OOM allocating expect\n");
945                 return NULL;
946         }
947         new->master = me;
948         atomic_set(&new->use, 1);
949         return new;
950 }
951
952 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
953 {
954         if (atomic_dec_and_test(&exp->use))
955                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
956 }
957
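/* Link a new expectation into the global list and start its timeout;
   caller holds ip_conntrack_lock for writing. */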
958 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
959 {
960         atomic_inc(&exp->use);
961         exp->master->expecting++;
962         list_add(&exp->list, &ip_conntrack_expect_list);
963
964         init_timer(&exp->timeout);
965         exp->timeout.data = (unsigned long)exp;
966         exp->timeout.function = expectation_timed_out;
967         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
968         add_timer(&exp->timeout);
969
970         exp->id = ++ip_conntrack_expect_next_id;
971         atomic_inc(&exp->use);
972         CONNTRACK_STAT_INC(expect_create);
973 }
974
975 /* Race with expectations being used means we could have none to find; OK. */
976 static void evict_oldest_expect(struct ip_conntrack *master)
977 {
978         struct ip_conntrack_expect *i;
979
980         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
981                 if (i->master == master) {
982                         if (del_timer(&i->timeout)) {
983                                 ip_ct_unlink_expect(i);
984                                 ip_conntrack_expect_put(i);
985                         }
986                         break;
987                 }
988         }
989 }
990
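/* Restart an expectation's timeout; returns 0 if the timer had already
   fired (the expectation is dying). */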
991 static inline int refresh_timer(struct ip_conntrack_expect *i)
992 {
993         if (!del_timer(&i->timeout))
994                 return 0;
995
996         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
997         add_timer(&i->timeout);
998         return 1;
999 }
1000
1001 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1002 {
1003         struct ip_conntrack_expect *i;
1004         int ret;
1005
1006         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1007         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1008         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1009
1010         write_lock_bh(&ip_conntrack_lock);
1011         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1012                 if (expect_matches(i, expect)) {
1013                         /* Refresh timer: if it's dying, ignore. */
1014                         if (refresh_timer(i)) {
1015                                 ret = 0;
1016                                 goto out;
1017                         }
1018                 } else if (expect_clash(i, expect)) {
1019                         ret = -EBUSY;
1020                         goto out;
1021                 }
1022         }
1023
1024         /* Will be over limit? */
1025         if (expect->master->helper->max_expected && 
1026             expect->master->expecting >= expect->master->helper->max_expected)
1027                 evict_oldest_expect(expect->master);
1028
1029         ip_conntrack_expect_insert(expect);
1030         ip_conntrack_expect_event(IPEXP_NEW, expect);
1031         ret = 0;
1032 out:
1033         write_unlock_bh(&ip_conntrack_lock);
1034         return ret;
1035 }
1036
1037 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1038    implicitly racy: see __ip_conntrack_confirm */
1039 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1040                               const struct ip_conntrack_tuple *newreply)
1041 {
1042         write_lock_bh(&ip_conntrack_lock);
1043         /* Should be unconfirmed, so not in hash table yet */
1044         IP_NF_ASSERT(!is_confirmed(conntrack));
1045
1046         DEBUGP("Altering reply tuple of %p to ", conntrack);
1047         DUMP_TUPLE(newreply);
1048
1049         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1050         if (!conntrack->master && conntrack->expecting == 0)
1051                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1052         write_unlock_bh(&ip_conntrack_lock);
1053 }
1054
1055 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1056 {
1057         BUG_ON(me->timeout == 0);
1058         write_lock_bh(&ip_conntrack_lock);
1059         list_prepend(&helpers, me);
1060         write_unlock_bh(&ip_conntrack_lock);
1061
1062         return 0;
1063 }
1064
1065 struct ip_conntrack_helper *
1066 __ip_conntrack_helper_find_byname(const char *name)
1067 {
1068         struct ip_conntrack_helper *h;
1069
1070         list_for_each_entry(h, &helpers, list) {
1071                 if (!strcmp(h->name, name))
1072                         return h;
1073         }
1074
1075         return NULL;
1076 }
1077
1078 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1079                          const struct ip_conntrack_helper *me)
1080 {
1081         if (tuplehash_to_ctrack(i)->helper == me) {
1082                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1083                 tuplehash_to_ctrack(i)->helper = NULL;
1084         }
1085         return 0;
1086 }
1087
1088 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1089 {
1090         unsigned int i;
1091         struct ip_conntrack_expect *exp, *tmp;
1092
1093         /* Need write lock here, to delete helper. */
1094         write_lock_bh(&ip_conntrack_lock);
1095         LIST_DELETE(&helpers, me);
1096
1097         /* Get rid of expectations */
1098         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1099                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1100                         ip_ct_unlink_expect(exp);
1101                         ip_conntrack_expect_put(exp);
1102                 }
1103         }
1104         /* Get rid of expecteds, set helpers to NULL. */
1105         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1106         for (i = 0; i < ip_conntrack_htable_size; i++)
1107                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1108                             struct ip_conntrack_tuple_hash *, me);
1109         write_unlock_bh(&ip_conntrack_lock);
1110
1111         /* Someone could be still looking at the helper in a bh. */
1112         synchronize_net();
1113 }
1114
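/* Per-direction packet and byte accounting (compiled in only with
   CONFIG_IP_NF_CT_ACCT). */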
1115 static inline void ct_add_counters(struct ip_conntrack *ct,
1116                                    enum ip_conntrack_info ctinfo,
1117                                    const struct sk_buff *skb)
1118 {
1119 #ifdef CONFIG_IP_NF_CT_ACCT
1120         if (skb) {
1121                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1122                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1123                                         ntohs(skb->nh.iph->tot_len);
1124         }
1125 #endif
1126 }
1127
1128 /* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
1129 void ip_ct_refresh_acct(struct ip_conntrack *ct, 
1130                         enum ip_conntrack_info ctinfo,
1131                         const struct sk_buff *skb,
1132                         unsigned long extra_jiffies)
1133 {
1134         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1135
1136         /* If not in hash table, timer will not be active yet */
1137         if (!is_confirmed(ct)) {
1138                 ct->timeout.expires = extra_jiffies;
1139                 ct_add_counters(ct, ctinfo, skb);
1140         } else {
1141                 write_lock_bh(&ip_conntrack_lock);
1142                 /* Need del_timer for race avoidance (may already be dying). */
1143                 if (del_timer(&ct->timeout)) {
1144                         ct->timeout.expires = jiffies + extra_jiffies;
1145                         add_timer(&ct->timeout);
1146                         /* FIXME: We lose some REFRESH events if this function
1147                          * is called without an skb.  I'll fix this later -HW */
1148                         if (skb)
1149                                 ip_conntrack_event_cache(IPCT_REFRESH, skb);
1150                 }
1151                 ct_add_counters(ct, ctinfo, skb);
1152                 write_unlock_bh(&ip_conntrack_lock);
1153         }
1154 }
1155
1156 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1157     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1158 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1159  * in ip_conntrack_core, since we don't want the protocols to autoload
1160  * or depend on ctnetlink */
1161 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1162                                const struct ip_conntrack_tuple *tuple)
1163 {
1164         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1165                 &tuple->src.u.tcp.port);
1166         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1167                 &tuple->dst.u.tcp.port);
1168         return 0;
1169
1170 nfattr_failure:
1171         return -1;
1172 }
1173
1174 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1175                                struct ip_conntrack_tuple *t)
1176 {
1177         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1178                 return -EINVAL;
1179
1180         t->src.u.tcp.port =
1181                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1182         t->dst.u.tcp.port =
1183                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1184
1185         return 0;
1186 }
1187 #endif
1188
1189 /* Returns new sk_buff, or NULL */
1190 struct sk_buff *
1191 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1192 {
1193         skb_orphan(skb);
1194
1195         local_bh_disable(); 
1196         skb = ip_defrag(skb, user);
1197         local_bh_enable();
1198
1199         if (skb)
1200                 ip_send_check(skb->nh.iph);
1201         return skb;
1202 }
1203
1204 /* Used by ipt_REJECT. */
1205 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1206 {
1207         struct ip_conntrack *ct;
1208         enum ip_conntrack_info ctinfo;
1209
1210         /* This ICMP is in reverse direction to the packet which caused it */
1211         ct = ip_conntrack_get(skb, &ctinfo);
1212         
1213         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1214                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1215         else
1216                 ctinfo = IP_CT_RELATED;
1217
1218         /* Attach to new skbuff, and increment count */
1219         nskb->nfct = &ct->ct_general;
1220         nskb->nfctinfo = ctinfo;
1221         nf_conntrack_get(nskb->nfct);
1222 }
1223
1224 static inline int
1225 do_iter(const struct ip_conntrack_tuple_hash *i,
1226         int (*iter)(struct ip_conntrack *i, void *data),
1227         void *data)
1228 {
1229         return iter(tuplehash_to_ctrack(i), data);
1230 }
1231
1232 /* Bring out ya dead! */
1233 static struct ip_conntrack_tuple_hash *
1234 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1235                 void *data, unsigned int *bucket)
1236 {
1237         struct ip_conntrack_tuple_hash *h = NULL;
1238
1239         write_lock_bh(&ip_conntrack_lock);
1240         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1241                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1242                                 struct ip_conntrack_tuple_hash *, iter, data);
1243                 if (h)
1244                         break;
1245         }
1246         if (!h)
1247                 h = LIST_FIND_W(&unconfirmed, do_iter,
1248                                 struct ip_conntrack_tuple_hash *, iter, data);
1249         if (h)
1250                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1251         write_unlock_bh(&ip_conntrack_lock);
1252
1253         return h;
1254 }
1255
1256 void
1257 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1258 {
1259         struct ip_conntrack_tuple_hash *h;
1260         unsigned int bucket = 0;
1261
1262         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1263                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1264                 /* Time to push up daisies... */
1265                 if (del_timer(&ct->timeout))
1266                         death_by_timeout((unsigned long)ct);
1267                 /* ... else the timer will get him soon. */
1268
1269                 ip_conntrack_put(ct);
1270         }
1271 }
1272
1273 /* Fast function for those who don't want to parse /proc (and I don't
1274    blame them). */
1275 /* Reversing the socket's dst/src point of view gives us the reply
1276    mapping. */
1277 static int
1278 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1279 {
1280         struct inet_sock *inet = inet_sk(sk);
1281         struct ip_conntrack_tuple_hash *h;
1282         struct ip_conntrack_tuple tuple;
1283         
1284         IP_CT_TUPLE_U_BLANK(&tuple);
1285         tuple.src.ip = inet->rcv_saddr;
1286         tuple.src.u.tcp.port = inet->sport;
1287         tuple.dst.ip = inet->daddr;
1288         tuple.dst.u.tcp.port = inet->dport;
1289         tuple.dst.protonum = IPPROTO_TCP;
1290
1291         /* We only do TCP at the moment: is there a better way? */
1292         if (strcmp(sk->sk_prot->name, "TCP")) {
1293                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1294                 return -ENOPROTOOPT;
1295         }
1296
1297         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1298                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1299                        *len, sizeof(struct sockaddr_in));
1300                 return -EINVAL;
1301         }
1302
1303         h = ip_conntrack_find_get(&tuple, NULL);
1304         if (h) {
1305                 struct sockaddr_in sin;
1306                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1307
1308                 sin.sin_family = AF_INET;
1309                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1310                         .tuple.dst.u.tcp.port;
1311                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1312                         .tuple.dst.ip;
1313
1314                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1315                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1316                 ip_conntrack_put(ct);
1317                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1318                         return -EFAULT;
1319                 else
1320                         return 0;
1321         }
1322         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1323                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1324                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1325         return -ENOENT;
1326 }
1327
1328 static struct nf_sockopt_ops so_getorigdst = {
1329         .pf             = PF_INET,
1330         .get_optmin     = SO_ORIGINAL_DST,
1331         .get_optmax     = SO_ORIGINAL_DST+1,
1332         .get            = &getorigdst,
1333 };
1334
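/* Iterator that matches every conntrack; used to flush the whole table. */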
1335 static int kill_all(struct ip_conntrack *i, void *data)
1336 {
1337         return 1;
1338 }
1339
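/* Free the hash table, whichever way it was allocated (pages or vmalloc). */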
1340 static void free_conntrack_hash(void)
1341 {
1342         if (ip_conntrack_vmalloc)
1343                 vfree(ip_conntrack_hash);
1344         else
1345                 free_pages((unsigned long)ip_conntrack_hash, 
1346                            get_order(sizeof(struct list_head)
1347                                      * ip_conntrack_htable_size));
1348 }
1349
1350 void ip_conntrack_flush(void)
1351 {
1352         /* This makes sure all current packets have passed through
1353            the netfilter framework.  Roll on, two-stage module
1354            delete... */
1355         synchronize_net();
1356
1357         ip_ct_event_cache_flush();
1358  i_see_dead_people:
1359         ip_ct_iterate_cleanup(kill_all, NULL);
1360         if (atomic_read(&ip_conntrack_count) != 0) {
1361                 schedule();
1362                 goto i_see_dead_people;
1363         }
1364         /* wait until all references to ip_conntrack_untracked are dropped */
1365         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1366                 schedule();
1367 }
1368
1369 /* Mishearing the voices in his head, our hero wonders how he's
1370    supposed to kill the mall. */
1371 void ip_conntrack_cleanup(void)
1372 {
1373         ip_ct_attach = NULL;
1374         ip_conntrack_flush();
1375         kmem_cache_destroy(ip_conntrack_cachep);
1376         kmem_cache_destroy(ip_conntrack_expect_cachep);
1377         free_conntrack_hash();
1378         nf_unregister_sockopt(&so_getorigdst);
1379 }
1380
1381 static int hashsize;
1382 module_param(hashsize, int, 0400);
1383
1384 int __init ip_conntrack_init(void)
1385 {
1386         unsigned int i;
1387         int ret;
1388
1389         /* Idea from tcp.c: use 1/16384 of memory.  On i386, a 32MB
1390          * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
1391         if (hashsize) {
1392                 ip_conntrack_htable_size = hashsize;
1393         } else {
1394                 ip_conntrack_htable_size
1395                         = (((num_physpages << PAGE_SHIFT) / 16384)
1396                            / sizeof(struct list_head));
1397                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1398                         ip_conntrack_htable_size = 8192;
1399                 if (ip_conntrack_htable_size < 16)
1400                         ip_conntrack_htable_size = 16;
1401         }
1402         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1403
1404         printk("ip_conntrack version %s (%u buckets, %d max)"
1405                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1406                ip_conntrack_htable_size, ip_conntrack_max,
1407                sizeof(struct ip_conntrack));
1408
1409         ret = nf_register_sockopt(&so_getorigdst);
1410         if (ret != 0) {
1411                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1412                 return ret;
1413         }
1414
1415         /* AK: the hash table is twice as big as needed because it
1416            uses list_head.  It would be much nicer for the caches to use a
1417            single-pointer list head here. */
1418         ip_conntrack_vmalloc = 0; 
1419         ip_conntrack_hash 
1420                 =(void*)__get_free_pages(GFP_KERNEL, 
1421                                          get_order(sizeof(struct list_head)
1422                                                    *ip_conntrack_htable_size));
1423         if (!ip_conntrack_hash) { 
1424                 ip_conntrack_vmalloc = 1;
1425                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1426                 ip_conntrack_hash = vmalloc(sizeof(struct list_head)
1427                                             * ip_conntrack_htable_size);
1428         }
1429         if (!ip_conntrack_hash) {
1430                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1431                 goto err_unreg_sockopt;
1432         }
1433
1434         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1435                                                 sizeof(struct ip_conntrack), 0,
1436                                                 0, NULL, NULL);
1437         if (!ip_conntrack_cachep) {
1438                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1439                 goto err_free_hash;
1440         }
1441
1442         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1443                                         sizeof(struct ip_conntrack_expect),
1444                                         0, 0, NULL, NULL);
1445         if (!ip_conntrack_expect_cachep) {
1446                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1447                 goto err_free_conntrack_slab;
1448         }
1449
1450         /* Don't NEED the lock here, but good form anyway. */
1451         write_lock_bh(&ip_conntrack_lock);
1452         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1453                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1454         /* Sew in builtin protocols. */
1455         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1456         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1457         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1458         write_unlock_bh(&ip_conntrack_lock);
1459
1460         for (i = 0; i < ip_conntrack_htable_size; i++)
1461                 INIT_LIST_HEAD(&ip_conntrack_hash[i]);
1462
1463         /* For use by ipt_REJECT */
1464         ip_ct_attach = ip_conntrack_attach;
1465
1466         /* Set up fake conntrack:
1467             - to never be deleted, not in any hashes */
1468         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1469         /*  - and make it look like a confirmed connection */
1470         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1471
1472         return ret;
1473
1474 err_free_conntrack_slab:
1475         kmem_cache_destroy(ip_conntrack_cachep);
1476 err_free_hash:
1477         free_conntrack_hash();
1478 err_unreg_sockopt:
1479         nf_unregister_sockopt(&so_getorigdst);
1480
1481         return -ENOMEM;
1482 }