net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42    registrations, conntrack timers*/
43 #include <linux/netfilter_ipv4/ip_conntrack.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
47
48 #define IP_CONNTRACK_VERSION    "2.4"
49
50 #if 0
51 #define DEBUGP printk
52 #else
53 #define DEBUGP(format, args...)
54 #endif
55
56 DEFINE_RWLOCK(ip_conntrack_lock);
57
58 /* ip_conntrack_standalone needs this */
59 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
60
61 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62 LIST_HEAD(ip_conntrack_expect_list);
63 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
64 static LIST_HEAD(helpers);
65 unsigned int ip_conntrack_htable_size __read_mostly = 0;
66 int ip_conntrack_max __read_mostly;
67 struct list_head *ip_conntrack_hash __read_mostly;
68 static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69 static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
70 struct ip_conntrack ip_conntrack_untracked;
71 unsigned int ip_ct_log_invalid __read_mostly;
72 static LIST_HEAD(unconfirmed);
73 static int ip_conntrack_vmalloc __read_mostly;
74
75 static unsigned int ip_conntrack_next_id;
76 static unsigned int ip_conntrack_expect_next_id;
77 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
78 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
80
81 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
82
83 /* deliver cached events and clear cache entry - must be called with locally
84  * disabled softirqs */
85 static inline void
86 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
87 {
88         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
89         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
90                 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
91                                     ecache->ct);
92         ecache->events = 0;
93         ip_conntrack_put(ecache->ct);
94         ecache->ct = NULL;
95 }
96
97 /* Deliver all cached events for a particular conntrack. This is called
98  * by code prior to async packet handling or freeing the skb */
99 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
100 {
101         struct ip_conntrack_ecache *ecache;
102
103         local_bh_disable();
104         ecache = &__get_cpu_var(ip_conntrack_ecache);
105         if (ecache->ct == ct)
106                 __ip_ct_deliver_cached_events(ecache);
107         local_bh_enable();
108 }
109
110 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
111 {
112         struct ip_conntrack_ecache *ecache;
113
114         /* take care of delivering potentially old events */
115         ecache = &__get_cpu_var(ip_conntrack_ecache);
116         BUG_ON(ecache->ct == ct);
117         if (ecache->ct)
118                 __ip_ct_deliver_cached_events(ecache);
119         /* initialize for this conntrack/packet */
120         ecache->ct = ct;
121         nf_conntrack_get(&ct->ct_general);
122 }
123
124 /* flush the event cache - touches other CPUs' data and must not be called while
125  * packets are still passing through the code */
126 static void ip_ct_event_cache_flush(void)
127 {
128         struct ip_conntrack_ecache *ecache;
129         int cpu;
130
131         for_each_possible_cpu(cpu) {
132                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133                 if (ecache->ct)
134                         ip_conntrack_put(ecache->ct);
135         }
136 }
137 #else
138 static inline void ip_ct_event_cache_flush(void) {}
139 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
140
141 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
142
143 static int ip_conntrack_hash_rnd_initted;
144 static unsigned int ip_conntrack_hash_rnd;
145
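/* Hash a tuple into the main table: jhash over (source IP, destination IP
 * XOR protocol number, source/destination ports), mixed with a boot-time
 * random seed and reduced modulo the table size.  The random seed makes the
 * bucket layout hard for an attacker to predict and flood. */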
146 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147                             unsigned int size, unsigned int rnd)
148 {
149         return (jhash_3words((__force u32)tuple->src.ip,
150                              ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
151                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
152                              rnd) % size);
153 }
154
155 static u_int32_t
156 hash_conntrack(const struct ip_conntrack_tuple *tuple)
157 {
158         return __hash_conntrack(tuple, ip_conntrack_htable_size,
159                                 ip_conntrack_hash_rnd);
160 }
161
162 int
163 ip_ct_get_tuple(const struct iphdr *iph,
164                 const struct sk_buff *skb,
165                 unsigned int dataoff,
166                 struct ip_conntrack_tuple *tuple,
167                 const struct ip_conntrack_protocol *protocol)
168 {
169         /* Never happens: packets are defragmented before conntrack sees them. */
170         if (iph->frag_off & htons(IP_OFFSET)) {
171                 printk("ip_conntrack_core: Frag of proto %u.\n",
172                        iph->protocol);
173                 return 0;
174         }
175
176         tuple->src.ip = iph->saddr;
177         tuple->dst.ip = iph->daddr;
178         tuple->dst.protonum = iph->protocol;
179         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
180
181         return protocol->pkt_to_tuple(skb, dataoff, tuple);
182 }
183
184 int
185 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186                    const struct ip_conntrack_tuple *orig,
187                    const struct ip_conntrack_protocol *protocol)
188 {
189         inverse->src.ip = orig->dst.ip;
190         inverse->dst.ip = orig->src.ip;
191         inverse->dst.protonum = orig->dst.protonum;
192         inverse->dst.dir = !orig->dst.dir;
193
194         return protocol->invert_tuple(inverse, orig);
195 }
196
197
198 /* ip_conntrack_expect helper functions */
199 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
200 {
201         IP_NF_ASSERT(!timer_pending(&exp->timeout));
202         list_del(&exp->list);
203         CONNTRACK_STAT_INC(expect_delete);
204         exp->master->expecting--;
205         ip_conntrack_expect_put(exp);
206 }
207
208 static void expectation_timed_out(unsigned long ul_expect)
209 {
210         struct ip_conntrack_expect *exp = (void *)ul_expect;
211
212         write_lock_bh(&ip_conntrack_lock);
213         ip_ct_unlink_expect(exp);
214         write_unlock_bh(&ip_conntrack_lock);
215         ip_conntrack_expect_put(exp);
216 }
217
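/* Low-level lookup: walks the global expectation list and returns the first
 * entry whose tuple matches under its mask.  Caller must hold
 * ip_conntrack_lock; no reference is taken. */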
218 struct ip_conntrack_expect *
219 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
220 {
221         struct ip_conntrack_expect *i;
222
223         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
224                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
225                         return i;
226         }
227         return NULL;
228 }
229
230 /* Just find an expectation corresponding to a tuple. */
231 struct ip_conntrack_expect *
232 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
233 {
234         struct ip_conntrack_expect *i;
235
236         read_lock_bh(&ip_conntrack_lock);
237         i = __ip_conntrack_expect_find(tuple);
238         if (i)
239                 atomic_inc(&i->use);
240         read_unlock_bh(&ip_conntrack_lock);
241
242         return i;
243 }
244
245 /* If an expectation for this connection is found, it is removed from the
246  * global list (unless it is permanent) and returned. */
247 static struct ip_conntrack_expect *
248 find_expectation(const struct ip_conntrack_tuple *tuple)
249 {
250         struct ip_conntrack_expect *i;
251
252         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253                 /* If master is not in hash table yet (ie. packet hasn't left
254                    this machine yet), how can other end know about expected?
255                    Hence these are not the droids you are looking for (if
256                    master ct never got confirmed, we'd hold a reference to it
257                    and weird things would happen to future packets). */
258                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
259                     && is_confirmed(i->master)) {
260                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
261                                 atomic_inc(&i->use);
262                                 return i;
263                         } else if (del_timer(&i->timeout)) {
264                                 ip_ct_unlink_expect(i);
265                                 return i;
266                         }
267                 }
268         }
269         return NULL;
270 }
271
272 /* delete all expectations for this conntrack */
273 void ip_ct_remove_expectations(struct ip_conntrack *ct)
274 {
275         struct ip_conntrack_expect *i, *tmp;
276
277         /* Optimization: most connections never expect any others. */
278         if (ct->expecting == 0)
279                 return;
280
281         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282                 if (i->master == ct && del_timer(&i->timeout)) {
283                         ip_ct_unlink_expect(i);
284                         ip_conntrack_expect_put(i);
285                 }
286         }
287 }
288
289 static void
290 clean_from_lists(struct ip_conntrack *ct)
291 {
292         DEBUGP("clean_from_lists(%p)\n", ct);
293         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294         list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
295
296         /* Destroy all pending expectations */
297         ip_ct_remove_expectations(ct);
298 }
299
300 static void
301 destroy_conntrack(struct nf_conntrack *nfct)
302 {
303         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304         struct ip_conntrack_protocol *proto;
305         struct ip_conntrack_helper *helper;
306         typeof(ip_conntrack_destroyed) destroyed;
307
308         DEBUGP("destroy_conntrack(%p)\n", ct);
309         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
310         IP_NF_ASSERT(!timer_pending(&ct->timeout));
311
312         ip_conntrack_event(IPCT_DESTROY, ct);
313         set_bit(IPS_DYING_BIT, &ct->status);
314
315         helper = ct->helper;
316         if (helper && helper->destroy)
317                 helper->destroy(ct);
318
319         /* To make sure we don't get any weird locking issues here:
320          * destroy_conntrack() MUST NOT be called with a write lock
321          * to ip_conntrack_lock!!! -HW */
322         rcu_read_lock();
323         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
324         if (proto && proto->destroy)
325                 proto->destroy(ct);
326
327         destroyed = rcu_dereference(ip_conntrack_destroyed);
328         if (destroyed)
329                 destroyed(ct);
330
331         rcu_read_unlock();
332
333         write_lock_bh(&ip_conntrack_lock);
334         /* Expectations will have been removed in clean_from_lists,
335          * except TFTP can create an expectation on the first packet,
336          * before connection is in the list, so we need to clean here,
337          * too. */
338         ip_ct_remove_expectations(ct);
339
340         /* We overload first tuple to link into unconfirmed list. */
341         if (!is_confirmed(ct)) {
342                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
343                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
344         }
345
346         CONNTRACK_STAT_INC(delete);
347         write_unlock_bh(&ip_conntrack_lock);
348
349         if (ct->master)
350                 ip_conntrack_put(ct->master);
351
352         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
353         ip_conntrack_free(ct);
354 }
355
356 static void death_by_timeout(unsigned long ul_conntrack)
357 {
358         struct ip_conntrack *ct = (void *)ul_conntrack;
359
360         write_lock_bh(&ip_conntrack_lock);
361         /* Inside lock so preempt is disabled on module removal path.
362          * Otherwise we can get spurious warnings. */
363         CONNTRACK_STAT_INC(delete_list);
364         clean_from_lists(ct);
365         write_unlock_bh(&ip_conntrack_lock);
366         ip_conntrack_put(ct);
367 }
368
369 struct ip_conntrack_tuple_hash *
370 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
371                     const struct ip_conntrack *ignored_conntrack)
372 {
373         struct ip_conntrack_tuple_hash *h;
374         unsigned int hash = hash_conntrack(tuple);
375
376         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
377                 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
378                     ip_ct_tuple_equal(tuple, &h->tuple)) {
379                         CONNTRACK_STAT_INC(found);
380                         return h;
381                 }
382                 CONNTRACK_STAT_INC(searched);
383         }
384
385         return NULL;
386 }
387
388 /* Find a connection corresponding to a tuple. */
389 struct ip_conntrack_tuple_hash *
390 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
391                       const struct ip_conntrack *ignored_conntrack)
392 {
393         struct ip_conntrack_tuple_hash *h;
394
395         read_lock_bh(&ip_conntrack_lock);
396         h = __ip_conntrack_find(tuple, ignored_conntrack);
397         if (h)
398                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
399         read_unlock_bh(&ip_conntrack_lock);
400
401         return h;
402 }
403
404 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
405                                         unsigned int hash,
406                                         unsigned int repl_hash)
407 {
408         ct->id = ++ip_conntrack_next_id;
409         list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
410                  &ip_conntrack_hash[hash]);
411         list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
412                  &ip_conntrack_hash[repl_hash]);
413 }
414
415 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
416 {
417         unsigned int hash, repl_hash;
418
419         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
420         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
421
422         write_lock_bh(&ip_conntrack_lock);
423         __ip_conntrack_hash_insert(ct, hash, repl_hash);
424         write_unlock_bh(&ip_conntrack_lock);
425 }
426
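/* Confirmation runs on the last netfilter hooks (ip_conntrack_standalone.c
 * registers the confirm hook at LOCAL_IN and POST_ROUTING), i.e. once the
 * packet has passed every other hook and NAT has settled the final tuples. */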
427 /* Confirm a connection given skb; places it in hash table */
428 int
429 __ip_conntrack_confirm(struct sk_buff **pskb)
430 {
431         unsigned int hash, repl_hash;
432         struct ip_conntrack_tuple_hash *h;
433         struct ip_conntrack *ct;
434         enum ip_conntrack_info ctinfo;
435
436         ct = ip_conntrack_get(*pskb, &ctinfo);
437
438         /* ipt_REJECT uses ip_conntrack_attach to attach related
439            ICMP/TCP RST packets in other direction.  Actual packet
440            which created connection will be IP_CT_NEW or for an
441            expected connection, IP_CT_RELATED. */
442         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
443                 return NF_ACCEPT;
444
445         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
446         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
447
448         /* We're not in hash table, and we refuse to set up related
449            connections for unconfirmed conns.  But packet copies and
450            REJECT will give spurious warnings here. */
451         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
452
453         /* No external references means no one else could have
454            confirmed us. */
455         IP_NF_ASSERT(!is_confirmed(ct));
456         DEBUGP("Confirming conntrack %p\n", ct);
457
458         write_lock_bh(&ip_conntrack_lock);
459
460         /* See if there's one in the list already, including reverse:
461            NAT could have grabbed it without realizing, since we're
462            not in the hash.  If there is, we lost the race. */
463         list_for_each_entry(h, &ip_conntrack_hash[hash], list)
464                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
465                                       &h->tuple))
466                         goto out;
467         list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
468                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
469                                       &h->tuple))
470                         goto out;
471
472         /* Remove from unconfirmed list */
473         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
474
475         __ip_conntrack_hash_insert(ct, hash, repl_hash);
476         /* Timer relative to confirmation time, not original
477            setting time, otherwise we'd get timer wrap in
478            weird delay cases. */
479         ct->timeout.expires += jiffies;
480         add_timer(&ct->timeout);
481         atomic_inc(&ct->ct_general.use);
482         set_bit(IPS_CONFIRMED_BIT, &ct->status);
483         CONNTRACK_STAT_INC(insert);
484         write_unlock_bh(&ip_conntrack_lock);
485         if (ct->helper)
486                 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
487 #ifdef CONFIG_IP_NF_NAT_NEEDED
488         if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
489             test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
490                 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
491 #endif
492         ip_conntrack_event_cache(master_ct(ct) ?
493                                  IPCT_RELATED : IPCT_NEW, *pskb);
494
495         return NF_ACCEPT;
496
497 out:
498         CONNTRACK_STAT_INC(insert_failed);
499         write_unlock_bh(&ip_conntrack_lock);
500         return NF_DROP;
501 }
502
503 /* Returns true if a connection corresponds to the tuple (required
504    for NAT). */
505 int
506 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
507                          const struct ip_conntrack *ignored_conntrack)
508 {
509         struct ip_conntrack_tuple_hash *h;
510
511         read_lock_bh(&ip_conntrack_lock);
512         h = __ip_conntrack_find(tuple, ignored_conntrack);
513         read_unlock_bh(&ip_conntrack_lock);
514
515         return h != NULL;
516 }
517
518 /* There's a small race here where we may free a just-assured
519    connection.  Too bad: we're in trouble anyway. */
520 static int early_drop(struct list_head *chain)
521 {
522         /* Traverse backwards: gives us oldest, which is roughly LRU */
523         struct ip_conntrack_tuple_hash *h;
524         struct ip_conntrack *ct = NULL, *tmp;
525         int dropped = 0;
526
527         read_lock_bh(&ip_conntrack_lock);
528         list_for_each_entry_reverse(h, chain, list) {
529                 tmp = tuplehash_to_ctrack(h);
530                 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
531                         ct = tmp;
532                         atomic_inc(&ct->ct_general.use);
533                         break;
534                 }
535         }
536         read_unlock_bh(&ip_conntrack_lock);
537
538         if (!ct)
539                 return dropped;
540
541         if (del_timer(&ct->timeout)) {
542                 death_by_timeout((unsigned long)ct);
543                 dropped = 1;
544                 CONNTRACK_STAT_INC_ATOMIC(early_drop);
545         }
546         ip_conntrack_put(ct);
547         return dropped;
548 }
549
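/* Caller must hold ip_conntrack_lock.  Returns the first registered helper
 * whose tuple/mask matches the given tuple, or NULL. */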
550 static struct ip_conntrack_helper *
551 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
552 {
553         struct ip_conntrack_helper *h;
554
555         list_for_each_entry(h, &helpers, list) {
556                 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
557                         return h;
558         }
559         return NULL;
560 }
561
562 struct ip_conntrack_helper *
563 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
564 {
565         struct ip_conntrack_helper *helper;
566
567         /* need ip_conntrack_lock to assure that helper exists until
568          * try_module_get() is called */
569         read_lock_bh(&ip_conntrack_lock);
570
571         helper = __ip_conntrack_helper_find(tuple);
572         if (helper) {
573                 /* need to increase module usage count to assure helper will
574                  * not go away while the caller is e.g. busy putting a
575                  * conntrack in the hash that uses the helper */
576                 if (!try_module_get(helper->me))
577                         helper = NULL;
578         }
579
580         read_unlock_bh(&ip_conntrack_lock);
581
582         return helper;
583 }
584
585 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
586 {
587         module_put(helper->me);
588 }
589
590 struct ip_conntrack_protocol *
591 __ip_conntrack_proto_find(u_int8_t protocol)
592 {
593         return ip_ct_protos[protocol];
594 }
595
596 /* this is guaranteed to always return a valid protocol helper, since
597  * it falls back to generic_protocol */
598 struct ip_conntrack_protocol *
599 ip_conntrack_proto_find_get(u_int8_t protocol)
600 {
601         struct ip_conntrack_protocol *p;
602
603         rcu_read_lock();
604         p = __ip_conntrack_proto_find(protocol);
605         if (p) {
606                 if (!try_module_get(p->me))
607                         p = &ip_conntrack_generic_protocol;
608         }
609         rcu_read_unlock();
610
611         return p;
612 }
613
614 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
615 {
616         module_put(p->me);
617 }
618
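/* Allocate a new conntrack from the slab cache.  Enforces ip_conntrack_max
 * by trying to early-drop an unassured entry from the target hash chain,
 * seeds the hash random value on first use, and returns the entry with a
 * single reference and its timer initialised but not yet armed (that
 * happens at confirmation time). */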
619 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
620                                         struct ip_conntrack_tuple *repl)
621 {
622         struct ip_conntrack *conntrack;
623
624         if (!ip_conntrack_hash_rnd_initted) {
625                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
626                 ip_conntrack_hash_rnd_initted = 1;
627         }
628
629         /* We don't want any race condition at early drop stage */
630         atomic_inc(&ip_conntrack_count);
631
632         if (ip_conntrack_max
633             && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
634                 unsigned int hash = hash_conntrack(orig);
635                 /* Try dropping from this hash chain. */
636                 if (!early_drop(&ip_conntrack_hash[hash])) {
637                         atomic_dec(&ip_conntrack_count);
638                         if (net_ratelimit())
639                                 printk(KERN_WARNING
640                                        "ip_conntrack: table full, dropping"
641                                        " packet.\n");
642                         return ERR_PTR(-ENOMEM);
643                 }
644         }
645
646         conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC);
647         if (!conntrack) {
648                 DEBUGP("Can't allocate conntrack.\n");
649                 atomic_dec(&ip_conntrack_count);
650                 return ERR_PTR(-ENOMEM);
651         }
652
653         atomic_set(&conntrack->ct_general.use, 1);
654         conntrack->ct_general.destroy = destroy_conntrack;
655         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
656         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
657         /* Don't set timer yet: wait for confirmation */
658         init_timer(&conntrack->timeout);
659         conntrack->timeout.data = (unsigned long)conntrack;
660         conntrack->timeout.function = death_by_timeout;
661
662         return conntrack;
663 }
664
665 void
666 ip_conntrack_free(struct ip_conntrack *conntrack)
667 {
668         atomic_dec(&ip_conntrack_count);
669         kmem_cache_free(ip_conntrack_cachep, conntrack);
670 }
671
672 /* Allocate a new conntrack: we return -ENOMEM if classification
673  * failed due to stress.   Otherwise it really is unclassifiable */
674 static struct ip_conntrack_tuple_hash *
675 init_conntrack(struct ip_conntrack_tuple *tuple,
676                struct ip_conntrack_protocol *protocol,
677                struct sk_buff *skb)
678 {
679         struct ip_conntrack *conntrack;
680         struct ip_conntrack_tuple repl_tuple;
681         struct ip_conntrack_expect *exp;
682
683         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
684                 DEBUGP("Can't invert tuple.\n");
685                 return NULL;
686         }
687
688         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
689         if (conntrack == NULL || IS_ERR(conntrack))
690                 return (struct ip_conntrack_tuple_hash *)conntrack;
691
692         if (!protocol->new(conntrack, skb)) {
693                 ip_conntrack_free(conntrack);
694                 return NULL;
695         }
696
697         write_lock_bh(&ip_conntrack_lock);
698         exp = find_expectation(tuple);
699
700         if (exp) {
701                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
702                         conntrack, exp);
703                 /* Welcome, Mr. Bond.  We've been expecting you... */
704                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
705                 conntrack->master = exp->master;
706 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
707                 conntrack->mark = exp->master->mark;
708 #endif
709 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
710     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
711                 /* this is ugly, but there is no better place to put it */
712                 conntrack->nat.masq_index = exp->master->nat.masq_index;
713 #endif
714 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
715                 conntrack->secmark = exp->master->secmark;
716 #endif
717                 nf_conntrack_get(&conntrack->master->ct_general);
718                 CONNTRACK_STAT_INC(expect_new);
719         } else {
720                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
721
722                 CONNTRACK_STAT_INC(new);
723         }
724
725         /* Overload tuple linked list to put us in unconfirmed list. */
726         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
727
728         write_unlock_bh(&ip_conntrack_lock);
729
730         if (exp) {
731                 if (exp->expectfn)
732                         exp->expectfn(conntrack, exp);
733                 ip_conntrack_expect_put(exp);
734         }
735
736         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
737 }
738
739 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
740 static inline struct ip_conntrack *
741 resolve_normal_ct(struct sk_buff *skb,
742                   struct ip_conntrack_protocol *proto,
743                   int *set_reply,
744                   unsigned int hooknum,
745                   enum ip_conntrack_info *ctinfo)
746 {
747         struct ip_conntrack_tuple tuple;
748         struct ip_conntrack_tuple_hash *h;
749         struct ip_conntrack *ct;
750
751         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
752
753         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
754                                 &tuple,proto))
755                 return NULL;
756
757         /* look for tuple match */
758         h = ip_conntrack_find_get(&tuple, NULL);
759         if (!h) {
760                 h = init_conntrack(&tuple, proto, skb);
761                 if (!h)
762                         return NULL;
763                 if (IS_ERR(h))
764                         return (void *)h;
765         }
766         ct = tuplehash_to_ctrack(h);
767
768         /* It exists; we have a (non-exclusive) reference. */
769         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
770                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
771                 /* Please set reply bit if this packet OK */
772                 *set_reply = 1;
773         } else {
774                 /* Once we've had two way comms, always ESTABLISHED. */
775                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
776                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
777                                ct);
778                         *ctinfo = IP_CT_ESTABLISHED;
779                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
780                         DEBUGP("ip_conntrack_in: related packet for %p\n",
781                                ct);
782                         *ctinfo = IP_CT_RELATED;
783                 } else {
784                         DEBUGP("ip_conntrack_in: new packet for %p\n",
785                                ct);
786                         *ctinfo = IP_CT_NEW;
787                 }
788                 *set_reply = 0;
789         }
790         skb->nfct = &ct->ct_general;
791         skb->nfctinfo = *ctinfo;
792         return ct;
793 }
794
795 /* Netfilter hook itself. */
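/* Flow: skip packets that already carry a conntrack reference (loopback or
 * untracked), refuse IP fragments (defragmentation runs earlier), look up
 * the L4 protocol handler, let its error() routine veto malformed packets,
 * resolve or create the conntrack via resolve_normal_ct(), run the per-
 * protocol packet() state machine, and finally cache a status event when
 * the first reply is seen. */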
796 unsigned int ip_conntrack_in(unsigned int hooknum,
797                              struct sk_buff **pskb,
798                              const struct net_device *in,
799                              const struct net_device *out,
800                              int (*okfn)(struct sk_buff *))
801 {
802         struct ip_conntrack *ct;
803         enum ip_conntrack_info ctinfo;
804         struct ip_conntrack_protocol *proto;
805         int set_reply = 0;
806         int ret;
807
808         /* Previously seen (loopback or untracked)?  Ignore. */
809         if ((*pskb)->nfct) {
810                 CONNTRACK_STAT_INC_ATOMIC(ignore);
811                 return NF_ACCEPT;
812         }
813
814         /* Never happens: fragments are reassembled before conntrack sees them. */
815         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
816                 if (net_ratelimit()) {
817                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
818                                (*pskb)->nh.iph->protocol, hooknum);
819                 }
820                 return NF_DROP;
821         }
822
823 /* Doesn't cover locally-generated broadcast, so not worth it. */
824 #if 0
825         /* Ignore broadcast: no `connection'. */
826         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
827                 printk("Broadcast packet!\n");
828                 return NF_ACCEPT;
829         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
830                    == htonl(0x000000FF)) {
831                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
832                        NIPQUAD((*pskb)->nh.iph->saddr),
833                        NIPQUAD((*pskb)->nh.iph->daddr),
834                        (*pskb)->sk, (*pskb)->pkt_type);
835         }
836 #endif
837
838         /* rcu_read_lock()ed by nf_hook_slow */
839         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
840
841         /* It may be a special packet (error, unclean...); the
842          * inverse of the return code tells the netfilter
843          * core what to do with the packet. */
844         if (proto->error != NULL
845             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
846                 CONNTRACK_STAT_INC_ATOMIC(error);
847                 CONNTRACK_STAT_INC_ATOMIC(invalid);
848                 return -ret;
849         }
850
851         if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
852                 /* Not valid part of a connection */
853                 CONNTRACK_STAT_INC_ATOMIC(invalid);
854                 return NF_ACCEPT;
855         }
856
857         if (IS_ERR(ct)) {
858                 /* Too stressed to deal. */
859                 CONNTRACK_STAT_INC_ATOMIC(drop);
860                 return NF_DROP;
861         }
862
863         IP_NF_ASSERT((*pskb)->nfct);
864
865         ret = proto->packet(ct, *pskb, ctinfo);
866         if (ret < 0) {
867                 /* Invalid: inverse of the return code tells
868                  * the netfilter core what to do */
869                 nf_conntrack_put((*pskb)->nfct);
870                 (*pskb)->nfct = NULL;
871                 CONNTRACK_STAT_INC_ATOMIC(invalid);
872                 return -ret;
873         }
874
875         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
876                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
877
878         return ret;
879 }
880
881 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
882                    const struct ip_conntrack_tuple *orig)
883 {
884         struct ip_conntrack_protocol *proto;
885         int ret;
886
887         rcu_read_lock();
888         proto = __ip_conntrack_proto_find(orig->dst.protonum);
889         ret = ip_ct_invert_tuple(inverse, orig, proto);
890         rcu_read_unlock();
891
892         return ret;
893 }
894
895 /* Would two expected things clash? */
896 static inline int expect_clash(const struct ip_conntrack_expect *a,
897                                const struct ip_conntrack_expect *b)
898 {
899         /* If the parts covered by the intersection of the masks are equal,
900            the two expectations clash. */
901         struct ip_conntrack_tuple intersect_mask
902                 = { { a->mask.src.ip & b->mask.src.ip,
903                       { a->mask.src.u.all & b->mask.src.u.all } },
904                     { a->mask.dst.ip & b->mask.dst.ip,
905                       { a->mask.dst.u.all & b->mask.dst.u.all },
906                       a->mask.dst.protonum & b->mask.dst.protonum } };
907
908         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
909 }
910
911 static inline int expect_matches(const struct ip_conntrack_expect *a,
912                                  const struct ip_conntrack_expect *b)
913 {
914         return a->master == b->master
915                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
916                 && ip_ct_tuple_equal(&a->mask, &b->mask);
917 }
918
919 /* Generally a bad idea to call this: could have matched already. */
920 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
921 {
922         struct ip_conntrack_expect *i;
923
924         write_lock_bh(&ip_conntrack_lock);
925         /* choose the oldest expectation to evict */
926         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
927                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
928                         ip_ct_unlink_expect(i);
929                         write_unlock_bh(&ip_conntrack_lock);
930                         ip_conntrack_expect_put(i);
931                         return;
932                 }
933         }
934         write_unlock_bh(&ip_conntrack_lock);
935 }
936
937 /* We don't take a reference on the master conntrack for unfulfilled
938  * expectations.  During conntrack destruction, the expectations are
939  * always killed before the conntrack itself. */
940 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
941 {
942         struct ip_conntrack_expect *new;
943
944         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
945         if (!new) {
946                 DEBUGP("expect_related: OOM allocating expect\n");
947                 return NULL;
948         }
949         new->master = me;
950         atomic_set(&new->use, 1);
951         return new;
952 }
953
954 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
955 {
956         if (atomic_dec_and_test(&exp->use))
957                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
958 }
959
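/* Caller holds ip_conntrack_lock (write).  Takes one reference for the
 * global list and one for the pending timer, bumps the master's expecting
 * count and arms the timeout taken from the master's helper. */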
960 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
961 {
962         atomic_inc(&exp->use);
963         exp->master->expecting++;
964         list_add(&exp->list, &ip_conntrack_expect_list);
965
966         init_timer(&exp->timeout);
967         exp->timeout.data = (unsigned long)exp;
968         exp->timeout.function = expectation_timed_out;
969         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
970         add_timer(&exp->timeout);
971
972         exp->id = ++ip_conntrack_expect_next_id;
973         atomic_inc(&exp->use);
974         CONNTRACK_STAT_INC(expect_create);
975 }
976
977 /* Race with expectations being used means we could have none to find; OK. */
978 static void evict_oldest_expect(struct ip_conntrack *master)
979 {
980         struct ip_conntrack_expect *i;
981
982         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
983                 if (i->master == master) {
984                         if (del_timer(&i->timeout)) {
985                                 ip_ct_unlink_expect(i);
986                                 ip_conntrack_expect_put(i);
987                         }
988                         break;
989                 }
990         }
991 }
992
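/* Restart an expectation's timeout.  Returns 0 if the timer had already
 * fired or is being removed (the expectation is dying), 1 on success. */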
993 static inline int refresh_timer(struct ip_conntrack_expect *i)
994 {
995         if (!del_timer(&i->timeout))
996                 return 0;
997
998         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
999         add_timer(&i->timeout);
1000         return 1;
1001 }
1002
1003 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1004 {
1005         struct ip_conntrack_expect *i;
1006         int ret;
1007
1008         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1009         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1010         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1011
1012         write_lock_bh(&ip_conntrack_lock);
1013         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1014                 if (expect_matches(i, expect)) {
1015                         /* Refresh timer: if it's dying, ignore.. */
1016                         if (refresh_timer(i)) {
1017                                 ret = 0;
1018                                 goto out;
1019                         }
1020                 } else if (expect_clash(i, expect)) {
1021                         ret = -EBUSY;
1022                         goto out;
1023                 }
1024         }
1025
1026         /* Will be over limit? */
1027         if (expect->master->helper->max_expected &&
1028             expect->master->expecting >= expect->master->helper->max_expected)
1029                 evict_oldest_expect(expect->master);
1030
1031         ip_conntrack_expect_insert(expect);
1032         ip_conntrack_expect_event(IPEXP_NEW, expect);
1033         ret = 0;
1034 out:
1035         write_unlock_bh(&ip_conntrack_lock);
1036         return ret;
1037 }
1038
1039 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1040    implicitly racy: see __ip_conntrack_confirm */
1041 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1042                               const struct ip_conntrack_tuple *newreply)
1043 {
1044         write_lock_bh(&ip_conntrack_lock);
1045         /* Should be unconfirmed, so not in hash table yet */
1046         IP_NF_ASSERT(!is_confirmed(conntrack));
1047
1048         DEBUGP("Altering reply tuple of %p to ", conntrack);
1049         DUMP_TUPLE(newreply);
1050
1051         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1052         if (!conntrack->master && conntrack->expecting == 0)
1053                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1054         write_unlock_bh(&ip_conntrack_lock);
1055 }
1056
1057 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1058 {
1059         BUG_ON(me->timeout == 0);
1060         write_lock_bh(&ip_conntrack_lock);
1061         list_add(&me->list, &helpers);
1062         write_unlock_bh(&ip_conntrack_lock);
1063
1064         return 0;
1065 }
1066
1067 struct ip_conntrack_helper *
1068 __ip_conntrack_helper_find_byname(const char *name)
1069 {
1070         struct ip_conntrack_helper *h;
1071
1072         list_for_each_entry(h, &helpers, list) {
1073                 if (!strcmp(h->name, name))
1074                         return h;
1075         }
1076
1077         return NULL;
1078 }
1079
1080 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1081                           const struct ip_conntrack_helper *me)
1082 {
1083         if (tuplehash_to_ctrack(i)->helper == me) {
1084                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1085                 tuplehash_to_ctrack(i)->helper = NULL;
1086         }
1087 }
1088
1089 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1090 {
1091         unsigned int i;
1092         struct ip_conntrack_tuple_hash *h;
1093         struct ip_conntrack_expect *exp, *tmp;
1094
1095         /* Need write lock here, to delete helper. */
1096         write_lock_bh(&ip_conntrack_lock);
1097         list_del(&me->list);
1098
1099         /* Get rid of expectations */
1100         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1101                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1102                         ip_ct_unlink_expect(exp);
1103                         ip_conntrack_expect_put(exp);
1104                 }
1105         }
1106         /* Expectations are gone; now clear the helper pointer everywhere. */
1107         list_for_each_entry(h, &unconfirmed, list)
1108                 unhelp(h, me);
1109         for (i = 0; i < ip_conntrack_htable_size; i++) {
1110                 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1111                         unhelp(h, me);
1112         }
1113         write_unlock_bh(&ip_conntrack_lock);
1114
1115         /* Someone could be still looking at the helper in a bh. */
1116         synchronize_net();
1117 }
1118
1119 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1120 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1121                         enum ip_conntrack_info ctinfo,
1122                         const struct sk_buff *skb,
1123                         unsigned long extra_jiffies,
1124                         int do_acct)
1125 {
1126         int event = 0;
1127
1128         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1129         IP_NF_ASSERT(skb);
1130
1131         write_lock_bh(&ip_conntrack_lock);
1132
1133         /* Only update if this is not a fixed timeout */
1134         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1135                 write_unlock_bh(&ip_conntrack_lock);
1136                 return;
1137         }
1138
1139         /* If not in hash table, timer will not be active yet */
1140         if (!is_confirmed(ct)) {
1141                 ct->timeout.expires = extra_jiffies;
1142                 event = IPCT_REFRESH;
1143         } else {
1144                 /* Need del_timer for race avoidance (may already be dying). */
1145                 if (del_timer(&ct->timeout)) {
1146                         ct->timeout.expires = jiffies + extra_jiffies;
1147                         add_timer(&ct->timeout);
1148                         event = IPCT_REFRESH;
1149                 }
1150         }
1151
1152 #ifdef CONFIG_IP_NF_CT_ACCT
1153         if (do_acct) {
1154                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1155                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1156                                                 ntohs(skb->nh.iph->tot_len);
1157                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1158                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1159                         event |= IPCT_COUNTER_FILLING;
1160         }
1161 #endif
1162
1163         write_unlock_bh(&ip_conntrack_lock);
1164
1165         /* must be unlocked when calling event cache */
1166         if (event)
1167                 ip_conntrack_event_cache(event, skb);
1168 }
1169
1170 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1171     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1172 /* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
1173  * in ip_conntrack_core, since we don't want the protocols to autoload
1174  * or depend on ctnetlink */
1175 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1176                                const struct ip_conntrack_tuple *tuple)
1177 {
1178         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1179                 &tuple->src.u.tcp.port);
1180         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1181                 &tuple->dst.u.tcp.port);
1182         return 0;
1183
1184 nfattr_failure:
1185         return -1;
1186 }
1187
1188 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1189                                struct ip_conntrack_tuple *t)
1190 {
1191         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1192                 return -EINVAL;
1193
1194         t->src.u.tcp.port =
1195                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1196         t->dst.u.tcp.port =
1197                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1198
1199         return 0;
1200 }
1201 #endif
1202
1203 /* Returns new sk_buff, or NULL */
1204 struct sk_buff *
1205 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1206 {
1207         skb_orphan(skb);
1208
1209         local_bh_disable();
1210         skb = ip_defrag(skb, user);
1211         local_bh_enable();
1212
1213         if (skb)
1214                 ip_send_check(skb->nh.iph);
1215         return skb;
1216 }
1217
1218 /* Used by ipt_REJECT. */
1219 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1220 {
1221         struct ip_conntrack *ct;
1222         enum ip_conntrack_info ctinfo;
1223
1224         /* This ICMP is in reverse direction to the packet which caused it */
1225         ct = ip_conntrack_get(skb, &ctinfo);
1226
1227         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1228                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1229         else
1230                 ctinfo = IP_CT_RELATED;
1231
1232         /* Attach to new skbuff, and increment count */
1233         nskb->nfct = &ct->ct_general;
1234         nskb->nfctinfo = ctinfo;
1235         nf_conntrack_get(nskb->nfct);
1236 }
1237
1238 /* Bring out ya dead! */
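/* Walk the hash table under the write lock and return the first conntrack
 * for which iter() says yes, with an extra reference held; *bucket records
 * where to resume.  Unconfirmed entries cannot be returned safely, so they
 * are only marked dying here. */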
1239 static struct ip_conntrack *
1240 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1241                 void *data, unsigned int *bucket)
1242 {
1243         struct ip_conntrack_tuple_hash *h;
1244         struct ip_conntrack *ct;
1245
1246         write_lock_bh(&ip_conntrack_lock);
1247         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1248                 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1249                         ct = tuplehash_to_ctrack(h);
1250                         if (iter(ct, data))
1251                                 goto found;
1252                 }
1253         }
1254         list_for_each_entry(h, &unconfirmed, list) {
1255                 ct = tuplehash_to_ctrack(h);
1256                 if (iter(ct, data))
1257                         set_bit(IPS_DYING_BIT, &ct->status);
1258         }
1259         write_unlock_bh(&ip_conntrack_lock);
1260         return NULL;
1261
1262 found:
1263         atomic_inc(&ct->ct_general.use);
1264         write_unlock_bh(&ip_conntrack_lock);
1265         return ct;
1266 }
1267
1268 void
1269 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1270 {
1271         struct ip_conntrack *ct;
1272         unsigned int bucket = 0;
1273
1274         while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1275                 /* Time to push up daisies... */
1276                 if (del_timer(&ct->timeout))
1277                         death_by_timeout((unsigned long)ct);
1278                 /* ... else the timer will get him soon. */
1279
1280                 ip_conntrack_put(ct);
1281         }
1282 }
1283
1284 /* Fast function for those who don't want to parse /proc (and I don't
1285    blame them). */
1286 /* Reversing the socket's dst/src point of view gives us the reply
1287    mapping. */
1288 static int
1289 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1290 {
1291         struct inet_sock *inet = inet_sk(sk);
1292         struct ip_conntrack_tuple_hash *h;
1293         struct ip_conntrack_tuple tuple;
1294
1295         IP_CT_TUPLE_U_BLANK(&tuple);
1296         tuple.src.ip = inet->rcv_saddr;
1297         tuple.src.u.tcp.port = inet->sport;
1298         tuple.dst.ip = inet->daddr;
1299         tuple.dst.u.tcp.port = inet->dport;
1300         tuple.dst.protonum = IPPROTO_TCP;
1301
1302         /* We only do TCP at the moment: is there a better way? */
1303         if (strcmp(sk->sk_prot->name, "TCP")) {
1304                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1305                 return -ENOPROTOOPT;
1306         }
1307
1308         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1309                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1310                        *len, sizeof(struct sockaddr_in));
1311                 return -EINVAL;
1312         }
1313
1314         h = ip_conntrack_find_get(&tuple, NULL);
1315         if (h) {
1316                 struct sockaddr_in sin;
1317                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1318
1319                 sin.sin_family = AF_INET;
1320                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1321                         .tuple.dst.u.tcp.port;
1322                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1323                         .tuple.dst.ip;
1324                 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1325
1326                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1327                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1328                 ip_conntrack_put(ct);
1329                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1330                         return -EFAULT;
1331                 else
1332                         return 0;
1333         }
1334         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1335                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1336                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1337         return -ENOENT;
1338 }
1339
1340 static struct nf_sockopt_ops so_getorigdst = {
1341         .pf             = PF_INET,
1342         .get_optmin     = SO_ORIGINAL_DST,
1343         .get_optmax     = SO_ORIGINAL_DST+1,
1344         .get            = &getorigdst,
1345 };
1346
1347 static int kill_all(struct ip_conntrack *i, void *data)
1348 {
1349         return 1;
1350 }
1351
1352 void ip_conntrack_flush(void)
1353 {
1354         ip_ct_iterate_cleanup(kill_all, NULL);
1355 }
1356
1357 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1358 {
1359         if (vmalloced)
1360                 vfree(hash);
1361         else
1362                 free_pages((unsigned long)hash,
1363                            get_order(sizeof(struct list_head) * size));
1364 }
1365
1366 /* Mishearing the voices in his head, our hero wonders how he's
1367    supposed to kill the mall. */
1368 void ip_conntrack_cleanup(void)
1369 {
1370         rcu_assign_pointer(ip_ct_attach, NULL);
1371
1372         /* This makes sure all current packets have passed through
1373            netfilter framework.  Roll on, two-stage module
1374            delete... */
1375         synchronize_net();
1376
1377         ip_ct_event_cache_flush();
1378  i_see_dead_people:
1379         ip_conntrack_flush();
1380         if (atomic_read(&ip_conntrack_count) != 0) {
1381                 schedule();
1382                 goto i_see_dead_people;
1383         }
1384         /* wait until all references to ip_conntrack_untracked are dropped */
1385         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1386                 schedule();
1387
1388         kmem_cache_destroy(ip_conntrack_cachep);
1389         kmem_cache_destroy(ip_conntrack_expect_cachep);
1390         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1391                             ip_conntrack_htable_size);
1392         nf_unregister_sockopt(&so_getorigdst);
1393 }
1394
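/* Allocate the bucket array: try physically contiguous pages first and fall
 * back to vmalloc() when that fails; *vmalloced records which variant was
 * used so free_conntrack_hash() can undo it. */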
1395 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1396 {
1397         struct list_head *hash;
1398         unsigned int i;
1399
1400         *vmalloced = 0;
1401         hash = (void*)__get_free_pages(GFP_KERNEL,
1402                                        get_order(sizeof(struct list_head)
1403                                                  * size));
1404         if (!hash) {
1405                 *vmalloced = 1;
1406                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1407                 hash = vmalloc(sizeof(struct list_head) * size);
1408         }
1409
1410         if (hash)
1411                 for (i = 0; i < size; i++)
1412                         INIT_LIST_HEAD(&hash[i]);
1413
1414         return hash;
1415 }
1416
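/* Handler for the "hashsize" module parameter.  At boot it just stores the
 * value; at runtime it allocates a new bucket array, rehashes every entry
 * under the write lock with a fresh random seed, publishes the new table
 * and frees the old one. */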
1417 static int set_hashsize(const char *val, struct kernel_param *kp)
1418 {
1419         int i, bucket, hashsize, vmalloced;
1420         int old_vmalloced, old_size;
1421         int rnd;
1422         struct list_head *hash, *old_hash;
1423         struct ip_conntrack_tuple_hash *h;
1424
1425         /* On boot, we can set this without any fancy locking. */
1426         if (!ip_conntrack_htable_size)
1427                 return param_set_int(val, kp);
1428
1429         hashsize = simple_strtol(val, NULL, 0);
1430         if (!hashsize)
1431                 return -EINVAL;
1432
1433         hash = alloc_hashtable(hashsize, &vmalloced);
1434         if (!hash)
1435                 return -ENOMEM;
1436
1437         /* We have to rehash for the new table anyway, so we can also
1438          * use a new random seed */
1439         get_random_bytes(&rnd, 4);
1440
1441         write_lock_bh(&ip_conntrack_lock);
1442         for (i = 0; i < ip_conntrack_htable_size; i++) {
1443                 while (!list_empty(&ip_conntrack_hash[i])) {
1444                         h = list_entry(ip_conntrack_hash[i].next,
1445                                        struct ip_conntrack_tuple_hash, list);
1446                         list_del(&h->list);
1447                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1448                         list_add_tail(&h->list, &hash[bucket]);
1449                 }
1450         }
1451         old_size = ip_conntrack_htable_size;
1452         old_vmalloced = ip_conntrack_vmalloc;
1453         old_hash = ip_conntrack_hash;
1454
1455         ip_conntrack_htable_size = hashsize;
1456         ip_conntrack_vmalloc = vmalloced;
1457         ip_conntrack_hash = hash;
1458         ip_conntrack_hash_rnd = rnd;
1459         write_unlock_bh(&ip_conntrack_lock);
1460
1461         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1462         return 0;
1463 }
1464
1465 module_param_call(hashsize, set_hashsize, param_get_uint,
1466                   &ip_conntrack_htable_size, 0600);
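/* Usage sketch (assuming the module is built as ip_conntrack.ko): the table
 * size can be set at load time with "modprobe ip_conntrack hashsize=16384",
 * or resized later by root via /sys/module/ip_conntrack/parameters/hashsize,
 * both of which end up in set_hashsize() above. */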
1467
1468 int __init ip_conntrack_init(void)
1469 {
1470         unsigned int i;
1471         int ret;
1472
1473         /* Idea from tcp.c: use 1/16384 of memory.  On i386 a 32MB
1474          * machine gets 256 buckets; >= 1GB machines get 8192 buckets. */
1475         if (!ip_conntrack_htable_size) {
1476                 ip_conntrack_htable_size
1477                         = (((num_physpages << PAGE_SHIFT) / 16384)
1478                            / sizeof(struct list_head));
1479                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1480                         ip_conntrack_htable_size = 8192;
1481                 if (ip_conntrack_htable_size < 16)
1482                         ip_conntrack_htable_size = 16;
1483         }
1484         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1485
1486         printk("ip_conntrack version %s (%u buckets, %d max)"
1487                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1488                ip_conntrack_htable_size, ip_conntrack_max,
1489                sizeof(struct ip_conntrack));
1490
1491         ret = nf_register_sockopt(&so_getorigdst);
1492         if (ret != 0) {
1493                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1494                 return ret;
1495         }
1496
1497         ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1498                                             &ip_conntrack_vmalloc);
1499         if (!ip_conntrack_hash) {
1500                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1501                 goto err_unreg_sockopt;
1502         }
1503
1504         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1505                                                 sizeof(struct ip_conntrack), 0,
1506                                                 0, NULL, NULL);
1507         if (!ip_conntrack_cachep) {
1508                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1509                 goto err_free_hash;
1510         }
1511
1512         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1513                                         sizeof(struct ip_conntrack_expect),
1514                                         0, 0, NULL, NULL);
1515         if (!ip_conntrack_expect_cachep) {
1516                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1517                 goto err_free_conntrack_slab;
1518         }
1519
1520         /* Don't NEED lock here, but good form anyway. */
1521         write_lock_bh(&ip_conntrack_lock);
1522         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1523                 rcu_assign_pointer(ip_ct_protos[i], &ip_conntrack_generic_protocol);
1524         /* Sew in builtin protocols. */
1525         rcu_assign_pointer(ip_ct_protos[IPPROTO_TCP], &ip_conntrack_protocol_tcp);
1526         rcu_assign_pointer(ip_ct_protos[IPPROTO_UDP], &ip_conntrack_protocol_udp);
1527         rcu_assign_pointer(ip_ct_protos[IPPROTO_ICMP], &ip_conntrack_protocol_icmp);
1528         write_unlock_bh(&ip_conntrack_lock);
1529
1530         /* For use by ipt_REJECT */
1531         rcu_assign_pointer(ip_ct_attach, ip_conntrack_attach);
1532
1533         /* Set up fake conntrack:
1534             - to never be deleted, not in any hashes */
1535         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1536         /*  - and make it look like a confirmed connection */
1537         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1538
1539         return ret;
1540
1541 err_free_conntrack_slab:
1542         kmem_cache_destroy(ip_conntrack_cachep);
1543 err_free_hash:
1544         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1545                             ip_conntrack_htable_size);
1546 err_unreg_sockopt:
1547         nf_unregister_sockopt(&so_getorigdst);
1548
1549         return -ENOMEM;
1550 }