1 /*
2  * Packet matching code.
3  *
4  * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
5  * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  *
11  * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
12  *      - increase module usage count as soon as we have rules inside
13  *        a table
14  * 08 Oct 2005 Harald Welte <laforge@netfilter.org>
15  *      - Generalize into "x_tables" layer and "{ip,ip6,arp}_tables"
16  */
17 #include <linux/config.h>
18 #include <linux/cache.h>
19 #include <linux/capability.h>
20 #include <linux/skbuff.h>
21 #include <linux/kmod.h>
22 #include <linux/vmalloc.h>
23 #include <linux/netdevice.h>
24 #include <linux/module.h>
25 #include <linux/icmp.h>
26 #include <net/ip.h>
27 #include <asm/uaccess.h>
28 #include <asm/semaphore.h>
29 #include <linux/proc_fs.h>
30 #include <linux/err.h>
31 #include <linux/cpumask.h>
32
33 #include <linux/netfilter/x_tables.h>
34 #include <linux/netfilter_ipv4/ip_tables.h>
35
36 MODULE_LICENSE("GPL");
37 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
38 MODULE_DESCRIPTION("IPv4 packet filter");
39
40 /*#define DEBUG_IP_FIREWALL*/
41 /*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
42 /*#define DEBUG_IP_FIREWALL_USER*/
43
44 #ifdef DEBUG_IP_FIREWALL
45 #define dprintf(format, args...)  printk(format , ## args)
46 #else
47 #define dprintf(format, args...)
48 #endif
49
50 #ifdef DEBUG_IP_FIREWALL_USER
51 #define duprintf(format, args...) printk(format , ## args)
52 #else
53 #define duprintf(format, args...)
54 #endif
55
56 #ifdef CONFIG_NETFILTER_DEBUG
57 #define IP_NF_ASSERT(x)                                         \
58 do {                                                            \
59         if (!(x))                                               \
60                 printk("IP_NF_ASSERT: %s:%s:%u\n",              \
61                        __FUNCTION__, __FILE__, __LINE__);       \
62 } while(0)
63 #else
64 #define IP_NF_ASSERT(x)
65 #endif
66
67 #if 0
68 /* All the better to debug you with... */
69 #define static
70 #define inline
71 #endif
72
73 /*
74    We keep a set of rules for each CPU, so we can avoid write-locking
75    them in the softirq when updating the counters and therefore
76    only need to read-lock in the softirq; doing a write_lock_bh() in user
77    context stops packets coming through and allows user context to read
78    the counters or update the rules.
79
80    Hence the start of any table is given by get_entry() below.  */
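/* A minimal, illustrative sketch of that scheme (drawn from the functions
 * later in this file; not additional API): the packet path takes the read
 * side from softirq context, while user context briefly takes the write
 * side when it wants a stable view of the counters:
 *
 *     read_lock_bh(&table->lock);           // ipt_do_table(), per packet
 *     ...walk this CPU's copy of the rules...
 *     read_unlock_bh(&table->lock);
 *
 *     write_lock_bh(&table->lock);          // e.g. copy_entries_to_user()
 *     get_counters(private, counters);
 *     write_unlock_bh(&table->lock);
 */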
81
82 /* Returns whether the packet matches the rule or not. */
83 static inline int
84 ip_packet_match(const struct iphdr *ip,
85                 const char *indev,
86                 const char *outdev,
87                 const struct ipt_ip *ipinfo,
88                 int isfrag)
89 {
90         size_t i;
91         unsigned long ret;
92
93 #define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
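        /* Illustrative note on FWINV: FWINV(cond, flag) evaluates to `cond'
         * when the rule's inversion flag is clear and to `!cond' when it is
         * set, e.g. FWINV(src_mismatch, IPT_INV_SRCIP) makes a '!'-inverted
         * source address rule fail on an address that *does* match. */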
94
95         if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
96                   IPT_INV_SRCIP)
97             || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
98                      IPT_INV_DSTIP)) {
99                 dprintf("Source or dest mismatch.\n");
100
101                 dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
102                         NIPQUAD(ip->saddr),
103                         NIPQUAD(ipinfo->smsk.s_addr),
104                         NIPQUAD(ipinfo->src.s_addr),
105                         ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
106                 dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
107                         NIPQUAD(ip->daddr),
108                         NIPQUAD(ipinfo->dmsk.s_addr),
109                         NIPQUAD(ipinfo->dst.s_addr),
110                         ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
111                 return 0;
112         }
113
114         /* Look for ifname matches; this should unroll nicely. */
115         for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
116                 ret |= (((const unsigned long *)indev)[i]
117                         ^ ((const unsigned long *)ipinfo->iniface)[i])
118                         & ((const unsigned long *)ipinfo->iniface_mask)[i];
119         }
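        /* Illustrative note: the loop above compares the interface name in
         * unsigned-long-sized chunks against the rule's iniface, masked by
         * iniface_mask; userspace typically sets the mask to 0xFF for every
         * byte that must match (including the NUL for exact names, and only
         * the prefix for an "eth+"-style wildcard), so ret == 0 means the
         * incoming device matched. */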
120
121         if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
122                 dprintf("VIA in mismatch (%s vs %s).%s\n",
123                         indev, ipinfo->iniface,
124                         ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
125                 return 0;
126         }
127
128         for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
129                 ret |= (((const unsigned long *)outdev)[i]
130                         ^ ((const unsigned long *)ipinfo->outiface)[i])
131                         & ((const unsigned long *)ipinfo->outiface_mask)[i];
132         }
133
134         if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
135                 dprintf("VIA out mismatch (%s vs %s).%s\n",
136                         outdev, ipinfo->outiface,
137                         ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
138                 return 0;
139         }
140
141         /* Check specific protocol */
142         if (ipinfo->proto
143             && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
144                 dprintf("Packet protocol %hi does not match %hi.%s\n",
145                         ip->protocol, ipinfo->proto,
146                         ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
147                 return 0;
148         }
149
150         /* If we have a fragment rule but the packet is not a fragment
151          * then we return zero */
152         if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
153                 dprintf("Fragment rule but not fragment.%s\n",
154                         ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
155                 return 0;
156         }
157
158         return 1;
159 }
160
161 static inline int
162 ip_checkentry(const struct ipt_ip *ip)
163 {
164         if (ip->flags & ~IPT_F_MASK) {
165                 duprintf("Unknown flag bits set: %08X\n",
166                          ip->flags & ~IPT_F_MASK);
167                 return 0;
168         }
169         if (ip->invflags & ~IPT_INV_MASK) {
170                 duprintf("Unknown invflag bits set: %08X\n",
171                          ip->invflags & ~IPT_INV_MASK);
172                 return 0;
173         }
174         return 1;
175 }
176
177 static unsigned int
178 ipt_error(struct sk_buff **pskb,
179           const struct net_device *in,
180           const struct net_device *out,
181           unsigned int hooknum,
182           const void *targinfo,
183           void *userinfo)
184 {
185         if (net_ratelimit())
186                 printk("ip_tables: error: `%s'\n", (char *)targinfo);
187
188         return NF_DROP;
189 }
190
191 static inline
192 int do_match(struct ipt_entry_match *m,
193              const struct sk_buff *skb,
194              const struct net_device *in,
195              const struct net_device *out,
196              int offset,
197              int *hotdrop)
198 {
199         /* Stop iteration if it doesn't match */
200         if (!m->u.kernel.match->match(skb, in, out, m->data, offset, 
201             skb->nh.iph->ihl*4, hotdrop))
202                 return 1;
203         else
204                 return 0;
205 }
206
207 static inline struct ipt_entry *
208 get_entry(void *base, unsigned int offset)
209 {
210         return (struct ipt_entry *)(base + offset);
211 }
212
213 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
214 unsigned int
215 ipt_do_table(struct sk_buff **pskb,
216              unsigned int hook,
217              const struct net_device *in,
218              const struct net_device *out,
219              struct ipt_table *table,
220              void *userdata)
221 {
222         static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
223         u_int16_t offset;
224         struct iphdr *ip;
225         u_int16_t datalen;
226         int hotdrop = 0;
227         /* Initializing verdict to NF_DROP keeps gcc happy. */
228         unsigned int verdict = NF_DROP;
229         const char *indev, *outdev;
230         void *table_base;
231         struct ipt_entry *e, *back;
232         struct xt_table_info *private = table->private;
233
234         /* Initialization */
235         ip = (*pskb)->nh.iph;
236         datalen = (*pskb)->len - ip->ihl * 4;
237         indev = in ? in->name : nulldevname;
238         outdev = out ? out->name : nulldevname;
239         /* We handle fragments by dealing with the first fragment as
240          * if it were a normal packet.  All other fragments are treated
241          * normally, except that they will NEVER match rules that ask
242          * things we don't know (i.e. the TCP SYN flag or ports).  If the
243          * rule is also a fragment-specific rule, non-fragments won't
244          * match it. */
245         offset = ntohs(ip->frag_off) & IP_OFFSET;
246
247         read_lock_bh(&table->lock);
248         IP_NF_ASSERT(table->valid_hooks & (1 << hook));
249         table_base = (void *)private->entries[smp_processor_id()];
250         e = get_entry(table_base, private->hook_entry[hook]);
251
252         /* For return from builtin chain */
253         back = get_entry(table_base, private->underflow[hook]);
254
255         do {
256                 IP_NF_ASSERT(e);
257                 IP_NF_ASSERT(back);
258                 if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
259                         struct ipt_entry_target *t;
260
261                         if (IPT_MATCH_ITERATE(e, do_match,
262                                               *pskb, in, out,
263                                               offset, &hotdrop) != 0)
264                                 goto no_match;
265
266                         ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
267
268                         t = ipt_get_target(e);
269                         IP_NF_ASSERT(t->u.kernel.target);
270                         /* Standard target? */
271                         if (!t->u.kernel.target->target) {
272                                 int v;
273
274                                 v = ((struct ipt_standard_target *)t)->verdict;
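                                /* Illustrative note: a standard target's
                                 * verdict is either negative, encoding a
                                 * builtin verdict as -NF_<verdict> - 1 or
                                 * the special IPT_RETURN, or non-negative,
                                 * in which case it is the byte offset of
                                 * the rule to jump to within this table. */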
275                                 if (v < 0) {
276                                         /* Pop from stack? */
277                                         if (v != IPT_RETURN) {
278                                                 verdict = (unsigned)(-v) - 1;
279                                                 break;
280                                         }
281                                         e = back;
282                                         back = get_entry(table_base,
283                                                          back->comefrom);
284                                         continue;
285                                 }
286                                 if (table_base + v != (void *)e + e->next_offset
287                                     && !(e->ip.flags & IPT_F_GOTO)) {
288                                         /* Save old back ptr in next entry */
289                                         struct ipt_entry *next
290                                                 = (void *)e + e->next_offset;
291                                         next->comefrom
292                                                 = (void *)back - table_base;
293                                         /* set back pointer to next entry */
294                                         back = next;
295                                 }
296
297                                 e = get_entry(table_base, v);
298                         } else {
299                                 /* Targets which reenter must return
300                                    abs. verdicts */
301 #ifdef CONFIG_NETFILTER_DEBUG
302                                 ((struct ipt_entry *)table_base)->comefrom
303                                         = 0xeeeeeeec;
304 #endif
305                                 verdict = t->u.kernel.target->target(pskb,
306                                                                      in, out,
307                                                                      hook,
308                                                                      t->data,
309                                                                      userdata);
310
311 #ifdef CONFIG_NETFILTER_DEBUG
312                                 if (((struct ipt_entry *)table_base)->comefrom
313                                     != 0xeeeeeeec
314                                     && verdict == IPT_CONTINUE) {
315                                         printk("Target %s reentered!\n",
316                                                t->u.kernel.target->name);
317                                         verdict = NF_DROP;
318                                 }
319                                 ((struct ipt_entry *)table_base)->comefrom
320                                         = 0x57acc001;
321 #endif
322                                 /* Target might have changed stuff. */
323                                 ip = (*pskb)->nh.iph;
324                                 datalen = (*pskb)->len - ip->ihl * 4;
325
326                                 if (verdict == IPT_CONTINUE)
327                                         e = (void *)e + e->next_offset;
328                                 else
329                                         /* Verdict */
330                                         break;
331                         }
332                 } else {
333
334                 no_match:
335                         e = (void *)e + e->next_offset;
336                 }
337         } while (!hotdrop);
338
339         read_unlock_bh(&table->lock);
340
341 #ifdef DEBUG_ALLOW_ALL
342         return NF_ACCEPT;
343 #else
344         if (hotdrop)
345                 return NF_DROP;
346         else return verdict;
347 #endif
348 }
349
350 /* All zeroes == unconditional rule. */
351 static inline int
352 unconditional(const struct ipt_ip *ip)
353 {
354         unsigned int i;
355
356         for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
357                 if (((__u32 *)ip)[i])
358                         return 0;
359
360         return 1;
361 }
362
363 /* Figures out from what hook each rule can be called: returns 0 if
364    there are loops.  Puts hook bitmask in comefrom. */
365 static int
366 mark_source_chains(struct xt_table_info *newinfo,
367                    unsigned int valid_hooks, void *entry0)
368 {
369         unsigned int hook;
370
371         /* No recursion; use packet counter to save back ptrs (reset
372            to 0 as we leave), and comefrom to save source hook bitmask */
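        /* Informal sketch of the walk: from each hook's entry point we chase
         * jumps depth-first, stashing the return position in e->counters.pcnt
         * instead of using a real stack; an unconditional standard verdict
         * (RETURN/policy) pops back through those saved positions, and finding
         * the (1 << NF_IP_NUMHOOKS) "on the current path" bit already set in
         * e->comefrom means the ruleset loops, so we bail out with 0. */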
373         for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
374                 unsigned int pos = newinfo->hook_entry[hook];
375                 struct ipt_entry *e
376                         = (struct ipt_entry *)(entry0 + pos);
377
378                 if (!(valid_hooks & (1 << hook)))
379                         continue;
380
381                 /* Set initial back pointer. */
382                 e->counters.pcnt = pos;
383
384                 for (;;) {
385                         struct ipt_standard_target *t
386                                 = (void *)ipt_get_target(e);
387
388                         if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
389                                 printk("iptables: loop hook %u pos %u %08X.\n",
390                                        hook, pos, e->comefrom);
391                                 return 0;
392                         }
393                         e->comefrom
394                                 |= ((1 << hook) | (1 << NF_IP_NUMHOOKS));
395
396                         /* Unconditional return/END. */
397                         if (e->target_offset == sizeof(struct ipt_entry)
398                             && (strcmp(t->target.u.user.name,
399                                        IPT_STANDARD_TARGET) == 0)
400                             && t->verdict < 0
401                             && unconditional(&e->ip)) {
402                                 unsigned int oldpos, size;
403
404                                 /* Return: backtrack through the last
405                                    big jump. */
406                                 do {
407                                         e->comefrom ^= (1<<NF_IP_NUMHOOKS);
408 #ifdef DEBUG_IP_FIREWALL_USER
409                                         if (e->comefrom
410                                             & (1 << NF_IP_NUMHOOKS)) {
411                                                 duprintf("Back unset "
412                                                          "on hook %u "
413                                                          "rule %u\n",
414                                                          hook, pos);
415                                         }
416 #endif
417                                         oldpos = pos;
418                                         pos = e->counters.pcnt;
419                                         e->counters.pcnt = 0;
420
421                                         /* We're at the start. */
422                                         if (pos == oldpos)
423                                                 goto next;
424
425                                         e = (struct ipt_entry *)
426                                                 (entry0 + pos);
427                                 } while (oldpos == pos + e->next_offset);
428
429                                 /* Move along one */
430                                 size = e->next_offset;
431                                 e = (struct ipt_entry *)
432                                         (entry0 + pos + size);
433                                 e->counters.pcnt = pos;
434                                 pos += size;
435                         } else {
436                                 int newpos = t->verdict;
437
438                                 if (strcmp(t->target.u.user.name,
439                                            IPT_STANDARD_TARGET) == 0
440                                     && newpos >= 0) {
441                                         /* This is a jump; chase it. */
442                                         duprintf("Jump rule %u -> %u\n",
443                                                  pos, newpos);
444                                 } else {
445                                         /* ... this is a fallthru */
446                                         newpos = pos + e->next_offset;
447                                 }
448                                 e = (struct ipt_entry *)
449                                         (entry0 + newpos);
450                                 e->counters.pcnt = pos;
451                                 pos = newpos;
452                         }
453                 }
454                 next:
455                 duprintf("Finished chain %u\n", hook);
456         }
457         return 1;
458 }
459
460 static inline int
461 cleanup_match(struct ipt_entry_match *m, unsigned int *i)
462 {
463         if (i && (*i)-- == 0)
464                 return 1;
465
466         if (m->u.kernel.match->destroy)
467                 m->u.kernel.match->destroy(m->data,
468                                            m->u.match_size - sizeof(*m));
469         module_put(m->u.kernel.match->me);
470         return 0;
471 }
472
473 static inline int
474 standard_check(const struct ipt_entry_target *t,
475                unsigned int max_offset)
476 {
477         struct ipt_standard_target *targ = (void *)t;
478
479         /* Check standard info. */
480         if (t->u.target_size
481             != IPT_ALIGN(sizeof(struct ipt_standard_target))) {
482                 duprintf("standard_check: target size %u != %u\n",
483                          t->u.target_size,
484                          IPT_ALIGN(sizeof(struct ipt_standard_target)));
485                 return 0;
486         }
487
488         if (targ->verdict >= 0
489             && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
490                 duprintf("ipt_standard_check: bad verdict (%i)\n",
491                          targ->verdict);
492                 return 0;
493         }
494
495         if (targ->verdict < -NF_MAX_VERDICT - 1) {
496                 duprintf("ipt_standard_check: bad negative verdict (%i)\n",
497                          targ->verdict);
498                 return 0;
499         }
500         return 1;
501 }
502
503 static inline int
504 check_match(struct ipt_entry_match *m,
505             const char *name,
506             const struct ipt_ip *ip,
507             unsigned int hookmask,
508             unsigned int *i)
509 {
510         struct ipt_match *match;
511
512         match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
513                                                    m->u.user.revision),
514                                         "ipt_%s", m->u.user.name);
515         if (IS_ERR(match) || !match) {
516                 duprintf("check_match: `%s' not found\n", m->u.user.name);
517                 return match ? PTR_ERR(match) : -ENOENT;
518         }
519         m->u.kernel.match = match;
520
521         if (m->u.kernel.match->checkentry
522             && !m->u.kernel.match->checkentry(name, ip, m->data,
523                                               m->u.match_size - sizeof(*m),
524                                               hookmask)) {
525                 module_put(m->u.kernel.match->me);
526                 duprintf("ip_tables: check failed for `%s'.\n",
527                          m->u.kernel.match->name);
528                 return -EINVAL;
529         }
530
531         (*i)++;
532         return 0;
533 }
534
535 static struct ipt_target ipt_standard_target;
536
537 static inline int
538 check_entry(struct ipt_entry *e, const char *name, unsigned int size,
539             unsigned int *i)
540 {
541         struct ipt_entry_target *t;
542         struct ipt_target *target;
543         int ret;
544         unsigned int j;
545
546         if (!ip_checkentry(&e->ip)) {
547                 duprintf("ip_tables: ip check failed %p %s.\n", e, name);
548                 return -EINVAL;
549         }
550
551         j = 0;
552         ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
553         if (ret != 0)
554                 goto cleanup_matches;
555
556         t = ipt_get_target(e);
557         target = try_then_request_module(xt_find_target(AF_INET,
558                                                      t->u.user.name,
559                                                      t->u.user.revision),
560                                          "ipt_%s", t->u.user.name);
561         if (IS_ERR(target) || !target) {
562                 duprintf("check_entry: `%s' not found\n", t->u.user.name);
563                 ret = target ? PTR_ERR(target) : -ENOENT;
564                 goto cleanup_matches;
565         }
566         t->u.kernel.target = target;
567
568         if (t->u.kernel.target == &ipt_standard_target) {
569                 if (!standard_check(t, size)) {
570                         ret = -EINVAL;
571                         goto cleanup_matches;
572                 }
573         } else if (t->u.kernel.target->checkentry
574                    && !t->u.kernel.target->checkentry(name, e, t->data,
575                                                       t->u.target_size
576                                                       - sizeof(*t),
577                                                       e->comefrom)) {
578                 module_put(t->u.kernel.target->me);
579                 duprintf("ip_tables: check failed for `%s'.\n",
580                          t->u.kernel.target->name);
581                 ret = -EINVAL;
582                 goto cleanup_matches;
583         }
584
585         (*i)++;
586         return 0;
587
588  cleanup_matches:
589         IPT_MATCH_ITERATE(e, cleanup_match, &j);
590         return ret;
591 }
592
593 static inline int
594 check_entry_size_and_hooks(struct ipt_entry *e,
595                            struct xt_table_info *newinfo,
596                            unsigned char *base,
597                            unsigned char *limit,
598                            const unsigned int *hook_entries,
599                            const unsigned int *underflows,
600                            unsigned int *i)
601 {
602         unsigned int h;
603
604         if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
605             || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
606                 duprintf("Bad offset %p\n", e);
607                 return -EINVAL;
608         }
609
610         if (e->next_offset
611             < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
612                 duprintf("checking: element %p size %u\n",
613                          e, e->next_offset);
614                 return -EINVAL;
615         }
616
617         /* Check hooks & underflows */
618         for (h = 0; h < NF_IP_NUMHOOKS; h++) {
619                 if ((unsigned char *)e - base == hook_entries[h])
620                         newinfo->hook_entry[h] = hook_entries[h];
621                 if ((unsigned char *)e - base == underflows[h])
622                         newinfo->underflow[h] = underflows[h];
623         }
624
625         /* FIXME: underflows must be unconditional, standard verdicts
626            < 0 (not IPT_RETURN). --RR */
627
628         /* Clear counters and comefrom */
629         e->counters = ((struct xt_counters) { 0, 0 });
630         e->comefrom = 0;
631
632         (*i)++;
633         return 0;
634 }
635
636 static inline int
637 cleanup_entry(struct ipt_entry *e, unsigned int *i)
638 {
639         struct ipt_entry_target *t;
640
641         if (i && (*i)-- == 0)
642                 return 1;
643
644         /* Cleanup all matches */
645         IPT_MATCH_ITERATE(e, cleanup_match, NULL);
646         t = ipt_get_target(e);
647         if (t->u.kernel.target->destroy)
648                 t->u.kernel.target->destroy(t->data,
649                                             t->u.target_size - sizeof(*t));
650         module_put(t->u.kernel.target->me);
651         return 0;
652 }
653
654 /* Checks and translates the user-supplied table segment (held in
655    newinfo) */
656 static int
657 translate_table(const char *name,
658                 unsigned int valid_hooks,
659                 struct xt_table_info *newinfo,
660                 void *entry0,
661                 unsigned int size,
662                 unsigned int number,
663                 const unsigned int *hook_entries,
664                 const unsigned int *underflows)
665 {
666         unsigned int i;
667         int ret;
668
669         newinfo->size = size;
670         newinfo->number = number;
671
672         /* Init all hooks to impossible value. */
673         for (i = 0; i < NF_IP_NUMHOOKS; i++) {
674                 newinfo->hook_entry[i] = 0xFFFFFFFF;
675                 newinfo->underflow[i] = 0xFFFFFFFF;
676         }
677
678         duprintf("translate_table: size %u\n", newinfo->size);
679         i = 0;
680         /* Walk through entries, checking offsets. */
681         ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
682                                 check_entry_size_and_hooks,
683                                 newinfo,
684                                 entry0,
685                                 entry0 + size,
686                                 hook_entries, underflows, &i);
687         if (ret != 0)
688                 return ret;
689
690         if (i != number) {
691                 duprintf("translate_table: %u not %u entries\n",
692                          i, number);
693                 return -EINVAL;
694         }
695
696         /* Check hooks all assigned */
697         for (i = 0; i < NF_IP_NUMHOOKS; i++) {
698                 /* Only hooks which are valid */
699                 if (!(valid_hooks & (1 << i)))
700                         continue;
701                 if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
702                         duprintf("Invalid hook entry %u %u\n",
703                                  i, hook_entries[i]);
704                         return -EINVAL;
705                 }
706                 if (newinfo->underflow[i] == 0xFFFFFFFF) {
707                         duprintf("Invalid underflow %u %u\n",
708                                  i, underflows[i]);
709                         return -EINVAL;
710                 }
711         }
712
713         if (!mark_source_chains(newinfo, valid_hooks, entry0))
714                 return -ELOOP;
715
716         /* Finally, each sanity check must pass */
717         i = 0;
718         ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
719                                 check_entry, name, size, &i);
720
721         if (ret != 0) {
722                 IPT_ENTRY_ITERATE(entry0, newinfo->size,
723                                   cleanup_entry, &i);
724                 return ret;
725         }
726
727         /* And one copy for every other CPU */
728         for_each_cpu(i) {
729                 if (newinfo->entries[i] && newinfo->entries[i] != entry0)
730                         memcpy(newinfo->entries[i], entry0, newinfo->size);
731         }
732
733         return ret;
734 }
735
736 /* Gets counters. */
737 static inline int
738 add_entry_to_counter(const struct ipt_entry *e,
739                      struct xt_counters total[],
740                      unsigned int *i)
741 {
742         ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
743
744         (*i)++;
745         return 0;
746 }
747
748 static inline int
749 set_entry_to_counter(const struct ipt_entry *e,
750                      struct ipt_counters total[],
751                      unsigned int *i)
752 {
753         SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
754
755         (*i)++;
756         return 0;
757 }
758
759 static void
760 get_counters(const struct xt_table_info *t,
761              struct xt_counters counters[])
762 {
763         unsigned int cpu;
764         unsigned int i;
765         unsigned int curcpu;
766
767         /* Instead of clearing (by a previous call to memset())
768          * the counters and using adds, we set the counters
769          * with data used by the 'current' CPU.
770          * We don't care about preemption here.
771          */
772         curcpu = raw_smp_processor_id();
773
774         i = 0;
775         IPT_ENTRY_ITERATE(t->entries[curcpu],
776                           t->size,
777                           set_entry_to_counter,
778                           counters,
779                           &i);
780
781         for_each_cpu(cpu) {
782                 if (cpu == curcpu)
783                         continue;
784                 i = 0;
785                 IPT_ENTRY_ITERATE(t->entries[cpu],
786                                   t->size,
787                                   add_entry_to_counter,
788                                   counters,
789                                   &i);
790         }
791 }
792
793 static int
794 copy_entries_to_user(unsigned int total_size,
795                      struct ipt_table *table,
796                      void __user *userptr)
797 {
798         unsigned int off, num, countersize;
799         struct ipt_entry *e;
800         struct xt_counters *counters;
801         struct xt_table_info *private = table->private;
802         int ret = 0;
803         void *loc_cpu_entry;
804
805         /* We need an atomic snapshot of the counters: the rest doesn't
806            change (other than comefrom, which userspace doesn't care
807            about). */
808         countersize = sizeof(struct xt_counters) * private->number;
809         counters = vmalloc_node(countersize, numa_node_id());
810
811         if (counters == NULL)
812                 return -ENOMEM;
813
814         /* First, sum counters... */
815         write_lock_bh(&table->lock);
816         get_counters(private, counters);
817         write_unlock_bh(&table->lock);
818
819         /* choose the copy that is on our node/cpu, ...
820          * This choice is lazy (because the current thread is
821          * allowed to migrate to another cpu).
822          */
823         loc_cpu_entry = private->entries[raw_smp_processor_id()];
824         /* ... then copy entire thing ... */
825         if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
826                 ret = -EFAULT;
827                 goto free_counters;
828         }
829
830         /* FIXME: use iterator macros --RR */
831         /* ... then go back and fix counters and names */
832         for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
833                 unsigned int i;
834                 struct ipt_entry_match *m;
835                 struct ipt_entry_target *t;
836
837                 e = (struct ipt_entry *)(loc_cpu_entry + off);
838                 if (copy_to_user(userptr + off
839                                  + offsetof(struct ipt_entry, counters),
840                                  &counters[num],
841                                  sizeof(counters[num])) != 0) {
842                         ret = -EFAULT;
843                         goto free_counters;
844                 }
845
846                 for (i = sizeof(struct ipt_entry);
847                      i < e->target_offset;
848                      i += m->u.match_size) {
849                         m = (void *)e + i;
850
851                         if (copy_to_user(userptr + off + i
852                                          + offsetof(struct ipt_entry_match,
853                                                     u.user.name),
854                                          m->u.kernel.match->name,
855                                          strlen(m->u.kernel.match->name)+1)
856                             != 0) {
857                                 ret = -EFAULT;
858                                 goto free_counters;
859                         }
860                 }
861
862                 t = ipt_get_target(e);
863                 if (copy_to_user(userptr + off + e->target_offset
864                                  + offsetof(struct ipt_entry_target,
865                                             u.user.name),
866                                  t->u.kernel.target->name,
867                                  strlen(t->u.kernel.target->name)+1) != 0) {
868                         ret = -EFAULT;
869                         goto free_counters;
870                 }
871         }
872
873  free_counters:
874         vfree(counters);
875         return ret;
876 }
877
878 static int
879 get_entries(const struct ipt_get_entries *entries,
880             struct ipt_get_entries __user *uptr)
881 {
882         int ret;
883         struct ipt_table *t;
884
885         t = xt_find_table_lock(AF_INET, entries->name);
886         if (t && !IS_ERR(t)) {
887                 struct xt_table_info *private = t->private;
888                 duprintf("t->private->number = %u\n",
889                          private->number);
890                 if (entries->size == private->size)
891                         ret = copy_entries_to_user(private->size,
892                                                    t, uptr->entrytable);
893                 else {
894                         duprintf("get_entries: I've got %u not %u!\n",
895                                  private->size,
896                                  entries->size);
897                         ret = -EINVAL;
898                 }
899                 module_put(t->me);
900                 xt_table_unlock(t);
901         } else
902                 ret = t ? PTR_ERR(t) : -ENOENT;
903
904         return ret;
905 }
906
907 static int
908 do_replace(void __user *user, unsigned int len)
909 {
910         int ret;
911         struct ipt_replace tmp;
912         struct ipt_table *t;
913         struct xt_table_info *newinfo, *oldinfo;
914         struct xt_counters *counters;
915         void *loc_cpu_entry, *loc_cpu_old_entry;
916
917         if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
918                 return -EFAULT;
919
920         /* Hack: Causes ipchains to give correct error msg --RR */
921         if (len != sizeof(tmp) + tmp.size)
922                 return -ENOPROTOOPT;
923
924         /* overflow check */
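        /* Illustrative note: xt_alloc_table_info() below keeps one copy of
         * the ruleset per CPU, so the limit here is scaled down by NR_CPUS
         * (plus cache alignment slack) to keep the total allocation under
         * INT_MAX. */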
925         if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
926                         SMP_CACHE_BYTES)
927                 return -ENOMEM;
928         if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
929                 return -ENOMEM;
930
931         newinfo = xt_alloc_table_info(tmp.size);
932         if (!newinfo)
933                 return -ENOMEM;
934
935         /* choose the copy that is on our node/cpu */
936         loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
937         if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
938                            tmp.size) != 0) {
939                 ret = -EFAULT;
940                 goto free_newinfo;
941         }
942
943         counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
944         if (!counters) {
945                 ret = -ENOMEM;
946                 goto free_newinfo;
947         }
948
949         ret = translate_table(tmp.name, tmp.valid_hooks,
950                               newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
951                               tmp.hook_entry, tmp.underflow);
952         if (ret != 0)
953                 goto free_newinfo_counters;
954
955         duprintf("ip_tables: Translated table\n");
956
957         t = try_then_request_module(xt_find_table_lock(AF_INET, tmp.name),
958                                     "iptable_%s", tmp.name);
959         if (!t || IS_ERR(t)) {
960                 ret = t ? PTR_ERR(t) : -ENOENT;
961                 goto free_newinfo_counters_untrans;
962         }
963
964         /* You lied! */
965         if (tmp.valid_hooks != t->valid_hooks) {
966                 duprintf("Valid hook crap: %08X vs %08X\n",
967                          tmp.valid_hooks, t->valid_hooks);
968                 ret = -EINVAL;
969                 goto put_module;
970         }
971
972         oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
973         if (!oldinfo)
974                 goto put_module;
975
976         /* Update module usage count based on number of rules */
977         duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
978                 oldinfo->number, oldinfo->initial_entries, newinfo->number);
979         if ((oldinfo->number > oldinfo->initial_entries) || 
980             (newinfo->number <= oldinfo->initial_entries)) 
981                 module_put(t->me);
982         if ((oldinfo->number > oldinfo->initial_entries) &&
983             (newinfo->number <= oldinfo->initial_entries))
984                 module_put(t->me);
985
986         /* Get the old counters. */
987         get_counters(oldinfo, counters);
988         /* Decrease module usage counts and free resource */
989         loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
990         IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
991         xt_free_table_info(oldinfo);
992         if (copy_to_user(tmp.counters, counters,
993                          sizeof(struct xt_counters) * tmp.num_counters) != 0)
994                 ret = -EFAULT;
995         vfree(counters);
996         xt_table_unlock(t);
997         return ret;
998
999  put_module:
1000         module_put(t->me);
1001         xt_table_unlock(t);
1002  free_newinfo_counters_untrans:
1003         IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
1004  free_newinfo_counters:
1005         vfree(counters);
1006  free_newinfo:
1007         xt_free_table_info(newinfo);
1008         return ret;
1009 }
1010
1011 /* We're lazy, and add to the first CPU; overflow works its fey magic
1012  * and everything is OK. */
1013 static inline int
1014 add_counter_to_entry(struct ipt_entry *e,
1015                      const struct xt_counters addme[],
1016                      unsigned int *i)
1017 {
1018 #if 0
1019         duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
1020                  *i,
1021                  (long unsigned int)e->counters.pcnt,
1022                  (long unsigned int)e->counters.bcnt,
1023                  (long unsigned int)addme[*i].pcnt,
1024                  (long unsigned int)addme[*i].bcnt);
1025 #endif
1026
1027         ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1028
1029         (*i)++;
1030         return 0;
1031 }
1032
1033 static int
1034 do_add_counters(void __user *user, unsigned int len)
1035 {
1036         unsigned int i;
1037         struct xt_counters_info tmp, *paddc;
1038         struct ipt_table *t;
1039         struct xt_table_info *private;
1040         int ret = 0;
1041         void *loc_cpu_entry;
1042
1043         if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
1044                 return -EFAULT;
1045
1046         if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
1047                 return -EINVAL;
1048
1049         paddc = vmalloc_node(len, numa_node_id());
1050         if (!paddc)
1051                 return -ENOMEM;
1052
1053         if (copy_from_user(paddc, user, len) != 0) {
1054                 ret = -EFAULT;
1055                 goto free;
1056         }
1057
1058         t = xt_find_table_lock(AF_INET, tmp.name);
1059         if (!t || IS_ERR(t)) {
1060                 ret = t ? PTR_ERR(t) : -ENOENT;
1061                 goto free;
1062         }
1063
1064         write_lock_bh(&t->lock);
1065         private = t->private;
1066         if (private->number != paddc->num_counters) {
1067                 ret = -EINVAL;
1068                 goto unlock_up_free;
1069         }
1070
1071         i = 0;
1072         /* Choose the copy that is on our node */
1073         loc_cpu_entry = private->entries[raw_smp_processor_id()];
1074         IPT_ENTRY_ITERATE(loc_cpu_entry,
1075                           private->size,
1076                           add_counter_to_entry,
1077                           paddc->counters,
1078                           &i);
1079  unlock_up_free:
1080         write_unlock_bh(&t->lock);
1081         xt_table_unlock(t);
1082         module_put(t->me);
1083  free:
1084         vfree(paddc);
1085
1086         return ret;
1087 }
1088
1089 static int
1090 do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
1091 {
1092         int ret;
1093
1094         if (!capable(CAP_NET_ADMIN))
1095                 return -EPERM;
1096
1097         switch (cmd) {
1098         case IPT_SO_SET_REPLACE:
1099                 ret = do_replace(user, len);
1100                 break;
1101
1102         case IPT_SO_SET_ADD_COUNTERS:
1103                 ret = do_add_counters(user, len);
1104                 break;
1105
1106         default:
1107                 duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
1108                 ret = -EINVAL;
1109         }
1110
1111         return ret;
1112 }
1113
1114 static int
1115 do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
1116 {
1117         int ret;
1118
1119         if (!capable(CAP_NET_ADMIN))
1120                 return -EPERM;
1121
1122         switch (cmd) {
1123         case IPT_SO_GET_INFO: {
1124                 char name[IPT_TABLE_MAXNAMELEN];
1125                 struct ipt_table *t;
1126
1127                 if (*len != sizeof(struct ipt_getinfo)) {
1128                         duprintf("length %u != %u\n", *len,
1129                                  sizeof(struct ipt_getinfo));
1130                         ret = -EINVAL;
1131                         break;
1132                 }
1133
1134                 if (copy_from_user(name, user, sizeof(name)) != 0) {
1135                         ret = -EFAULT;
1136                         break;
1137                 }
1138                 name[IPT_TABLE_MAXNAMELEN-1] = '\0';
1139
1140                 t = try_then_request_module(xt_find_table_lock(AF_INET, name),
1141                                             "iptable_%s", name);
1142                 if (t && !IS_ERR(t)) {
1143                         struct ipt_getinfo info;
1144                         struct xt_table_info *private = t->private;
1145
1146                         info.valid_hooks = t->valid_hooks;
1147                         memcpy(info.hook_entry, private->hook_entry,
1148                                sizeof(info.hook_entry));
1149                         memcpy(info.underflow, private->underflow,
1150                                sizeof(info.underflow));
1151                         info.num_entries = private->number;
1152                         info.size = private->size;
1153                         memcpy(info.name, name, sizeof(info.name));
1154
1155                         if (copy_to_user(user, &info, *len) != 0)
1156                                 ret = -EFAULT;
1157                         else
1158                                 ret = 0;
1159                         xt_table_unlock(t);
1160                         module_put(t->me);
1161                 } else
1162                         ret = t ? PTR_ERR(t) : -ENOENT;
1163         }
1164         break;
1165
1166         case IPT_SO_GET_ENTRIES: {
1167                 struct ipt_get_entries get;
1168
1169                 if (*len < sizeof(get)) {
1170                         duprintf("get_entries: %u < %u\n", *len, sizeof(get));
1171                         ret = -EINVAL;
1172                 } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
1173                         ret = -EFAULT;
1174                 } else if (*len != sizeof(struct ipt_get_entries) + get.size) {
1175                         duprintf("get_entries: %u != %u\n", *len,
1176                                  sizeof(struct ipt_get_entries) + get.size);
1177                         ret = -EINVAL;
1178                 } else
1179                         ret = get_entries(&get, user);
1180                 break;
1181         }
1182
1183         case IPT_SO_GET_REVISION_MATCH:
1184         case IPT_SO_GET_REVISION_TARGET: {
1185                 struct ipt_get_revision rev;
1186                 int target;
1187
1188                 if (*len != sizeof(rev)) {
1189                         ret = -EINVAL;
1190                         break;
1191                 }
1192                 if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
1193                         ret = -EFAULT;
1194                         break;
1195                 }
1196
1197                 if (cmd == IPT_SO_GET_REVISION_TARGET)
1198                         target = 1;
1199                 else
1200                         target = 0;
1201
1202                 try_then_request_module(xt_find_revision(AF_INET, rev.name,
1203                                                          rev.revision,
1204                                                          target, &ret),
1205                                         "ipt_%s", rev.name);
1206                 break;
1207         }
1208
1209         default:
1210                 duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
1211                 ret = -EINVAL;
1212         }
1213
1214         return ret;
1215 }
1216
1217 int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl)
1218 {
1219         int ret;
1220         struct xt_table_info *newinfo;
1221         static struct xt_table_info bootstrap
1222                 = { 0, 0, 0, { 0 }, { 0 }, { } };
1223         void *loc_cpu_entry;
1224
1225         newinfo = xt_alloc_table_info(repl->size);
1226         if (!newinfo)
1227                 return -ENOMEM;
1228
1229         /* choose the copy on our node/cpu,
1230          * but don't care about preemption
1231          */
1232         loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
1233         memcpy(loc_cpu_entry, repl->entries, repl->size);
1234
1235         ret = translate_table(table->name, table->valid_hooks,
1236                               newinfo, loc_cpu_entry, repl->size,
1237                               repl->num_entries,
1238                               repl->hook_entry,
1239                               repl->underflow);
1240         if (ret != 0) {
1241                 xt_free_table_info(newinfo);
1242                 return ret;
1243         }
1244
1245         if ((ret = xt_register_table(table, &bootstrap, newinfo)) != 0) {
1246                 xt_free_table_info(newinfo);
1247                 return ret;
1248         }
1249
1250         return 0;
1251 }
1252
1253 void ipt_unregister_table(struct ipt_table *table)
1254 {
1255         struct xt_table_info *private;
1256         void *loc_cpu_entry;
1257
1258         private = xt_unregister_table(table);
1259
1260         /* Decrease module usage counts and free resources */
1261         loc_cpu_entry = private->entries[raw_smp_processor_id()];
1262         IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL);
1263         xt_free_table_info(private);
1264 }
1265
1266 /* Returns 1 if the type and code is matched by the range, 0 otherwise */
1267 static inline int
1268 icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
1269                      u_int8_t type, u_int8_t code,
1270                      int invert)
1271 {
1272         return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
1273                 ^ invert;
1274 }
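/* Illustrative note: a test_type of 0xFF short-circuits the comparison above
 * and therefore matches any ICMP type/code; userspace typically encodes
 * "--icmp-type any" (or a bare "-p icmp") this way. */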
1275
1276 static int
1277 icmp_match(const struct sk_buff *skb,
1278            const struct net_device *in,
1279            const struct net_device *out,
1280            const void *matchinfo,
1281            int offset,
1282            unsigned int protoff,
1283            int *hotdrop)
1284 {
1285         struct icmphdr _icmph, *ic;
1286         const struct ipt_icmp *icmpinfo = matchinfo;
1287
1288         /* Must not be a fragment. */
1289         if (offset)
1290                 return 0;
1291
1292         ic = skb_header_pointer(skb, protoff, sizeof(_icmph), &_icmph);
1293         if (ic == NULL) {
1294                 /* We've been asked to examine this packet, and we
1295                  * can't.  Hence, no choice but to drop.
1296                  */
1297                 duprintf("Dropping evil ICMP tinygram.\n");
1298                 *hotdrop = 1;
1299                 return 0;
1300         }
1301
1302         return icmp_type_code_match(icmpinfo->type,
1303                                     icmpinfo->code[0],
1304                                     icmpinfo->code[1],
1305                                     ic->type, ic->code,
1306                                     !!(icmpinfo->invflags&IPT_ICMP_INV));
1307 }
1308
1309 /* Called when user tries to insert an entry of this type. */
1310 static int
1311 icmp_checkentry(const char *tablename,
1312            const void *info,
1313            void *matchinfo,
1314            unsigned int matchsize,
1315            unsigned int hook_mask)
1316 {
1317         const struct ipt_ip *ip = info;
1318         const struct ipt_icmp *icmpinfo = matchinfo;
1319
1320         /* Must specify proto == ICMP, and no unknown invflags */
1321         return ip->proto == IPPROTO_ICMP
1322                 && !(ip->invflags & IPT_INV_PROTO)
1323                 && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp))
1324                 && !(icmpinfo->invflags & ~IPT_ICMP_INV);
1325 }
1326
1327 /* The built-in targets: standard (NULL) and error. */
1328 static struct ipt_target ipt_standard_target = {
1329         .name           = IPT_STANDARD_TARGET,
1330 };
1331
1332 static struct ipt_target ipt_error_target = {
1333         .name           = IPT_ERROR_TARGET,
1334         .target         = ipt_error,
1335 };
1336
1337 static struct nf_sockopt_ops ipt_sockopts = {
1338         .pf             = PF_INET,
1339         .set_optmin     = IPT_BASE_CTL,
1340         .set_optmax     = IPT_SO_SET_MAX+1,
1341         .set            = do_ipt_set_ctl,
1342         .get_optmin     = IPT_BASE_CTL,
1343         .get_optmax     = IPT_SO_GET_MAX+1,
1344         .get            = do_ipt_get_ctl,
1345 };
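/* Illustrative only: userspace (iptables/libiptc) reaches the handlers above
 * through {get,set}sockopt() on an ordinary IPv4 socket, roughly:
 *
 *     int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *     struct ipt_getinfo info = { .name = "filter" };
 *     socklen_t len = sizeof(info);
 *     getsockopt(fd, IPPROTO_IP, IPT_SO_GET_INFO, &info, &len);
 *
 * CAP_NET_ADMIN is required, as checked in do_ipt_{set,get}_ctl() above.
 */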
1346
1347 static struct ipt_match icmp_matchstruct = {
1348         .name           = "icmp",
1349         .match          = &icmp_match,
1350         .checkentry     = &icmp_checkentry,
1351 };
1352
1353 static int __init init(void)
1354 {
1355         int ret;
1356
1357         xt_proto_init(AF_INET);
1358
1359         /* No one else will be downing the sem now, so we won't sleep */
1360         xt_register_target(AF_INET, &ipt_standard_target);
1361         xt_register_target(AF_INET, &ipt_error_target);
1362         xt_register_match(AF_INET, &icmp_matchstruct);
1363
1364         /* Register setsockopt */
1365         ret = nf_register_sockopt(&ipt_sockopts);
1366         if (ret < 0) {
1367                 duprintf("Unable to register sockopts.\n");
1368                 return ret;
1369         }
1370
1371         printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n");
1372         return 0;
1373 }
1374
1375 static void __exit fini(void)
1376 {
1377         nf_unregister_sockopt(&ipt_sockopts);
1378
1379         xt_unregister_match(AF_INET, &icmp_matchstruct);
1380         xt_unregister_target(AF_INET, &ipt_error_target);
1381         xt_unregister_target(AF_INET, &ipt_standard_target);
1382
1383         xt_proto_fini(AF_INET);
1384 }
1385
1386 EXPORT_SYMBOL(ipt_register_table);
1387 EXPORT_SYMBOL(ipt_unregister_table);
1388 EXPORT_SYMBOL(ipt_do_table);
1389 module_init(init);
1390 module_exit(fini);