Merge branch 'topic/cleanup' into for-linus
[pandora-kernel.git] / net / netfilter / ipvs / ip_vs_core.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *      Paul `Rusty' Russell            properly handle non-linear skbs
23  *      Harald Welte                    don't use nfcache
24  *
25  */
26
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/icmp.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
#include <linux/netfilter_ipv6.h>
#endif

#include <net/ip_vs.h>
48
49
50 EXPORT_SYMBOL(register_ip_vs_scheduler);
51 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
52 EXPORT_SYMBOL(ip_vs_skb_replace);
53 EXPORT_SYMBOL(ip_vs_proto_name);
54 EXPORT_SYMBOL(ip_vs_conn_new);
55 EXPORT_SYMBOL(ip_vs_conn_in_get);
56 EXPORT_SYMBOL(ip_vs_conn_out_get);
57 #ifdef CONFIG_IP_VS_PROTO_TCP
58 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
59 #endif
60 EXPORT_SYMBOL(ip_vs_conn_put);
61 #ifdef CONFIG_IP_VS_DEBUG
62 EXPORT_SYMBOL(ip_vs_get_debug_level);
63 #endif
64
65
/*
 * ID used in ICMP lookups.
 * Both macros take a pointer to the ICMP/ICMPv6 header.  The argument is
 * parenthesized so that expressions such as "hdr + 1" expand correctly.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
69
/*
 * Return a printable name for an IP protocol number.
 *
 * Well-known protocols map to constant strings.  Any other value is
 * formatted as "IP_<number>" into a single static buffer, so that result
 * is overwritten by the next call that hits the default case and must not
 * be relied upon across calls.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/*
		 * proto is unsigned, so use %u (not %d) and bound the
		 * write to the size of the static buffer.
		 */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
92
93 void ip_vs_init_hash_table(struct list_head *table, int rows)
94 {
95         while (--rows >= 0)
96                 INIT_LIST_HEAD(&table[rows]);
97 }
98
99 static inline void
100 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
101 {
102         struct ip_vs_dest *dest = cp->dest;
103         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
104                 spin_lock(&dest->stats.lock);
105                 dest->stats.ustats.inpkts++;
106                 dest->stats.ustats.inbytes += skb->len;
107                 spin_unlock(&dest->stats.lock);
108
109                 spin_lock(&dest->svc->stats.lock);
110                 dest->svc->stats.ustats.inpkts++;
111                 dest->svc->stats.ustats.inbytes += skb->len;
112                 spin_unlock(&dest->svc->stats.lock);
113
114                 spin_lock(&ip_vs_stats.lock);
115                 ip_vs_stats.ustats.inpkts++;
116                 ip_vs_stats.ustats.inbytes += skb->len;
117                 spin_unlock(&ip_vs_stats.lock);
118         }
119 }
120
121
122 static inline void
123 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
124 {
125         struct ip_vs_dest *dest = cp->dest;
126         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
127                 spin_lock(&dest->stats.lock);
128                 dest->stats.ustats.outpkts++;
129                 dest->stats.ustats.outbytes += skb->len;
130                 spin_unlock(&dest->stats.lock);
131
132                 spin_lock(&dest->svc->stats.lock);
133                 dest->svc->stats.ustats.outpkts++;
134                 dest->svc->stats.ustats.outbytes += skb->len;
135                 spin_unlock(&dest->svc->stats.lock);
136
137                 spin_lock(&ip_vs_stats.lock);
138                 ip_vs_stats.ustats.outpkts++;
139                 ip_vs_stats.ustats.outbytes += skb->len;
140                 spin_unlock(&ip_vs_stats.lock);
141         }
142 }
143
144
145 static inline void
146 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
147 {
148         spin_lock(&cp->dest->stats.lock);
149         cp->dest->stats.ustats.conns++;
150         spin_unlock(&cp->dest->stats.lock);
151
152         spin_lock(&svc->stats.lock);
153         svc->stats.ustats.conns++;
154         spin_unlock(&svc->stats.lock);
155
156         spin_lock(&ip_vs_stats.lock);
157         ip_vs_stats.ustats.conns++;
158         spin_unlock(&ip_vs_stats.lock);
159 }
160
161
162 static inline int
163 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
164                 const struct sk_buff *skb,
165                 struct ip_vs_protocol *pp)
166 {
167         if (unlikely(!pp->state_transition))
168                 return 0;
169         return pp->state_transition(cp, direction, skb, pp);
170 }
171
172
173 /*
174  *  IPVS persistent scheduling function
175  *  It creates a connection entry according to its template if exists,
176  *  or selects a server and creates a connection entry plus a template.
177  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
178  *  Protocols supported: TCP, UDP
179  */
180 static struct ip_vs_conn *
181 ip_vs_sched_persist(struct ip_vs_service *svc,
182                     const struct sk_buff *skb,
183                     __be16 ports[2])
184 {
185         struct ip_vs_conn *cp = NULL;
186         struct ip_vs_iphdr iph;
187         struct ip_vs_dest *dest;
188         struct ip_vs_conn *ct;
189         __be16  dport;                  /* destination port to forward */
190         union nf_inet_addr snet;        /* source network of the client,
191                                            after masking */
192
193         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
194
195         /* Mask saddr with the netmask to adjust template granularity */
196 #ifdef CONFIG_IP_VS_IPV6
197         if (svc->af == AF_INET6)
198                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
199         else
200 #endif
201                 snet.ip = iph.saddr.ip & svc->netmask;
202
203         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
204                       "mnet %s\n",
205                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
206                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
207                       IP_VS_DBG_ADDR(svc->af, &snet));
208
209         /*
210          * As far as we know, FTP is a very complicated network protocol, and
211          * it uses control connection and data connections. For active FTP,
212          * FTP server initialize data connection to the client, its source port
213          * is often 20. For passive FTP, FTP server tells the clients the port
214          * that it passively listens to,  and the client issues the data
215          * connection. In the tunneling or direct routing mode, the load
216          * balancer is on the client-to-server half of connection, the port
217          * number is unknown to the load balancer. So, a conn template like
218          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
219          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
220          * is created for other persistent services.
221          */
222         if (ports[1] == svc->port) {
223                 /* Check if a template already exists */
224                 if (svc->port != FTPPORT)
225                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
226                                              &iph.daddr, ports[1]);
227                 else
228                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
229                                              &iph.daddr, 0);
230
231                 if (!ct || !ip_vs_check_template(ct)) {
232                         /*
233                          * No template found or the dest of the connection
234                          * template is not available.
235                          */
236                         dest = svc->scheduler->schedule(svc, skb);
237                         if (dest == NULL) {
238                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
239                                 return NULL;
240                         }
241
242                         /*
243                          * Create a template like <protocol,caddr,0,
244                          * vaddr,vport,daddr,dport> for non-ftp service,
245                          * and <protocol,caddr,0,vaddr,0,daddr,0>
246                          * for ftp service.
247                          */
248                         if (svc->port != FTPPORT)
249                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
250                                                     &snet, 0,
251                                                     &iph.daddr,
252                                                     ports[1],
253                                                     &dest->addr, dest->port,
254                                                     IP_VS_CONN_F_TEMPLATE,
255                                                     dest);
256                         else
257                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
258                                                     &snet, 0,
259                                                     &iph.daddr, 0,
260                                                     &dest->addr, 0,
261                                                     IP_VS_CONN_F_TEMPLATE,
262                                                     dest);
263                         if (ct == NULL)
264                                 return NULL;
265
266                         ct->timeout = svc->timeout;
267                 } else {
268                         /* set destination with the found template */
269                         dest = ct->dest;
270                 }
271                 dport = dest->port;
272         } else {
273                 /*
274                  * Note: persistent fwmark-based services and persistent
275                  * port zero service are handled here.
276                  * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
277                  * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
278                  */
279                 if (svc->fwmark) {
280                         union nf_inet_addr fwmark = {
281                                 .ip = htonl(svc->fwmark)
282                         };
283
284                         ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
285                                              &fwmark, 0);
286                 } else
287                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
288                                              &iph.daddr, 0);
289
290                 if (!ct || !ip_vs_check_template(ct)) {
291                         /*
292                          * If it is not persistent port zero, return NULL,
293                          * otherwise create a connection template.
294                          */
295                         if (svc->port)
296                                 return NULL;
297
298                         dest = svc->scheduler->schedule(svc, skb);
299                         if (dest == NULL) {
300                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
301                                 return NULL;
302                         }
303
304                         /*
305                          * Create a template according to the service
306                          */
307                         if (svc->fwmark) {
308                                 union nf_inet_addr fwmark = {
309                                         .ip = htonl(svc->fwmark)
310                                 };
311
312                                 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
313                                                     &snet, 0,
314                                                     &fwmark, 0,
315                                                     &dest->addr, 0,
316                                                     IP_VS_CONN_F_TEMPLATE,
317                                                     dest);
318                         } else
319                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
320                                                     &snet, 0,
321                                                     &iph.daddr, 0,
322                                                     &dest->addr, 0,
323                                                     IP_VS_CONN_F_TEMPLATE,
324                                                     dest);
325                         if (ct == NULL)
326                                 return NULL;
327
328                         ct->timeout = svc->timeout;
329                 } else {
330                         /* set destination with the found template */
331                         dest = ct->dest;
332                 }
333                 dport = ports[1];
334         }
335
336         /*
337          *    Create a new connection according to the template
338          */
339         cp = ip_vs_conn_new(svc->af, iph.protocol,
340                             &iph.saddr, ports[0],
341                             &iph.daddr, ports[1],
342                             &dest->addr, dport,
343                             0,
344                             dest);
345         if (cp == NULL) {
346                 ip_vs_conn_put(ct);
347                 return NULL;
348         }
349
350         /*
351          *    Add its control
352          */
353         ip_vs_control_add(cp, ct);
354         ip_vs_conn_put(ct);
355
356         ip_vs_conn_stats(cp, svc);
357         return cp;
358 }
359
360
361 /*
362  *  IPVS main scheduling function
363  *  It selects a server according to the virtual service, and
364  *  creates a connection entry.
365  *  Protocols supported: TCP, UDP
366  */
367 struct ip_vs_conn *
368 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
369 {
370         struct ip_vs_conn *cp = NULL;
371         struct ip_vs_iphdr iph;
372         struct ip_vs_dest *dest;
373         __be16 _ports[2], *pptr;
374
375         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
376         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
377         if (pptr == NULL)
378                 return NULL;
379
380         /*
381          *    Persistent service
382          */
383         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
384                 return ip_vs_sched_persist(svc, skb, pptr);
385
386         /*
387          *    Non-persistent service
388          */
389         if (!svc->fwmark && pptr[1] != svc->port) {
390                 if (!svc->port)
391                         IP_VS_ERR("Schedule: port zero only supported "
392                                   "in persistent services, "
393                                   "check your ipvs configuration\n");
394                 return NULL;
395         }
396
397         dest = svc->scheduler->schedule(svc, skb);
398         if (dest == NULL) {
399                 IP_VS_DBG(1, "Schedule: no dest found.\n");
400                 return NULL;
401         }
402
403         /*
404          *    Create a connection entry.
405          */
406         cp = ip_vs_conn_new(svc->af, iph.protocol,
407                             &iph.saddr, pptr[0],
408                             &iph.daddr, pptr[1],
409                             &dest->addr, dest->port ? dest->port : pptr[1],
410                             0,
411                             dest);
412         if (cp == NULL)
413                 return NULL;
414
415         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
416                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
417                       ip_vs_fwd_tag(cp),
418                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
419                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
420                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
421                       cp->flags, atomic_read(&cp->refcnt));
422
423         ip_vs_conn_stats(cp, svc);
424         return cp;
425 }
426
427
428 /*
429  *  Pass or drop the packet.
430  *  Called by ip_vs_in, when the virtual service is available but
431  *  no destination is available for a new connection.
432  */
433 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
434                 struct ip_vs_protocol *pp)
435 {
436         __be16 _ports[2], *pptr;
437         struct ip_vs_iphdr iph;
438         int unicast;
439         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
440
441         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
442         if (pptr == NULL) {
443                 ip_vs_service_put(svc);
444                 return NF_DROP;
445         }
446
447 #ifdef CONFIG_IP_VS_IPV6
448         if (svc->af == AF_INET6)
449                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
450         else
451 #endif
452                 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
453
454         /* if it is fwmark-based service, the cache_bypass sysctl is up
455            and the destination is a non-local unicast, then create
456            a cache_bypass connection entry */
457         if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
458                 int ret, cs;
459                 struct ip_vs_conn *cp;
460                 union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
461
462                 ip_vs_service_put(svc);
463
464                 /* create a new connection entry */
465                 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
466                 cp = ip_vs_conn_new(svc->af, iph.protocol,
467                                     &iph.saddr, pptr[0],
468                                     &iph.daddr, pptr[1],
469                                     &daddr, 0,
470                                     IP_VS_CONN_F_BYPASS,
471                                     NULL);
472                 if (cp == NULL)
473                         return NF_DROP;
474
475                 /* statistics */
476                 ip_vs_in_stats(cp, skb);
477
478                 /* set state */
479                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
480
481                 /* transmit the first SYN packet */
482                 ret = cp->packet_xmit(skb, cp, pp);
483                 /* do not touch skb anymore */
484
485                 atomic_inc(&cp->in_pkts);
486                 ip_vs_conn_put(cp);
487                 return ret;
488         }
489
490         /*
491          * When the virtual ftp service is presented, packets destined
492          * for other services on the VIP may get here (except services
493          * listed in the ipvs table), pass the packets, because it is
494          * not ipvs job to decide to drop the packets.
495          */
496         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
497                 ip_vs_service_put(svc);
498                 return NF_ACCEPT;
499         }
500
501         ip_vs_service_put(svc);
502
503         /*
504          * Notify the client that the destination is unreachable, and
505          * release the socket buffer.
506          * Since it is in IP layer, the TCP socket is not actually
507          * created, the TCP RST packet cannot be sent, instead that
508          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
509          */
510 #ifdef CONFIG_IP_VS_IPV6
511         if (svc->af == AF_INET6)
512                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0,
513                             skb->dev);
514         else
515 #endif
516                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
517
518         return NF_DROP;
519 }
520
521
522 /*
523  *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
524  *      chain, and is used for VS/NAT.
525  *      It detects packets for VS/NAT connections and sends the packets
526  *      immediately. This can avoid that iptable_nat mangles the packets
527  *      for VS/NAT.
528  */
529 static unsigned int ip_vs_post_routing(unsigned int hooknum,
530                                        struct sk_buff *skb,
531                                        const struct net_device *in,
532                                        const struct net_device *out,
533                                        int (*okfn)(struct sk_buff *))
534 {
535         if (!skb->ipvs_property)
536                 return NF_ACCEPT;
537         /* The packet was sent from IPVS, exit this chain */
538         return NF_STOP;
539 }
540
541 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
542 {
543         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
544 }
545
546 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
547 {
548         int err = ip_defrag(skb, user);
549
550         if (!err)
551                 ip_send_check(ip_hdr(skb));
552
553         return err;
554 }
555
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_gather_frags().  Currently a stub: no
 * reassembly is performed and 0 is always returned.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
	/* TODO IPv6: Find out what to do here for IPv6 */
	return 0;
}
#endif
563
564 /*
565  * Packet has been made sufficiently writable in caller
566  * - inout: 1=in->out, 0=out->in
567  */
568 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
569                     struct ip_vs_conn *cp, int inout)
570 {
571         struct iphdr *iph        = ip_hdr(skb);
572         unsigned int icmp_offset = iph->ihl*4;
573         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
574                                                       icmp_offset);
575         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
576
577         if (inout) {
578                 iph->saddr = cp->vaddr.ip;
579                 ip_send_check(iph);
580                 ciph->daddr = cp->vaddr.ip;
581                 ip_send_check(ciph);
582         } else {
583                 iph->daddr = cp->daddr.ip;
584                 ip_send_check(iph);
585                 ciph->saddr = cp->daddr.ip;
586                 ip_send_check(ciph);
587         }
588
589         /* the TCP/UDP port */
590         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
591                 __be16 *ports = (void *)ciph + ciph->ihl*4;
592
593                 if (inout)
594                         ports[1] = cp->vport;
595                 else
596                         ports[0] = cp->dport;
597         }
598
599         /* And finally the ICMP checksum */
600         icmph->checksum = 0;
601         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
602         skb->ip_summed = CHECKSUM_UNNECESSARY;
603
604         if (inout)
605                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
606                         "Forwarding altered outgoing ICMP");
607         else
608                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
609                         "Forwarding altered incoming ICMP");
610 }
611
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_nat_icmp(): rewrite the outer IPv6 header,
 * the embedded IPv6 header and, for TCP/UDP, the embedded port, then
 * recompute the ICMPv6 checksum.
 * Packet has been made sufficiently writable in caller
 * - inout: 1=in->out, 0=out->in
 *
 * The ICMPv6 header is taken to sit directly after the fixed-size IPv6
 * header, and the embedded transport header directly after the embedded
 * fixed-size IPv6 header (no extension headers are skipped).
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct ipv6hdr *iph      = ipv6_hdr(skb);
	unsigned int icmp_offset = sizeof(struct ipv6hdr);
	unsigned int icmp_len    = skb->len - icmp_offset;
	struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
						      icmp_offset);
	struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);

	if (inout) {
		iph->saddr = cp->vaddr.in6;
		ciph->daddr = cp->vaddr.in6;
	} else {
		iph->daddr = cp->daddr.in6;
		ciph->saddr = cp->daddr.in6;
	}

	/* the TCP/UDP port */
	if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) {
		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);

		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
	}

	/*
	 * And finally the ICMPv6 checksum.  Unlike ICMPv4 it covers the
	 * IPv6 pseudo-header, so fold the (already rewritten) outer
	 * addresses in, and actually store the result in the header
	 * instead of leaving the field zeroed.
	 */
	icmph->icmp6_cksum = 0;
	icmph->icmp6_cksum = csum_ipv6_magic(&iph->saddr, &iph->daddr,
					     icmp_len, IPPROTO_ICMPV6,
					     skb_checksum(skb, icmp_offset,
							  icmp_len, 0));
	skb->ip_summed = CHECKSUM_UNNECESSARY;

	if (inout)
		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
			"Forwarding altered outgoing ICMPv6");
	else
		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
			"Forwarding altered incoming ICMPv6");
}
#endif
654
655 /* Handle relevant response ICMP messages - forward to the right
656  * destination host. Used for NAT and local client.
657  */
658 static int handle_response_icmp(int af, struct sk_buff *skb,
659                                 union nf_inet_addr *snet,
660                                 __u8 protocol, struct ip_vs_conn *cp,
661                                 struct ip_vs_protocol *pp,
662                                 unsigned int offset, unsigned int ihl)
663 {
664         unsigned int verdict = NF_DROP;
665
666         if (IP_VS_FWD_METHOD(cp) != 0) {
667                 IP_VS_ERR("shouldn't reach here, because the box is on the "
668                           "half connection in the tun/dr module.\n");
669         }
670
671         /* Ensure the checksum is correct */
672         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
673                 /* Failed checksum! */
674                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
675                               IP_VS_DBG_ADDR(af, snet));
676                 goto out;
677         }
678
679         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol)
680                 offset += 2 * sizeof(__u16);
681         if (!skb_make_writable(skb, offset))
682                 goto out;
683
684 #ifdef CONFIG_IP_VS_IPV6
685         if (af == AF_INET6)
686                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
687         else
688 #endif
689                 ip_vs_nat_icmp(skb, pp, cp, 1);
690
691         /* do the statistics and put it back */
692         ip_vs_out_stats(cp, skb);
693
694         skb->ipvs_property = 1;
695         verdict = NF_ACCEPT;
696
697 out:
698         __ip_vs_conn_put(cp);
699
700         return verdict;
701 }
702
703 /*
704  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
705  *      Find any that might be relevant, check against existing connections.
706  *      Currently handles error types - unreachable, quench, ttl exceeded.
707  */
708 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
709 {
710         struct iphdr *iph;
711         struct icmphdr  _icmph, *ic;
712         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
713         struct ip_vs_iphdr ciph;
714         struct ip_vs_conn *cp;
715         struct ip_vs_protocol *pp;
716         unsigned int offset, ihl;
717         union nf_inet_addr snet;
718
719         *related = 1;
720
721         /* reassemble IP fragments */
722         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
723                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
724                         return NF_STOLEN;
725         }
726
727         iph = ip_hdr(skb);
728         offset = ihl = iph->ihl * 4;
729         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
730         if (ic == NULL)
731                 return NF_DROP;
732
733         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
734                   ic->type, ntohs(icmp_id(ic)),
735                   &iph->saddr, &iph->daddr);
736
737         /*
738          * Work through seeing if this is for us.
739          * These checks are supposed to be in an order that means easy
740          * things are checked first to speed up processing.... however
741          * this means that some packets will manage to get a long way
742          * down this stack and then be rejected, but that's life.
743          */
744         if ((ic->type != ICMP_DEST_UNREACH) &&
745             (ic->type != ICMP_SOURCE_QUENCH) &&
746             (ic->type != ICMP_TIME_EXCEEDED)) {
747                 *related = 0;
748                 return NF_ACCEPT;
749         }
750
751         /* Now find the contained IP header */
752         offset += sizeof(_icmph);
753         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
754         if (cih == NULL)
755                 return NF_ACCEPT; /* The packet looks wrong, ignore */
756
757         pp = ip_vs_proto_get(cih->protocol);
758         if (!pp)
759                 return NF_ACCEPT;
760
761         /* Is the embedded protocol header present? */
762         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
763                      pp->dont_defrag))
764                 return NF_ACCEPT;
765
766         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
767
768         offset += cih->ihl * 4;
769
770         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
771         /* The embedded headers contain source and dest in reverse order */
772         cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
773         if (!cp)
774                 return NF_ACCEPT;
775
776         snet.ip = iph->saddr;
777         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
778                                     pp, offset, ihl);
779 }
780
781 #ifdef CONFIG_IP_VS_IPV6
782 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
783 {
784         struct ipv6hdr *iph;
785         struct icmp6hdr _icmph, *ic;
786         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
787                                            within the ICMP */
788         struct ip_vs_iphdr ciph;
789         struct ip_vs_conn *cp;
790         struct ip_vs_protocol *pp;
791         unsigned int offset;
792         union nf_inet_addr snet;
793
794         *related = 1;
795
796         /* reassemble IP fragments */
797         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
798                 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
799                         return NF_STOLEN;
800         }
801
802         iph = ipv6_hdr(skb);
803         offset = sizeof(struct ipv6hdr);
804         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
805         if (ic == NULL)
806                 return NF_DROP;
807
808         IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
809                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
810                   &iph->saddr, &iph->daddr);
811
812         /*
813          * Work through seeing if this is for us.
814          * These checks are supposed to be in an order that means easy
815          * things are checked first to speed up processing.... however
816          * this means that some packets will manage to get a long way
817          * down this stack and then be rejected, but that's life.
818          */
819         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
820             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
821             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
822                 *related = 0;
823                 return NF_ACCEPT;
824         }
825
826         /* Now find the contained IP header */
827         offset += sizeof(_icmph);
828         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
829         if (cih == NULL)
830                 return NF_ACCEPT; /* The packet looks wrong, ignore */
831
832         pp = ip_vs_proto_get(cih->nexthdr);
833         if (!pp)
834                 return NF_ACCEPT;
835
836         /* Is the embedded protocol header present? */
837         /* TODO: we don't support fragmentation at the moment anyways */
838         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
839                 return NF_ACCEPT;
840
841         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
842
843         offset += sizeof(struct ipv6hdr);
844
845         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
846         /* The embedded headers contain source and dest in reverse order */
847         cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
848         if (!cp)
849                 return NF_ACCEPT;
850
851         ipv6_addr_copy(&snet.in6, &iph->saddr);
852         return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
853                                     pp, offset, sizeof(struct ipv6hdr));
854 }
855 #endif
856
857 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
858 {
859         struct tcphdr _tcph, *th;
860
861         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
862         if (th == NULL)
863                 return 0;
864         return th->rst;
865 }
866
867 /* Handle response packets: rewrite addresses and send away...
868  * Used for NAT and local client.
869  */
870 static unsigned int
871 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
872                 struct ip_vs_conn *cp, int ihl)
873 {
874         IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
875
876         if (!skb_make_writable(skb, ihl))
877                 goto drop;
878
879         /* mangle the packet */
880         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
881                 goto drop;
882
883 #ifdef CONFIG_IP_VS_IPV6
884         if (af == AF_INET6)
885                 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
886         else
887 #endif
888         {
889                 ip_hdr(skb)->saddr = cp->vaddr.ip;
890                 ip_send_check(ip_hdr(skb));
891         }
892
893         /* For policy routing, packets originating from this
894          * machine itself may be routed differently to packets
895          * passing through.  We want this packet to be routed as
896          * if it came from this machine itself.  So re-compute
897          * the routing information.
898          */
899 #ifdef CONFIG_IP_VS_IPV6
900         if (af == AF_INET6) {
901                 if (ip6_route_me_harder(skb) != 0)
902                         goto drop;
903         } else
904 #endif
905                 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
906                         goto drop;
907
908         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
909
910         ip_vs_out_stats(cp, skb);
911         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
912         ip_vs_conn_put(cp);
913
914         skb->ipvs_property = 1;
915
916         LeaveFunction(11);
917         return NF_ACCEPT;
918
919 drop:
920         ip_vs_conn_put(cp);
921         kfree_skb(skb);
922         return NF_STOLEN;
923 }
924
925 /*
926  *      It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
927  *      Check if outgoing packet belongs to the established ip_vs_conn.
928  */
929 static unsigned int
930 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
931           const struct net_device *in, const struct net_device *out,
932           int (*okfn)(struct sk_buff *))
933 {
934         struct ip_vs_iphdr iph;
935         struct ip_vs_protocol *pp;
936         struct ip_vs_conn *cp;
937         int af;
938
939         EnterFunction(11);
940
941         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
942
943         if (skb->ipvs_property)
944                 return NF_ACCEPT;
945
946         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
947 #ifdef CONFIG_IP_VS_IPV6
948         if (af == AF_INET6) {
949                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
950                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
951
952                         if (related)
953                                 return verdict;
954                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
955                 }
956         } else
957 #endif
958                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
959                         int related, verdict = ip_vs_out_icmp(skb, &related);
960
961                         if (related)
962                                 return verdict;
963                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
964                 }
965
966         pp = ip_vs_proto_get(iph.protocol);
967         if (unlikely(!pp))
968                 return NF_ACCEPT;
969
970         /* reassemble IP fragments */
971 #ifdef CONFIG_IP_VS_IPV6
972         if (af == AF_INET6) {
973                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
974                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
975
976                         if (related)
977                                 return verdict;
978
979                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
980                 }
981         } else
982 #endif
983                 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
984                              !pp->dont_defrag)) {
985                         if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
986                                 return NF_STOLEN;
987
988                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
989                 }
990
991         /*
992          * Check if the packet belongs to an existing entry
993          */
994         cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
995
996         if (unlikely(!cp)) {
997                 if (sysctl_ip_vs_nat_icmp_send &&
998                     (pp->protocol == IPPROTO_TCP ||
999                      pp->protocol == IPPROTO_UDP)) {
1000                         __be16 _ports[2], *pptr;
1001
1002                         pptr = skb_header_pointer(skb, iph.len,
1003                                                   sizeof(_ports), _ports);
1004                         if (pptr == NULL)
1005                                 return NF_ACCEPT;       /* Not for me */
1006                         if (ip_vs_lookup_real_service(af, iph.protocol,
1007                                                       &iph.saddr,
1008                                                       pptr[0])) {
1009                                 /*
1010                                  * Notify the real server: there is no
1011                                  * existing entry if it is not RST
1012                                  * packet or not TCP packet.
1013                                  */
1014                                 if (iph.protocol != IPPROTO_TCP
1015                                     || !is_tcp_reset(skb, iph.len)) {
1016 #ifdef CONFIG_IP_VS_IPV6
1017                                         if (af == AF_INET6)
1018                                                 icmpv6_send(skb,
1019                                                             ICMPV6_DEST_UNREACH,
1020                                                             ICMPV6_PORT_UNREACH,
1021                                                             0, skb->dev);
1022                                         else
1023 #endif
1024                                                 icmp_send(skb,
1025                                                           ICMP_DEST_UNREACH,
1026                                                           ICMP_PORT_UNREACH, 0);
1027                                         return NF_DROP;
1028                                 }
1029                         }
1030                 }
1031                 IP_VS_DBG_PKT(12, pp, skb, 0,
1032                               "packet continues traversal as normal");
1033                 return NF_ACCEPT;
1034         }
1035
1036         return handle_response(af, skb, pp, cp, iph.len);
1037 }
1038
1039
1040 /*
1041  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1042  *      Find any that might be relevant, check against existing connections,
1043  *      forward to the right destination host if relevant.
1044  *      Currently handles error types - unreachable, quench, ttl exceeded.
1045  */
1046 static int
1047 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1048 {
1049         struct iphdr *iph;
1050         struct icmphdr  _icmph, *ic;
1051         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1052         struct ip_vs_iphdr ciph;
1053         struct ip_vs_conn *cp;
1054         struct ip_vs_protocol *pp;
1055         unsigned int offset, ihl, verdict;
1056         union nf_inet_addr snet;
1057
1058         *related = 1;
1059
1060         /* reassemble IP fragments */
1061         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1062                 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1063                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1064                         return NF_STOLEN;
1065         }
1066
1067         iph = ip_hdr(skb);
1068         offset = ihl = iph->ihl * 4;
1069         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1070         if (ic == NULL)
1071                 return NF_DROP;
1072
1073         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1074                   ic->type, ntohs(icmp_id(ic)),
1075                   &iph->saddr, &iph->daddr);
1076
1077         /*
1078          * Work through seeing if this is for us.
1079          * These checks are supposed to be in an order that means easy
1080          * things are checked first to speed up processing.... however
1081          * this means that some packets will manage to get a long way
1082          * down this stack and then be rejected, but that's life.
1083          */
1084         if ((ic->type != ICMP_DEST_UNREACH) &&
1085             (ic->type != ICMP_SOURCE_QUENCH) &&
1086             (ic->type != ICMP_TIME_EXCEEDED)) {
1087                 *related = 0;
1088                 return NF_ACCEPT;
1089         }
1090
1091         /* Now find the contained IP header */
1092         offset += sizeof(_icmph);
1093         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1094         if (cih == NULL)
1095                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1096
1097         pp = ip_vs_proto_get(cih->protocol);
1098         if (!pp)
1099                 return NF_ACCEPT;
1100
1101         /* Is the embedded protocol header present? */
1102         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1103                      pp->dont_defrag))
1104                 return NF_ACCEPT;
1105
1106         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1107
1108         offset += cih->ihl * 4;
1109
1110         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1111         /* The embedded headers contain source and dest in reverse order */
1112         cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1113         if (!cp) {
1114                 /* The packet could also belong to a local client */
1115                 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1116                 if (cp) {
1117                         snet.ip = iph->saddr;
1118                         return handle_response_icmp(AF_INET, skb, &snet,
1119                                                     cih->protocol, cp, pp,
1120                                                     offset, ihl);
1121                 }
1122                 return NF_ACCEPT;
1123         }
1124
1125         verdict = NF_DROP;
1126
1127         /* Ensure the checksum is correct */
1128         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1129                 /* Failed checksum! */
1130                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1131                           &iph->saddr);
1132                 goto out;
1133         }
1134
1135         /* do the statistics and put it back */
1136         ip_vs_in_stats(cp, skb);
1137         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1138                 offset += 2 * sizeof(__u16);
1139         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1140         /* do not touch skb anymore */
1141
1142   out:
1143         __ip_vs_conn_put(cp);
1144
1145         return verdict;
1146 }
1147
1148 #ifdef CONFIG_IP_VS_IPV6
1149 static int
1150 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1151 {
1152         struct ipv6hdr *iph;
1153         struct icmp6hdr _icmph, *ic;
1154         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
1155                                            within the ICMP */
1156         struct ip_vs_iphdr ciph;
1157         struct ip_vs_conn *cp;
1158         struct ip_vs_protocol *pp;
1159         unsigned int offset, verdict;
1160         union nf_inet_addr snet;
1161
1162         *related = 1;
1163
1164         /* reassemble IP fragments */
1165         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1166                 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1167                                                IP_DEFRAG_VS_IN :
1168                                                IP_DEFRAG_VS_FWD))
1169                         return NF_STOLEN;
1170         }
1171
1172         iph = ipv6_hdr(skb);
1173         offset = sizeof(struct ipv6hdr);
1174         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1175         if (ic == NULL)
1176                 return NF_DROP;
1177
1178         IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1179                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
1180                   &iph->saddr, &iph->daddr);
1181
1182         /*
1183          * Work through seeing if this is for us.
1184          * These checks are supposed to be in an order that means easy
1185          * things are checked first to speed up processing.... however
1186          * this means that some packets will manage to get a long way
1187          * down this stack and then be rejected, but that's life.
1188          */
1189         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1190             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1191             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1192                 *related = 0;
1193                 return NF_ACCEPT;
1194         }
1195
1196         /* Now find the contained IP header */
1197         offset += sizeof(_icmph);
1198         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1199         if (cih == NULL)
1200                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1201
1202         pp = ip_vs_proto_get(cih->nexthdr);
1203         if (!pp)
1204                 return NF_ACCEPT;
1205
1206         /* Is the embedded protocol header present? */
1207         /* TODO: we don't support fragmentation at the moment anyways */
1208         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1209                 return NF_ACCEPT;
1210
1211         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1212
1213         offset += sizeof(struct ipv6hdr);
1214
1215         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1216         /* The embedded headers contain source and dest in reverse order */
1217         cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1218         if (!cp) {
1219                 /* The packet could also belong to a local client */
1220                 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1221                 if (cp) {
1222                         ipv6_addr_copy(&snet.in6, &iph->saddr);
1223                         return handle_response_icmp(AF_INET6, skb, &snet,
1224                                                     cih->nexthdr,
1225                                                     cp, pp, offset,
1226                                                     sizeof(struct ipv6hdr));
1227                 }
1228                 return NF_ACCEPT;
1229         }
1230
1231         verdict = NF_DROP;
1232
1233         /* do the statistics and put it back */
1234         ip_vs_in_stats(cp, skb);
1235         if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr)
1236                 offset += 2 * sizeof(__u16);
1237         verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1238         /* do not touch skb anymore */
1239
1240         __ip_vs_conn_put(cp);
1241
1242         return verdict;
1243 }
1244 #endif
1245
1246
1247 /*
1248  *      Check if it's for virtual services, look it up,
1249  *      and send it on its way...
1250  */
1251 static unsigned int
1252 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1253          const struct net_device *in, const struct net_device *out,
1254          int (*okfn)(struct sk_buff *))
1255 {
1256         struct ip_vs_iphdr iph;
1257         struct ip_vs_protocol *pp;
1258         struct ip_vs_conn *cp;
1259         int ret, restart, af;
1260
1261         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1262
1263         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1264
1265         /*
1266          *      Big tappo: only PACKET_HOST, including loopback for local client
1267          *      Don't handle local packets on IPv6 for now
1268          */
1269         if (unlikely(skb->pkt_type != PACKET_HOST)) {
1270                 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1271                               skb->pkt_type,
1272                               iph.protocol,
1273                               IP_VS_DBG_ADDR(af, &iph.daddr));
1274                 return NF_ACCEPT;
1275         }
1276
1277         if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1278                 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1279
1280                 if (related)
1281                         return verdict;
1282                 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1283         }
1284
1285         /* Protocol supported? */
1286         pp = ip_vs_proto_get(iph.protocol);
1287         if (unlikely(!pp))
1288                 return NF_ACCEPT;
1289
1290         /*
1291          * Check if the packet belongs to an existing connection entry
1292          */
1293         cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1294
1295         if (unlikely(!cp)) {
1296                 int v;
1297
1298                 /* For local client packets, it could be a response */
1299                 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1300                 if (cp)
1301                         return handle_response(af, skb, pp, cp, iph.len);
1302
1303                 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1304                         return v;
1305         }
1306
1307         if (unlikely(!cp)) {
1308                 /* sorry, all this trouble for a no-hit :) */
1309                 IP_VS_DBG_PKT(12, pp, skb, 0,
1310                               "packet continues traversal as normal");
1311                 return NF_ACCEPT;
1312         }
1313
1314         IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1315
1316         /* Check the server status */
1317         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1318                 /* the destination server is not available */
1319
1320                 if (sysctl_ip_vs_expire_nodest_conn) {
1321                         /* try to expire the connection immediately */
1322                         ip_vs_conn_expire_now(cp);
1323                 }
1324                 /* don't restart its timer, and silently
1325                    drop the packet. */
1326                 __ip_vs_conn_put(cp);
1327                 return NF_DROP;
1328         }
1329
1330         ip_vs_in_stats(cp, skb);
1331         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1332         if (cp->packet_xmit)
1333                 ret = cp->packet_xmit(skb, cp, pp);
1334                 /* do not touch skb anymore */
1335         else {
1336                 IP_VS_DBG_RL("warning: packet_xmit is null");
1337                 ret = NF_ACCEPT;
1338         }
1339
1340         /* Increase its packet counter and check if it is needed
1341          * to be synchronized
1342          *
1343          * Sync connection if it is about to close to
1344          * encorage the standby servers to update the connections timeout
1345          */
1346         atomic_inc(&cp->in_pkts);
1347         if (af == AF_INET &&
1348             (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1349             (((cp->protocol != IPPROTO_TCP ||
1350                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1351               (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
1352                == sysctl_ip_vs_sync_threshold[0])) ||
1353              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1354               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1355                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1356                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1357                 ip_vs_sync_conn(cp);
1358         cp->old_state = cp->state;
1359
1360         ip_vs_conn_put(cp);
1361         return ret;
1362 }
1363
1364
1365 /*
1366  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1367  *      related packets destined for 0.0.0.0/0.
1368  *      When fwmark-based virtual service is used, such as transparent
1369  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1370  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1371  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1372  *      and send them to ip_vs_in_icmp.
1373  */
1374 static unsigned int
1375 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1376                    const struct net_device *in, const struct net_device *out,
1377                    int (*okfn)(struct sk_buff *))
1378 {
1379         int r;
1380
1381         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1382                 return NF_ACCEPT;
1383
1384         return ip_vs_in_icmp(skb, &r, hooknum);
1385 }
1386
1387 #ifdef CONFIG_IP_VS_IPV6
1388 static unsigned int
1389 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1390                       const struct net_device *in, const struct net_device *out,
1391                       int (*okfn)(struct sk_buff *))
1392 {
1393         int r;
1394
1395         if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1396                 return NF_ACCEPT;
1397
1398         return ip_vs_in_icmp_v6(skb, &r, hooknum);
1399 }
1400 #endif
1401
1402
1403 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1404         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1405          * or VS/NAT(change destination), so that filtering rules can be
1406          * applied to IPVS. */
1407         {
1408                 .hook           = ip_vs_in,
1409                 .owner          = THIS_MODULE,
1410                 .pf             = PF_INET,
1411                 .hooknum        = NF_INET_LOCAL_IN,
1412                 .priority       = 100,
1413         },
1414         /* After packet filtering, change source only for VS/NAT */
1415         {
1416                 .hook           = ip_vs_out,
1417                 .owner          = THIS_MODULE,
1418                 .pf             = PF_INET,
1419                 .hooknum        = NF_INET_FORWARD,
1420                 .priority       = 100,
1421         },
1422         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1423          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1424         {
1425                 .hook           = ip_vs_forward_icmp,
1426                 .owner          = THIS_MODULE,
1427                 .pf             = PF_INET,
1428                 .hooknum        = NF_INET_FORWARD,
1429                 .priority       = 99,
1430         },
1431         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1432         {
1433                 .hook           = ip_vs_post_routing,
1434                 .owner          = THIS_MODULE,
1435                 .pf             = PF_INET,
1436                 .hooknum        = NF_INET_POST_ROUTING,
1437                 .priority       = NF_IP_PRI_NAT_SRC-1,
1438         },
1439 #ifdef CONFIG_IP_VS_IPV6
1440         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1441          * or VS/NAT(change destination), so that filtering rules can be
1442          * applied to IPVS. */
1443         {
1444                 .hook           = ip_vs_in,
1445                 .owner          = THIS_MODULE,
1446                 .pf             = PF_INET6,
1447                 .hooknum        = NF_INET_LOCAL_IN,
1448                 .priority       = 100,
1449         },
1450         /* After packet filtering, change source only for VS/NAT */
1451         {
1452                 .hook           = ip_vs_out,
1453                 .owner          = THIS_MODULE,
1454                 .pf             = PF_INET6,
1455                 .hooknum        = NF_INET_FORWARD,
1456                 .priority       = 100,
1457         },
1458         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1459          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1460         {
1461                 .hook           = ip_vs_forward_icmp_v6,
1462                 .owner          = THIS_MODULE,
1463                 .pf             = PF_INET6,
1464                 .hooknum        = NF_INET_FORWARD,
1465                 .priority       = 99,
1466         },
1467         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1468         {
1469                 .hook           = ip_vs_post_routing,
1470                 .owner          = THIS_MODULE,
1471                 .pf             = PF_INET6,
1472                 .hooknum        = NF_INET_POST_ROUTING,
1473                 .priority       = NF_IP6_PRI_NAT_SRC-1,
1474         },
1475 #endif
1476 };
1477
1478
1479 /*
1480  *      Initialize IP Virtual Server
1481  */
1482 static int __init ip_vs_init(void)
1483 {
1484         int ret;
1485
1486         ip_vs_estimator_init();
1487
1488         ret = ip_vs_control_init();
1489         if (ret < 0) {
1490                 IP_VS_ERR("can't setup control.\n");
1491                 goto cleanup_estimator;
1492         }
1493
1494         ip_vs_protocol_init();
1495
1496         ret = ip_vs_app_init();
1497         if (ret < 0) {
1498                 IP_VS_ERR("can't setup application helper.\n");
1499                 goto cleanup_protocol;
1500         }
1501
1502         ret = ip_vs_conn_init();
1503         if (ret < 0) {
1504                 IP_VS_ERR("can't setup connection table.\n");
1505                 goto cleanup_app;
1506         }
1507
1508         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1509         if (ret < 0) {
1510                 IP_VS_ERR("can't register hooks.\n");
1511                 goto cleanup_conn;
1512         }
1513
1514         IP_VS_INFO("ipvs loaded.\n");
1515         return ret;
1516
1517   cleanup_conn:
1518         ip_vs_conn_cleanup();
1519   cleanup_app:
1520         ip_vs_app_cleanup();
1521   cleanup_protocol:
1522         ip_vs_protocol_cleanup();
1523         ip_vs_control_cleanup();
1524   cleanup_estimator:
1525         ip_vs_estimator_cleanup();
1526         return ret;
1527 }
1528
1529 static void __exit ip_vs_cleanup(void)
1530 {
1531         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1532         ip_vs_conn_cleanup();
1533         ip_vs_app_cleanup();
1534         ip_vs_protocol_cleanup();
1535         ip_vs_control_cleanup();
1536         ip_vs_estimator_cleanup();
1537         IP_VS_INFO("ipvs unloaded.\n");
1538 }
1539
1540 module_init(ip_vs_init);
1541 module_exit(ip_vs_cleanup);
1542 MODULE_LICENSE("GPL");