ipvs: changes for local client
[pandora-kernel.git] / net / netfilter / ipvs / ip_vs_core.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *      Paul `Rusty' Russell            properly handle non-linear skbs
23  *      Harald Welte                    don't use nfcache
24  *
25  */
26
/*
 * Message prefix for this file: pr_fmt() prepends "IPVS: " to a format
 * string.  It is defined ahead of the #include lines below; presumably the
 * pr_*() helpers declared there pick it up (confirm against the printk
 * headers).
 */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
30 #include <linux/module.h>
31 #include <linux/kernel.h>
32 #include <linux/ip.h>
33 #include <linux/tcp.h>
34 #include <linux/sctp.h>
35 #include <linux/icmp.h>
36 #include <linux/slab.h>
37
38 #include <net/ip.h>
39 #include <net/tcp.h>
40 #include <net/udp.h>
41 #include <net/icmp.h>                   /* for icmp_send */
42 #include <net/route.h>
43 #include <net/ip6_checksum.h>
44
45 #include <linux/netfilter.h>
46 #include <linux/netfilter_ipv4.h>
47
48 #ifdef CONFIG_IP_VS_IPV6
49 #include <net/ipv6.h>
50 #include <linux/netfilter_ipv6.h>
51 #include <net/ip6_route.h>
52 #endif
53
54 #include <net/ip_vs.h>
55
56
/*
 * Symbols exported from this file.  ip_vs_tcp_conn_listen is exported only
 * when CONFIG_IP_VS_PROTO_TCP is defined, and ip_vs_get_debug_level only
 * when CONFIG_IP_VS_DEBUG is defined.
 */
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);
EXPORT_SYMBOL(ip_vs_proto_name);
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
70
71
/*
 * ID used in ICMP lookups.
 * Each macro takes a pointer to an ICMP (resp. ICMPv6) header and yields
 * the echo identifier field.  The argument is parenthesized in both macros
 * so that any pointer expression (e.g. "hdr + 1") can be passed safely.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        ((icmph)->icmp6_dataun.u_echo.identifier)
75
/*
 * Return a printable name for an IP protocol number.
 *
 * The protocols listed in the switch map to string literals.  Any other
 * value is formatted as "IP_<number>" into a static buffer, so that result
 * is overwritten by the next call that takes the default branch and must
 * not be kept across such calls.
 */
const char *ip_vs_proto_name(unsigned proto)
{
        static char buf[20];

        switch (proto) {
        case IPPROTO_IP:
                return "IP";
        case IPPROTO_UDP:
                return "UDP";
        case IPPROTO_TCP:
                return "TCP";
        case IPPROTO_SCTP:
                return "SCTP";
        case IPPROTO_ICMP:
                return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
        case IPPROTO_ICMPV6:
                return "ICMPv6";
#endif
        default:
                /* proto is unsigned: format with %u and bound the write */
                snprintf(buf, sizeof(buf), "IP_%u", proto);
                return buf;
        }
}
100
101 void ip_vs_init_hash_table(struct list_head *table, int rows)
102 {
103         while (--rows >= 0)
104                 INIT_LIST_HEAD(&table[rows]);
105 }
106
107 static inline void
108 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
109 {
110         struct ip_vs_dest *dest = cp->dest;
111         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
112                 spin_lock(&dest->stats.lock);
113                 dest->stats.ustats.inpkts++;
114                 dest->stats.ustats.inbytes += skb->len;
115                 spin_unlock(&dest->stats.lock);
116
117                 spin_lock(&dest->svc->stats.lock);
118                 dest->svc->stats.ustats.inpkts++;
119                 dest->svc->stats.ustats.inbytes += skb->len;
120                 spin_unlock(&dest->svc->stats.lock);
121
122                 spin_lock(&ip_vs_stats.lock);
123                 ip_vs_stats.ustats.inpkts++;
124                 ip_vs_stats.ustats.inbytes += skb->len;
125                 spin_unlock(&ip_vs_stats.lock);
126         }
127 }
128
129
130 static inline void
131 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
132 {
133         struct ip_vs_dest *dest = cp->dest;
134         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
135                 spin_lock(&dest->stats.lock);
136                 dest->stats.ustats.outpkts++;
137                 dest->stats.ustats.outbytes += skb->len;
138                 spin_unlock(&dest->stats.lock);
139
140                 spin_lock(&dest->svc->stats.lock);
141                 dest->svc->stats.ustats.outpkts++;
142                 dest->svc->stats.ustats.outbytes += skb->len;
143                 spin_unlock(&dest->svc->stats.lock);
144
145                 spin_lock(&ip_vs_stats.lock);
146                 ip_vs_stats.ustats.outpkts++;
147                 ip_vs_stats.ustats.outbytes += skb->len;
148                 spin_unlock(&ip_vs_stats.lock);
149         }
150 }
151
152
153 static inline void
154 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
155 {
156         spin_lock(&cp->dest->stats.lock);
157         cp->dest->stats.ustats.conns++;
158         spin_unlock(&cp->dest->stats.lock);
159
160         spin_lock(&svc->stats.lock);
161         svc->stats.ustats.conns++;
162         spin_unlock(&svc->stats.lock);
163
164         spin_lock(&ip_vs_stats.lock);
165         ip_vs_stats.ustats.conns++;
166         spin_unlock(&ip_vs_stats.lock);
167 }
168
169
170 static inline int
171 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
172                 const struct sk_buff *skb,
173                 struct ip_vs_protocol *pp)
174 {
175         if (unlikely(!pp->state_transition))
176                 return 0;
177         return pp->state_transition(cp, direction, skb, pp);
178 }
179
180 static inline void
181 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
182                               struct sk_buff *skb, int protocol,
183                               const union nf_inet_addr *caddr, __be16 cport,
184                               const union nf_inet_addr *vaddr, __be16 vport,
185                               struct ip_vs_conn_param *p)
186 {
187         ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
188         p->pe = svc->pe;
189         if (p->pe && p->pe->fill_param)
190                 p->pe->fill_param(p, skb);
191 }
192
193 /*
194  *  IPVS persistent scheduling function
195  *  It creates a connection entry according to its template if exists,
196  *  or selects a server and creates a connection entry plus a template.
197  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
198  *  Protocols supported: TCP, UDP
199  */
200 static struct ip_vs_conn *
201 ip_vs_sched_persist(struct ip_vs_service *svc,
202                     struct sk_buff *skb,
203                     __be16 ports[2])
204 {
205         struct ip_vs_conn *cp = NULL;
206         struct ip_vs_iphdr iph;
207         struct ip_vs_dest *dest;
208         struct ip_vs_conn *ct;
209         __be16 dport = 0;               /* destination port to forward */
210         unsigned int flags;
211         struct ip_vs_conn_param param;
212         union nf_inet_addr snet;        /* source network of the client,
213                                            after masking */
214
215         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
216
217         /* Mask saddr with the netmask to adjust template granularity */
218 #ifdef CONFIG_IP_VS_IPV6
219         if (svc->af == AF_INET6)
220                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
221         else
222 #endif
223                 snet.ip = iph.saddr.ip & svc->netmask;
224
225         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
226                       "mnet %s\n",
227                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
228                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
229                       IP_VS_DBG_ADDR(svc->af, &snet));
230
231         /*
232          * As far as we know, FTP is a very complicated network protocol, and
233          * it uses control connection and data connections. For active FTP,
234          * FTP server initialize data connection to the client, its source port
235          * is often 20. For passive FTP, FTP server tells the clients the port
236          * that it passively listens to,  and the client issues the data
237          * connection. In the tunneling or direct routing mode, the load
238          * balancer is on the client-to-server half of connection, the port
239          * number is unknown to the load balancer. So, a conn template like
240          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
241          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
242          * is created for other persistent services.
243          */
244         {
245                 int protocol = iph.protocol;
246                 const union nf_inet_addr *vaddr = &iph.daddr;
247                 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
248                 __be16 vport = 0;
249
250                 if (ports[1] == svc->port) {
251                         /* non-FTP template:
252                          * <protocol, caddr, 0, vaddr, vport, daddr, dport>
253                          * FTP template:
254                          * <protocol, caddr, 0, vaddr, 0, daddr, 0>
255                          */
256                         if (svc->port != FTPPORT)
257                                 vport = ports[1];
258                 } else {
259                         /* Note: persistent fwmark-based services and
260                          * persistent port zero service are handled here.
261                          * fwmark template:
262                          * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
263                          * port zero template:
264                          * <protocol,caddr,0,vaddr,0,daddr,0>
265                          */
266                         if (svc->fwmark) {
267                                 protocol = IPPROTO_IP;
268                                 vaddr = &fwmark;
269                         }
270                 }
271                 ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
272                                               vaddr, vport, &param);
273         }
274
275         /* Check if a template already exists */
276         ct = ip_vs_ct_in_get(&param);
277         if (!ct || !ip_vs_check_template(ct)) {
278                 /* No template found or the dest of the connection
279                  * template is not available.
280                  */
281                 dest = svc->scheduler->schedule(svc, skb);
282                 if (!dest) {
283                         IP_VS_DBG(1, "p-schedule: no dest found.\n");
284                         kfree(param.pe_data);
285                         return NULL;
286                 }
287
288                 if (ports[1] == svc->port && svc->port != FTPPORT)
289                         dport = dest->port;
290
291                 /* Create a template
292                  * This adds param.pe_data to the template,
293                  * and thus param.pe_data will be destroyed
294                  * when the template expires */
295                 ct = ip_vs_conn_new(&param, &dest->addr, dport,
296                                     IP_VS_CONN_F_TEMPLATE, dest);
297                 if (ct == NULL) {
298                         kfree(param.pe_data);
299                         return NULL;
300                 }
301
302                 ct->timeout = svc->timeout;
303         } else {
304                 /* set destination with the found template */
305                 dest = ct->dest;
306                 kfree(param.pe_data);
307         }
308
309         dport = ports[1];
310         if (dport == svc->port && dest->port)
311                 dport = dest->port;
312
313         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
314                  && iph.protocol == IPPROTO_UDP)?
315                 IP_VS_CONN_F_ONE_PACKET : 0;
316
317         /*
318          *    Create a new connection according to the template
319          */
320         ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
321                               &iph.daddr, ports[1], &param);
322         cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
323         if (cp == NULL) {
324                 ip_vs_conn_put(ct);
325                 return NULL;
326         }
327
328         /*
329          *    Add its control
330          */
331         ip_vs_control_add(cp, ct);
332         ip_vs_conn_put(ct);
333
334         ip_vs_conn_stats(cp, svc);
335         return cp;
336 }
337
338
339 /*
340  *  IPVS main scheduling function
341  *  It selects a server according to the virtual service, and
342  *  creates a connection entry.
343  *  Protocols supported: TCP, UDP
344  */
345 struct ip_vs_conn *
346 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
347                struct ip_vs_protocol *pp, int *ignored)
348 {
349         struct ip_vs_conn *cp = NULL;
350         struct ip_vs_iphdr iph;
351         struct ip_vs_dest *dest;
352         __be16 _ports[2], *pptr;
353         unsigned int flags;
354
355         *ignored = 1;
356         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
357         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
358         if (pptr == NULL)
359                 return NULL;
360
361         /*
362          * FTPDATA needs this check when using local real server.
363          * Never schedule Active FTPDATA connections from real server.
364          * For LVS-NAT they must be already created. For other methods
365          * with persistence the connection is created on SYN+ACK.
366          */
367         if (pptr[0] == FTPDATA) {
368                 IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA");
369                 return NULL;
370         }
371
372         /*
373          * Do not schedule replies from local real server. It is risky
374          * for fwmark services but mostly for persistent services.
375          */
376         if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
377             (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
378             (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
379                 IP_VS_DBG_PKT(12, pp, skb, 0,
380                               "Not scheduling reply for existing connection");
381                 __ip_vs_conn_put(cp);
382                 return NULL;
383         }
384
385         /*
386          *    Persistent service
387          */
388         if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
389                 *ignored = 0;
390                 return ip_vs_sched_persist(svc, skb, pptr);
391         }
392
393         /*
394          *    Non-persistent service
395          */
396         if (!svc->fwmark && pptr[1] != svc->port) {
397                 if (!svc->port)
398                         pr_err("Schedule: port zero only supported "
399                                "in persistent services, "
400                                "check your ipvs configuration\n");
401                 return NULL;
402         }
403
404         *ignored = 0;
405
406         dest = svc->scheduler->schedule(svc, skb);
407         if (dest == NULL) {
408                 IP_VS_DBG(1, "Schedule: no dest found.\n");
409                 return NULL;
410         }
411
412         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
413                  && iph.protocol == IPPROTO_UDP)?
414                 IP_VS_CONN_F_ONE_PACKET : 0;
415
416         /*
417          *    Create a connection entry.
418          */
419         {
420                 struct ip_vs_conn_param p;
421                 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr,
422                                       pptr[0], &iph.daddr, pptr[1], &p);
423                 cp = ip_vs_conn_new(&p, &dest->addr,
424                                     dest->port ? dest->port : pptr[1],
425                                     flags, dest);
426                 if (!cp)
427                         return NULL;
428         }
429
430         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
431                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
432                       ip_vs_fwd_tag(cp),
433                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
434                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
435                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
436                       cp->flags, atomic_read(&cp->refcnt));
437
438         ip_vs_conn_stats(cp, svc);
439         return cp;
440 }
441
442
443 /*
444  *  Pass or drop the packet.
445  *  Called by ip_vs_in, when the virtual service is available but
446  *  no destination is available for a new connection.
447  */
448 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
449                 struct ip_vs_protocol *pp)
450 {
451         __be16 _ports[2], *pptr;
452         struct ip_vs_iphdr iph;
453         int unicast;
454         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
455
456         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
457         if (pptr == NULL) {
458                 ip_vs_service_put(svc);
459                 return NF_DROP;
460         }
461
462 #ifdef CONFIG_IP_VS_IPV6
463         if (svc->af == AF_INET6)
464                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
465         else
466 #endif
467                 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
468
469         /* if it is fwmark-based service, the cache_bypass sysctl is up
470            and the destination is a non-local unicast, then create
471            a cache_bypass connection entry */
472         if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
473                 int ret, cs;
474                 struct ip_vs_conn *cp;
475                 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
476                                       iph.protocol == IPPROTO_UDP)?
477                                       IP_VS_CONN_F_ONE_PACKET : 0;
478                 union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
479
480                 ip_vs_service_put(svc);
481
482                 /* create a new connection entry */
483                 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
484                 {
485                         struct ip_vs_conn_param p;
486                         ip_vs_conn_fill_param(svc->af, iph.protocol,
487                                               &iph.saddr, pptr[0],
488                                               &iph.daddr, pptr[1], &p);
489                         cp = ip_vs_conn_new(&p, &daddr, 0,
490                                             IP_VS_CONN_F_BYPASS | flags,
491                                             NULL);
492                         if (!cp)
493                                 return NF_DROP;
494                 }
495
496                 /* statistics */
497                 ip_vs_in_stats(cp, skb);
498
499                 /* set state */
500                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
501
502                 /* transmit the first SYN packet */
503                 ret = cp->packet_xmit(skb, cp, pp);
504                 /* do not touch skb anymore */
505
506                 atomic_inc(&cp->in_pkts);
507                 ip_vs_conn_put(cp);
508                 return ret;
509         }
510
511         /*
512          * When the virtual ftp service is presented, packets destined
513          * for other services on the VIP may get here (except services
514          * listed in the ipvs table), pass the packets, because it is
515          * not ipvs job to decide to drop the packets.
516          */
517         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
518                 ip_vs_service_put(svc);
519                 return NF_ACCEPT;
520         }
521
522         ip_vs_service_put(svc);
523
524         /*
525          * Notify the client that the destination is unreachable, and
526          * release the socket buffer.
527          * Since it is in IP layer, the TCP socket is not actually
528          * created, the TCP RST packet cannot be sent, instead that
529          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
530          */
531 #ifdef CONFIG_IP_VS_IPV6
532         if (svc->af == AF_INET6) {
533                 if (!skb->dev) {
534                         struct net *net = dev_net(skb_dst(skb)->dev);
535
536                         skb->dev = net->loopback_dev;
537                 }
538                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
539         } else
540 #endif
541                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
542
543         return NF_DROP;
544 }
545
/*
 * Checksum the packet data from "offset" to the end of the skb
 * (skb->len - offset bytes, initial sum 0) with skb_checksum() and
 * return the csum_fold()ed result.
 */
__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
        return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}
550
551 static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
552 {
553         if (NF_INET_LOCAL_IN == hooknum)
554                 return IP_DEFRAG_VS_IN;
555         if (NF_INET_FORWARD == hooknum)
556                 return IP_DEFRAG_VS_FWD;
557         return IP_DEFRAG_VS_OUT;
558 }
559
560 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
561 {
562         int err = ip_defrag(skb, user);
563
564         if (!err)
565                 ip_send_check(ip_hdr(skb));
566
567         return err;
568 }
569
570 #ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_gather_frags().  Currently a stub: it does
 * not touch the skb and always returns 0.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
        /* TODO IPv6: Find out what to do here for IPv6 */
        return 0;
}
576 #endif
577
578 /*
579  * Packet has been made sufficiently writable in caller
580  * - inout: 1=in->out, 0=out->in
581  */
582 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
583                     struct ip_vs_conn *cp, int inout)
584 {
585         struct iphdr *iph        = ip_hdr(skb);
586         unsigned int icmp_offset = iph->ihl*4;
587         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
588                                                       icmp_offset);
589         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
590
591         if (inout) {
592                 iph->saddr = cp->vaddr.ip;
593                 ip_send_check(iph);
594                 ciph->daddr = cp->vaddr.ip;
595                 ip_send_check(ciph);
596         } else {
597                 iph->daddr = cp->daddr.ip;
598                 ip_send_check(iph);
599                 ciph->saddr = cp->daddr.ip;
600                 ip_send_check(ciph);
601         }
602
603         /* the TCP/UDP/SCTP port */
604         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
605             IPPROTO_SCTP == ciph->protocol) {
606                 __be16 *ports = (void *)ciph + ciph->ihl*4;
607
608                 if (inout)
609                         ports[1] = cp->vport;
610                 else
611                         ports[0] = cp->dport;
612         }
613
614         /* And finally the ICMP checksum */
615         icmph->checksum = 0;
616         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
617         skb->ip_summed = CHECKSUM_UNNECESSARY;
618
619         if (inout)
620                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
621                         "Forwarding altered outgoing ICMP");
622         else
623                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
624                         "Forwarding altered incoming ICMP");
625 }
626
627 #ifdef CONFIG_IP_VS_IPV6
628 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
629                     struct ip_vs_conn *cp, int inout)
630 {
631         struct ipv6hdr *iph      = ipv6_hdr(skb);
632         unsigned int icmp_offset = sizeof(struct ipv6hdr);
633         struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
634                                                       icmp_offset);
635         struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);
636
637         if (inout) {
638                 iph->saddr = cp->vaddr.in6;
639                 ciph->daddr = cp->vaddr.in6;
640         } else {
641                 iph->daddr = cp->daddr.in6;
642                 ciph->saddr = cp->daddr.in6;
643         }
644
645         /* the TCP/UDP/SCTP port */
646         if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
647             IPPROTO_SCTP == ciph->nexthdr) {
648                 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
649
650                 if (inout)
651                         ports[1] = cp->vport;
652                 else
653                         ports[0] = cp->dport;
654         }
655
656         /* And finally the ICMP checksum */
657         icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
658                                               skb->len - icmp_offset,
659                                               IPPROTO_ICMPV6, 0);
660         skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
661         skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
662         skb->ip_summed = CHECKSUM_PARTIAL;
663
664         if (inout)
665                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
666                         "Forwarding altered outgoing ICMPv6");
667         else
668                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
669                         "Forwarding altered incoming ICMPv6");
670 }
671 #endif
672
673 /* Handle relevant response ICMP messages - forward to the right
674  * destination host. Used for NAT and local client.
675  */
676 static int handle_response_icmp(int af, struct sk_buff *skb,
677                                 union nf_inet_addr *snet,
678                                 __u8 protocol, struct ip_vs_conn *cp,
679                                 struct ip_vs_protocol *pp,
680                                 unsigned int offset, unsigned int ihl)
681 {
682         unsigned int verdict = NF_DROP;
683
684         if (IP_VS_FWD_METHOD(cp) != 0) {
685                 pr_err("shouldn't reach here, because the box is on the "
686                        "half connection in the tun/dr module.\n");
687         }
688
689         /* Ensure the checksum is correct */
690         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
691                 /* Failed checksum! */
692                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
693                               IP_VS_DBG_ADDR(af, snet));
694                 goto out;
695         }
696
697         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
698             IPPROTO_SCTP == protocol)
699                 offset += 2 * sizeof(__u16);
700         if (!skb_make_writable(skb, offset))
701                 goto out;
702
703 #ifdef CONFIG_IP_VS_IPV6
704         if (af == AF_INET6)
705                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
706         else
707 #endif
708                 ip_vs_nat_icmp(skb, pp, cp, 1);
709
710 #ifdef CONFIG_IP_VS_IPV6
711         if (af == AF_INET6) {
712                 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
713                         goto out;
714         } else
715 #endif
716                 if ((sysctl_ip_vs_snat_reroute ||
717                      skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
718                     ip_route_me_harder(skb, RTN_LOCAL) != 0)
719                         goto out;
720
721         /* do the statistics and put it back */
722         ip_vs_out_stats(cp, skb);
723
724         skb->ipvs_property = 1;
725         if (!(cp->flags & IP_VS_CONN_F_NFCT))
726                 ip_vs_notrack(skb);
727         else
728                 ip_vs_update_conntrack(skb, cp, 0);
729         verdict = NF_ACCEPT;
730
731 out:
732         __ip_vs_conn_put(cp);
733
734         return verdict;
735 }
736
737 /*
738  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
739  *      Find any that might be relevant, check against existing connections.
740  *      Currently handles error types - unreachable, quench, ttl exceeded.
741  */
742 static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
743                           unsigned int hooknum)
744 {
745         struct iphdr *iph;
746         struct icmphdr  _icmph, *ic;
747         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
748         struct ip_vs_iphdr ciph;
749         struct ip_vs_conn *cp;
750         struct ip_vs_protocol *pp;
751         unsigned int offset, ihl;
752         union nf_inet_addr snet;
753
754         *related = 1;
755
756         /* reassemble IP fragments */
757         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
758                 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
759                         return NF_STOLEN;
760         }
761
762         iph = ip_hdr(skb);
763         offset = ihl = iph->ihl * 4;
764         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
765         if (ic == NULL)
766                 return NF_DROP;
767
768         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
769                   ic->type, ntohs(icmp_id(ic)),
770                   &iph->saddr, &iph->daddr);
771
772         /*
773          * Work through seeing if this is for us.
774          * These checks are supposed to be in an order that means easy
775          * things are checked first to speed up processing.... however
776          * this means that some packets will manage to get a long way
777          * down this stack and then be rejected, but that's life.
778          */
779         if ((ic->type != ICMP_DEST_UNREACH) &&
780             (ic->type != ICMP_SOURCE_QUENCH) &&
781             (ic->type != ICMP_TIME_EXCEEDED)) {
782                 *related = 0;
783                 return NF_ACCEPT;
784         }
785
786         /* Now find the contained IP header */
787         offset += sizeof(_icmph);
788         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
789         if (cih == NULL)
790                 return NF_ACCEPT; /* The packet looks wrong, ignore */
791
792         pp = ip_vs_proto_get(cih->protocol);
793         if (!pp)
794                 return NF_ACCEPT;
795
796         /* Is the embedded protocol header present? */
797         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
798                      pp->dont_defrag))
799                 return NF_ACCEPT;
800
801         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
802
803         offset += cih->ihl * 4;
804
805         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
806         /* The embedded headers contain source and dest in reverse order */
807         cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
808         if (!cp)
809                 return NF_ACCEPT;
810
811         snet.ip = iph->saddr;
812         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
813                                     pp, offset, ihl);
814 }
815
816 #ifdef CONFIG_IP_VS_IPV6
817 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
818                              unsigned int hooknum)
819 {
820         struct ipv6hdr *iph;
821         struct icmp6hdr _icmph, *ic;
822         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
823                                            within the ICMP */
824         struct ip_vs_iphdr ciph;
825         struct ip_vs_conn *cp;
826         struct ip_vs_protocol *pp;
827         unsigned int offset;
828         union nf_inet_addr snet;
829
830         *related = 1;
831
832         /* reassemble IP fragments */
833         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
834                 if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
835                         return NF_STOLEN;
836         }
837
838         iph = ipv6_hdr(skb);
839         offset = sizeof(struct ipv6hdr);
840         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
841         if (ic == NULL)
842                 return NF_DROP;
843
844         IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
845                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
846                   &iph->saddr, &iph->daddr);
847
848         /*
849          * Work through seeing if this is for us.
850          * These checks are supposed to be in an order that means easy
851          * things are checked first to speed up processing.... however
852          * this means that some packets will manage to get a long way
853          * down this stack and then be rejected, but that's life.
854          */
855         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
856             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
857             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
858                 *related = 0;
859                 return NF_ACCEPT;
860         }
861
862         /* Now find the contained IP header */
863         offset += sizeof(_icmph);
864         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
865         if (cih == NULL)
866                 return NF_ACCEPT; /* The packet looks wrong, ignore */
867
868         pp = ip_vs_proto_get(cih->nexthdr);
869         if (!pp)
870                 return NF_ACCEPT;
871
872         /* Is the embedded protocol header present? */
873         /* TODO: we don't support fragmentation at the moment anyways */
874         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
875                 return NF_ACCEPT;
876
877         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
878
879         offset += sizeof(struct ipv6hdr);
880
881         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
882         /* The embedded headers contain source and dest in reverse order */
883         cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
884         if (!cp)
885                 return NF_ACCEPT;
886
887         ipv6_addr_copy(&snet.in6, &iph->saddr);
888         return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
889                                     pp, offset, sizeof(struct ipv6hdr));
890 }
891 #endif
892
893 /*
894  * Check if sctp chunc is ABORT chunk
895  */
896 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
897 {
898         sctp_chunkhdr_t *sch, schunk;
899         sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
900                         sizeof(schunk), &schunk);
901         if (sch == NULL)
902                 return 0;
903         if (sch->type == SCTP_CID_ABORT)
904                 return 1;
905         return 0;
906 }
907
908 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
909 {
910         struct tcphdr _tcph, *th;
911
912         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
913         if (th == NULL)
914                 return 0;
915         return th->rst;
916 }
917
/* Handle response packets: rewrite addresses and send away...
 * Used for NAT and local client.
 *
 * @af:  address family of the packet (AF_INET or AF_INET6)
 * @skb: the response packet
 * @pp:  protocol handler; its optional snat_handler mangles the
 *       transport header
 * @cp:  connection the packet belongs to; the reference is released
 *       with ip_vs_conn_put() on every path through this function
 * @ihl: number of leading bytes that must be writable before mangling
 *
 * Returns NF_ACCEPT after the source address was replaced by cp->vaddr,
 * or NF_STOLEN after the skb was freed because one of the steps failed.
 */
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
                struct ip_vs_conn *cp, int ihl)
{
        IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");

        if (!skb_make_writable(skb, ihl))
                goto drop;

        /* mangle the packet */
        if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
                goto drop;

        /* Replace the source address with the virtual address */
#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                ipv6_hdr(skb)->saddr = cp->vaddr.in6;
        else
#endif
        {
                ip_hdr(skb)->saddr = cp->vaddr.ip;
                /* IPv4 only: the header changed, refresh its checksum */
                ip_send_check(ip_hdr(skb));
        }

        /*
         * nf_iterate does not expect change in the skb->dst->dev.
         * It looks like it is not fatal to enable this code for hooks
         * where our handlers are at the end of the chain list and
         * when all next handlers use skb->dst->dev and not outdev.
         * It will definitely route properly the inout NAT traffic
         * when multiple paths are used.
         */

        /* For policy routing, packets originating from this
         * machine itself may be routed differently to packets
         * passing through.  We want this packet to be routed as
         * if it came from this machine itself.  So re-compute
         * the routing information.
         */
#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                /* IPv6: reroute only when the snat_reroute sysctl is set */
                if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0)
                        goto drop;
        } else
#endif
                /*
                 * IPv4: reroute when the sysctl is set or when the
                 * current route has the RTCF_LOCAL flag.
                 */
                if ((sysctl_ip_vs_snat_reroute ||
                     skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
                    ip_route_me_harder(skb, RTN_LOCAL) != 0)
                        goto drop;

        IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");

        ip_vs_out_stats(cp, skb);
        ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
        /* Mark the skb so that ip_vs_in()/ip_vs_out() skip it from now on */
        skb->ipvs_property = 1;
        if (!(cp->flags & IP_VS_CONN_F_NFCT))
                ip_vs_notrack(skb);
        else
                ip_vs_update_conntrack(skb, cp, 0);
        ip_vs_conn_put(cp);

        LeaveFunction(11);
        return NF_ACCEPT;

drop:
        /* The skb is freed here, so report it as stolen, not dropped */
        ip_vs_conn_put(cp);
        kfree_skb(skb);
        LeaveFunction(11);
        return NF_STOLEN;
}
990
/*
 *      Check if outgoing packet belongs to the established ip_vs_conn.
 *
 *      @hooknum: netfilter hook the packet is traversing
 *      @skb:     the packet
 *      @af:      address family (AF_INET or AF_INET6)
 *
 *      Related ICMP/ICMPv6 errors are passed to ip_vs_out_icmp() or
 *      ip_vs_out_icmp_v6().  A packet matching a connection is handed
 *      to handle_response().  When no connection matches but the source
 *      address and port belong to a real service, an ICMP port
 *      unreachable may be sent back (see below) and the packet dropped.
 *      Everything else is accepted unchanged.
 */
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
{
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn *cp;

        EnterFunction(11);

        /* Already marked as IPVS request or reply? */
        if (skb->ipvs_property)
                return NF_ACCEPT;

        /* Bad... Do not break raw sockets */
        if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
                     af == AF_INET)) {
                struct sock *sk = skb->sk;
                struct inet_sock *inet = inet_sk(skb->sk);

                /* Sockets that asked for no defragmentation are left alone */
                if (inet && sk->sk_family == PF_INET && inet->nodefrag)
                        return NF_ACCEPT;
        }

        /* Nothing to do for packets without a route attached */
        if (unlikely(!skb_dst(skb)))
                return NF_ACCEPT;

        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
                        int related;
                        int verdict = ip_vs_out_icmp_v6(skb, &related,
                                                        hooknum);

                        if (related)
                                return verdict;
                        /* Not related: reload the header and continue */
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }
        } else
#endif
                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
                        int related;
                        int verdict = ip_vs_out_icmp(skb, &related, hooknum);

                        if (related)
                                return verdict;
                        /* Not related: reload the header and continue */
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }

        pp = ip_vs_proto_get(iph.protocol);
        if (unlikely(!pp))
                return NF_ACCEPT;

        /* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
                        if (ip_vs_gather_frags_v6(skb,
                                                  ip_vs_defrag_user(hooknum)))
                                return NF_STOLEN;
                }

                ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
        } else
#endif
                if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
                             !pp->dont_defrag)) {
                        if (ip_vs_gather_frags(skb,
                                               ip_vs_defrag_user(hooknum)))
                                return NF_STOLEN;

                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }

        /*
         * Check if the packet belongs to an existing entry
         */
        cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);

        if (likely(cp))
                return handle_response(af, skb, pp, cp, iph.len);
        /*
         * No connection entry.  Optionally (nat_icmp_send sysctl) answer
         * TCP/UDP/SCTP packets coming from a known real service with an
         * ICMP port unreachable.
         */
        if (sysctl_ip_vs_nat_icmp_send &&
            (pp->protocol == IPPROTO_TCP ||
             pp->protocol == IPPROTO_UDP ||
             pp->protocol == IPPROTO_SCTP)) {
                __be16 _ports[2], *pptr;

                pptr = skb_header_pointer(skb, iph.len,
                                          sizeof(_ports), _ports);
                if (pptr == NULL)
                        return NF_ACCEPT;       /* Not for me */
                /* pptr[0] is the source port of the packet */
                if (ip_vs_lookup_real_service(af, iph.protocol,
                                              &iph.saddr,
                                              pptr[0])) {
                        /*
                         * There is no existing entry: notify the real
                         * server, unless the packet is a TCP RST or an
                         * SCTP ABORT.
                         */
                        if ((iph.protocol != IPPROTO_TCP &&
                             iph.protocol != IPPROTO_SCTP)
                             || ((iph.protocol == IPPROTO_TCP
                                  && !is_tcp_reset(skb, iph.len))
                                 || (iph.protocol == IPPROTO_SCTP
                                        && !is_sctp_abort(skb,
                                                iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
                                if (af == AF_INET6) {
                                        struct net *net =
                                                dev_net(skb_dst(skb)->dev);

                                        /*
                                         * skb->dev may be unset here;
                                         * fall back to the loopback
                                         * device before icmpv6_send()
                                         */
                                        if (!skb->dev)
                                                skb->dev = net->loopback_dev;
                                        icmpv6_send(skb,
                                                    ICMPV6_DEST_UNREACH,
                                                    ICMPV6_PORT_UNREACH,
                                                    0);
                                } else
#endif
                                        icmp_send(skb,
                                                  ICMP_DEST_UNREACH,
                                                  ICMP_PORT_UNREACH, 0);
                                return NF_DROP;
                        }
                }
        }
        IP_VS_DBG_PKT(12, pp, skb, 0,
                      "ip_vs_out: packet continues traversal as normal");
        return NF_ACCEPT;
}
1124
1125 /*
1126  *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1127  *      used only for VS/NAT.
1128  *      Check if packet is reply for established ip_vs_conn.
1129  */
1130 static unsigned int
1131 ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
1132              const struct net_device *in, const struct net_device *out,
1133              int (*okfn)(struct sk_buff *))
1134 {
1135         return ip_vs_out(hooknum, skb, AF_INET);
1136 }
1137
1138 /*
1139  *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1140  *      Check if packet is reply for established ip_vs_conn.
1141  */
1142 static unsigned int
1143 ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
1144                    const struct net_device *in, const struct net_device *out,
1145                    int (*okfn)(struct sk_buff *))
1146 {
1147         unsigned int verdict;
1148
1149         /* Disable BH in LOCAL_OUT until all places are fixed */
1150         local_bh_disable();
1151         verdict = ip_vs_out(hooknum, skb, AF_INET);
1152         local_bh_enable();
1153         return verdict;
1154 }
1155
1156 #ifdef CONFIG_IP_VS_IPV6
1157
1158 /*
1159  *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1160  *      used only for VS/NAT.
1161  *      Check if packet is reply for established ip_vs_conn.
1162  */
1163 static unsigned int
1164 ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
1165              const struct net_device *in, const struct net_device *out,
1166              int (*okfn)(struct sk_buff *))
1167 {
1168         return ip_vs_out(hooknum, skb, AF_INET6);
1169 }
1170
1171 /*
1172  *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1173  *      Check if packet is reply for established ip_vs_conn.
1174  */
1175 static unsigned int
1176 ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1177                    const struct net_device *in, const struct net_device *out,
1178                    int (*okfn)(struct sk_buff *))
1179 {
1180         unsigned int verdict;
1181
1182         /* Disable BH in LOCAL_OUT until all places are fixed */
1183         local_bh_disable();
1184         verdict = ip_vs_out(hooknum, skb, AF_INET6);
1185         local_bh_enable();
1186         return verdict;
1187 }
1188
1189 #endif
1190
1191 /*
1192  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1193  *      Find any that might be relevant, check against existing connections,
1194  *      forward to the right destination host if relevant.
1195  *      Currently handles error types - unreachable, quench, ttl exceeded.
1196  */
1197 static int
1198 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1199 {
1200         struct iphdr *iph;
1201         struct icmphdr  _icmph, *ic;
1202         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1203         struct ip_vs_iphdr ciph;
1204         struct ip_vs_conn *cp;
1205         struct ip_vs_protocol *pp;
1206         unsigned int offset, ihl, verdict;
1207         union nf_inet_addr snet;
1208
1209         *related = 1;
1210
1211         /* reassemble IP fragments */
1212         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1213                 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1214                         return NF_STOLEN;
1215         }
1216
1217         iph = ip_hdr(skb);
1218         offset = ihl = iph->ihl * 4;
1219         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1220         if (ic == NULL)
1221                 return NF_DROP;
1222
1223         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1224                   ic->type, ntohs(icmp_id(ic)),
1225                   &iph->saddr, &iph->daddr);
1226
1227         /*
1228          * Work through seeing if this is for us.
1229          * These checks are supposed to be in an order that means easy
1230          * things are checked first to speed up processing.... however
1231          * this means that some packets will manage to get a long way
1232          * down this stack and then be rejected, but that's life.
1233          */
1234         if ((ic->type != ICMP_DEST_UNREACH) &&
1235             (ic->type != ICMP_SOURCE_QUENCH) &&
1236             (ic->type != ICMP_TIME_EXCEEDED)) {
1237                 *related = 0;
1238                 return NF_ACCEPT;
1239         }
1240
1241         /* Now find the contained IP header */
1242         offset += sizeof(_icmph);
1243         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1244         if (cih == NULL)
1245                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1246
1247         pp = ip_vs_proto_get(cih->protocol);
1248         if (!pp)
1249                 return NF_ACCEPT;
1250
1251         /* Is the embedded protocol header present? */
1252         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1253                      pp->dont_defrag))
1254                 return NF_ACCEPT;
1255
1256         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1257
1258         offset += cih->ihl * 4;
1259
1260         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1261         /* The embedded headers contain source and dest in reverse order */
1262         cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1263         if (!cp) {
1264                 /* The packet could also belong to a local client */
1265                 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1266                 if (cp) {
1267                         snet.ip = iph->saddr;
1268                         return handle_response_icmp(AF_INET, skb, &snet,
1269                                                     cih->protocol, cp, pp,
1270                                                     offset, ihl);
1271                 }
1272                 return NF_ACCEPT;
1273         }
1274
1275         verdict = NF_DROP;
1276
1277         /* Ensure the checksum is correct */
1278         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1279                 /* Failed checksum! */
1280                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1281                           &iph->saddr);
1282                 goto out;
1283         }
1284
1285         /* do the statistics and put it back */
1286         ip_vs_in_stats(cp, skb);
1287         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1288                 offset += 2 * sizeof(__u16);
1289         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1290         /* LOCALNODE from FORWARD hook is not supported */
1291         if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1292             skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
1293                 IP_VS_DBG(1, "%s(): "
1294                           "local delivery to %pI4 but in FORWARD\n",
1295                           __func__, &skb_rtable(skb)->rt_dst);
1296                 verdict = NF_DROP;
1297         }
1298
1299   out:
1300         __ip_vs_conn_put(cp);
1301
1302         return verdict;
1303 }
1304
1305 #ifdef CONFIG_IP_VS_IPV6
/*
 *      Handle ICMPv6 messages in the outside-to-inside direction (incoming).
 *      Handles the error types DEST_UNREACH, PKT_TOOBIG and TIME_EXCEED.
 *
 *      @skb:     the ICMPv6 packet
 *      @related: set to 1 when the packet was handled as an IPVS related
 *                ICMPv6 error (the return value is then the verdict), set
 *                to 0 when the ICMPv6 type is not one handled here
 *      @hooknum: netfilter hook the packet is traversing
 *
 *      Returns a netfilter verdict.
 *
 *      NOTE(review): unlike ip_vs_in_icmp(), no checksum verification is
 *      done here before the packet is forwarded - confirm this is intended.
 */
static int
ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
{
        struct ipv6hdr *iph;
        struct icmp6hdr _icmph, *ic;
        struct ipv6hdr  _ciph, *cih;    /* The ip header contained
                                           within the ICMP */
        struct ip_vs_iphdr ciph;
        struct ip_vs_conn *cp;
        struct ip_vs_protocol *pp;
        unsigned int offset, verdict;
        union nf_inet_addr snet;
        struct rt6_info *rt;

        *related = 1;

        /* reassemble IP fragments */
        if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
                if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
                        return NF_STOLEN;
        }

        iph = ipv6_hdr(skb);
        /* The ICMPv6 header is read directly behind the fixed IPv6 header */
        offset = sizeof(struct ipv6hdr);
        ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
        if (ic == NULL)
                return NF_DROP;

        IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
                  ic->icmp6_type, ntohs(icmpv6_id(ic)),
                  &iph->saddr, &iph->daddr);

        /*
         * Work through seeing if this is for us.
         * These checks are supposed to be in an order that means easy
         * things are checked first to speed up processing.... however
         * this means that some packets will manage to get a long way
         * down this stack and then be rejected, but that's life.
         */
        if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
            (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
            (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
                *related = 0;
                return NF_ACCEPT;
        }

        /* Now find the contained IP header */
        offset += sizeof(_icmph);
        cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
        if (cih == NULL)
                return NF_ACCEPT; /* The packet looks wrong, ignore */

        pp = ip_vs_proto_get(cih->nexthdr);
        if (!pp)
                return NF_ACCEPT;

        /* Is the embedded protocol header present? */
        /* TODO: we don't support fragmentation at the moment anyways */
        if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
                return NF_ACCEPT;

        IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");

        /* Step over the embedded IPv6 header to the transport header */
        offset += sizeof(struct ipv6hdr);

        ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
        /* The embedded headers contain source and dest in reverse order */
        cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
        if (!cp) {
                /* The packet could also belong to a local client */
                cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
                if (cp) {
                        ipv6_addr_copy(&snet.in6, &iph->saddr);
                        return handle_response_icmp(AF_INET6, skb, &snet,
                                                    cih->nexthdr,
                                                    cp, pp, offset,
                                                    sizeof(struct ipv6hdr));
                }
                return NF_ACCEPT;
        }

        /* This value is always overwritten by the xmit result below */
        verdict = NF_DROP;

        /* do the statistics and put it back */
        ip_vs_in_stats(cp, skb);
        /* Include the two 16-bit port fields of the embedded header */
        if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
            IPPROTO_SCTP == cih->nexthdr)
                offset += 2 * sizeof(__u16);
        verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
        /* LOCALNODE from FORWARD hook is not supported */
        if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
            (rt = (struct rt6_info *) skb_dst(skb)) &&
            rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) {
                IP_VS_DBG(1, "%s(): "
                          "local delivery to %pI6 but in FORWARD\n",
                          __func__, &rt->rt6i_dst);
                verdict = NF_DROP;
        }

        /* Release the reference taken by conn_in_get() */
        __ip_vs_conn_put(cp);

        return verdict;
}
1409 #endif
1410
1411
/*
 *      Check if it's for virtual services, look it up,
 *      and send it on its way...
 *
 *      @hooknum: netfilter hook the packet is traversing
 *      @skb:     the packet
 *      @af:      address family (AF_INET or AF_INET6)
 *
 *      Returns a netfilter verdict: NF_ACCEPT for packets IPVS does not
 *      handle, the verdict produced by the scheduler or by the ICMP
 *      handlers, NF_DROP when the destination is unavailable, otherwise
 *      the result of the connection's packet_xmit method.
 */
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
{
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn *cp;
        int ret, restart, pkts;

        /* Already marked as IPVS request or reply? */
        if (skb->ipvs_property)
                return NF_ACCEPT;

        /*
         *      Early filter:
         *      - remote client: only PACKET_HOST
         *      - route: used for struct net when skb->dev is unset
         */
        if (unlikely((skb->pkt_type != PACKET_HOST &&
                      hooknum != NF_INET_LOCAL_OUT) ||
                     !skb_dst(skb))) {
                /* The header is filled here only for the debug message */
                ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
                              " ignored in hook %u\n",
                              skb->pkt_type, iph.protocol,
                              IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
                return NF_ACCEPT;
        }
        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

        /* Bad... Do not break raw sockets */
        if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
                     af == AF_INET)) {
                struct sock *sk = skb->sk;
                struct inet_sock *inet = inet_sk(skb->sk);

                /* Sockets that asked for no defragmentation are left alone */
                if (inet && sk->sk_family == PF_INET && inet->nodefrag)
                        return NF_ACCEPT;
        }

#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
                        int related;
                        int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);

                        if (related)
                                return verdict;
                        /* Not related: reload the header and continue */
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }
        } else
#endif
                if (unlikely(iph.protocol == IPPROTO_ICMP)) {
                        int related;
                        int verdict = ip_vs_in_icmp(skb, &related, hooknum);

                        if (related)
                                return verdict;
                        /* Not related: reload the header and continue */
                        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }

        /* Protocol supported? */
        pp = ip_vs_proto_get(iph.protocol);
        if (unlikely(!pp))
                return NF_ACCEPT;

        /*
         * Check if the packet belongs to an existing connection entry
         */
        cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);

        if (unlikely(!cp)) {
                int v;

                /*
                 * No entry yet: let the protocol schedule one.  A zero
                 * return means the verdict in v is final.
                 */
                if (!pp->conn_schedule(af, skb, pp, &v, &cp))
                        return v;
        }

        if (unlikely(!cp)) {
                /* sorry, all this trouble for a no-hit :) */
                IP_VS_DBG_PKT(12, pp, skb, 0,
                              "ip_vs_in: packet continues traversal as normal");
                return NF_ACCEPT;
        }

        IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");

        /* Check the server status */
        if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
                /* the destination server is not available */

                if (sysctl_ip_vs_expire_nodest_conn) {
                        /* try to expire the connection immediately */
                        ip_vs_conn_expire_now(cp);
                }
                /* don't restart its timer, and silently
                   drop the packet. */
                __ip_vs_conn_put(cp);
                return NF_DROP;
        }

        ip_vs_in_stats(cp, skb);
        /* NOTE(review): restart is assigned but not read in this function */
        restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
        if (cp->packet_xmit)
                ret = cp->packet_xmit(skb, cp, pp);
                /* do not touch skb anymore */
        else {
                IP_VS_DBG_RL("warning: packet_xmit is null");
                ret = NF_ACCEPT;
        }

        /* Increase its packet counter and check if it is needed
         * to be synchronized
         *
         * Sync connection if it is about to close to
         * encourage the standby servers to update the connections timeout
         */
        pkts = atomic_add_return(1, &cp->in_pkts);
        /* SCTP (IPv4, master sync state only) */
        if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
            cp->protocol == IPPROTO_SCTP) {
                /*
                 * Sync on the packet-count threshold while established,
                 * or on a state change into one of the closing states.
                 */
                if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
                        (pkts % sysctl_ip_vs_sync_threshold[1]
                         == sysctl_ip_vs_sync_threshold[0])) ||
                                (cp->old_state != cp->state &&
                                 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
                                  (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
                                  (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
                        ip_vs_sync_conn(cp);
                        goto out;
                }
        }

        /* Keep this block last: TCP and others with pp->num_states <= 1 */
        else if (af == AF_INET &&
            (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
            (((cp->protocol != IPPROTO_TCP ||
               cp->state == IP_VS_TCP_S_ESTABLISHED) &&
              (pkts % sysctl_ip_vs_sync_threshold[1]
               == sysctl_ip_vs_sync_threshold[0])) ||
             ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
              ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
               (cp->state == IP_VS_TCP_S_CLOSE) ||
               (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
               (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
                ip_vs_sync_conn(cp);
out:
        /* Remember the state so the next packet can detect a transition */
        cp->old_state = cp->state;

        ip_vs_conn_put(cp);
        return ret;
}
1566
1567 /*
1568  *      AF_INET handler in NF_INET_LOCAL_IN chain
1569  *      Schedule and forward packets from remote clients
1570  */
1571 static unsigned int
1572 ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
1573                       const struct net_device *in,
1574                       const struct net_device *out,
1575                       int (*okfn)(struct sk_buff *))
1576 {
1577         return ip_vs_in(hooknum, skb, AF_INET);
1578 }
1579
1580 /*
1581  *      AF_INET handler in NF_INET_LOCAL_OUT chain
1582  *      Schedule and forward packets from local clients
1583  */
1584 static unsigned int
1585 ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
1586                      const struct net_device *in, const struct net_device *out,
1587                      int (*okfn)(struct sk_buff *))
1588 {
1589         unsigned int verdict;
1590
1591         /* Disable BH in LOCAL_OUT until all places are fixed */
1592         local_bh_disable();
1593         verdict = ip_vs_in(hooknum, skb, AF_INET);
1594         local_bh_enable();
1595         return verdict;
1596 }
1597
1598 #ifdef CONFIG_IP_VS_IPV6
1599
1600 /*
1601  *      AF_INET6 handler in NF_INET_LOCAL_IN chain
1602  *      Schedule and forward packets from remote clients
1603  */
1604 static unsigned int
1605 ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
1606                       const struct net_device *in,
1607                       const struct net_device *out,
1608                       int (*okfn)(struct sk_buff *))
1609 {
1610         return ip_vs_in(hooknum, skb, AF_INET6);
1611 }
1612
1613 /*
1614  *      AF_INET6 handler in NF_INET_LOCAL_OUT chain
1615  *      Schedule and forward packets from local clients
1616  */
1617 static unsigned int
1618 ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
1619                      const struct net_device *in, const struct net_device *out,
1620                      int (*okfn)(struct sk_buff *))
1621 {
1622         unsigned int verdict;
1623
1624         /* Disable BH in LOCAL_OUT until all places are fixed */
1625         local_bh_disable();
1626         verdict = ip_vs_in(hooknum, skb, AF_INET6);
1627         local_bh_enable();
1628         return verdict;
1629 }
1630
1631 #endif
1632
1633
1634 /*
1635  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1636  *      related packets destined for 0.0.0.0/0.
1637  *      When fwmark-based virtual service is used, such as transparent
1638  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1639  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1640  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1641  *      and send them to ip_vs_in_icmp.
1642  */
1643 static unsigned int
1644 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1645                    const struct net_device *in, const struct net_device *out,
1646                    int (*okfn)(struct sk_buff *))
1647 {
1648         int r;
1649
1650         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1651                 return NF_ACCEPT;
1652
1653         return ip_vs_in_icmp(skb, &r, hooknum);
1654 }
1655
1656 #ifdef CONFIG_IP_VS_IPV6
1657 static unsigned int
1658 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1659                       const struct net_device *in, const struct net_device *out,
1660                       int (*okfn)(struct sk_buff *))
1661 {
1662         int r;
1663
1664         if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1665                 return NF_ACCEPT;
1666
1667         return ip_vs_in_icmp_v6(skb, &r, hooknum);
1668 }
1669 #endif
1670
1671
1672 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1673         /* After packet filtering, change source only for VS/NAT */
1674         {
1675                 .hook           = ip_vs_reply4,
1676                 .owner          = THIS_MODULE,
1677                 .pf             = PF_INET,
1678                 .hooknum        = NF_INET_LOCAL_IN,
1679                 .priority       = 99,
1680         },
1681         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1682          * or VS/NAT(change destination), so that filtering rules can be
1683          * applied to IPVS. */
1684         {
1685                 .hook           = ip_vs_remote_request4,
1686                 .owner          = THIS_MODULE,
1687                 .pf             = PF_INET,
1688                 .hooknum        = NF_INET_LOCAL_IN,
1689                 .priority       = 101,
1690         },
1691         /* Before ip_vs_in, change source only for VS/NAT */
1692         {
1693                 .hook           = ip_vs_local_reply4,
1694                 .owner          = THIS_MODULE,
1695                 .pf             = PF_INET,
1696                 .hooknum        = NF_INET_LOCAL_OUT,
1697                 .priority       = -99,
1698         },
1699         /* After mangle, schedule and forward local requests */
1700         {
1701                 .hook           = ip_vs_local_request4,
1702                 .owner          = THIS_MODULE,
1703                 .pf             = PF_INET,
1704                 .hooknum        = NF_INET_LOCAL_OUT,
1705                 .priority       = -98,
1706         },
1707         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1708          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1709         {
1710                 .hook           = ip_vs_forward_icmp,
1711                 .owner          = THIS_MODULE,
1712                 .pf             = PF_INET,
1713                 .hooknum        = NF_INET_FORWARD,
1714                 .priority       = 99,
1715         },
1716         /* After packet filtering, change source only for VS/NAT */
1717         {
1718                 .hook           = ip_vs_reply4,
1719                 .owner          = THIS_MODULE,
1720                 .pf             = PF_INET,
1721                 .hooknum        = NF_INET_FORWARD,
1722                 .priority       = 100,
1723         },
1724 #ifdef CONFIG_IP_VS_IPV6
1725         /* After packet filtering, change source only for VS/NAT */
1726         {
1727                 .hook           = ip_vs_reply6,
1728                 .owner          = THIS_MODULE,
1729                 .pf             = PF_INET6,
1730                 .hooknum        = NF_INET_LOCAL_IN,
1731                 .priority       = 99,
1732         },
1733         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1734          * or VS/NAT(change destination), so that filtering rules can be
1735          * applied to IPVS. */
1736         {
1737                 .hook           = ip_vs_remote_request6,
1738                 .owner          = THIS_MODULE,
1739                 .pf             = PF_INET6,
1740                 .hooknum        = NF_INET_LOCAL_IN,
1741                 .priority       = 101,
1742         },
1743         /* Before ip_vs_in, change source only for VS/NAT */
1744         {
1745                 .hook           = ip_vs_local_reply6,
1746                 .owner          = THIS_MODULE,
1747                 .pf             = PF_INET,
1748                 .hooknum        = NF_INET_LOCAL_OUT,
1749                 .priority       = -99,
1750         },
1751         /* After mangle, schedule and forward local requests */
1752         {
1753                 .hook           = ip_vs_local_request6,
1754                 .owner          = THIS_MODULE,
1755                 .pf             = PF_INET6,
1756                 .hooknum        = NF_INET_LOCAL_OUT,
1757                 .priority       = -98,
1758         },
1759         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1760          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1761         {
1762                 .hook           = ip_vs_forward_icmp_v6,
1763                 .owner          = THIS_MODULE,
1764                 .pf             = PF_INET6,
1765                 .hooknum        = NF_INET_FORWARD,
1766                 .priority       = 99,
1767         },
1768         /* After packet filtering, change source only for VS/NAT */
1769         {
1770                 .hook           = ip_vs_reply6,
1771                 .owner          = THIS_MODULE,
1772                 .pf             = PF_INET6,
1773                 .hooknum        = NF_INET_FORWARD,
1774                 .priority       = 100,
1775         },
1776 #endif
1777 };
1778
1779
1780 /*
1781  *      Initialize IP Virtual Server
1782  */
1783 static int __init ip_vs_init(void)
1784 {
1785         int ret;
1786
1787         ip_vs_estimator_init();
1788
1789         ret = ip_vs_control_init();
1790         if (ret < 0) {
1791                 pr_err("can't setup control.\n");
1792                 goto cleanup_estimator;
1793         }
1794
1795         ip_vs_protocol_init();
1796
1797         ret = ip_vs_app_init();
1798         if (ret < 0) {
1799                 pr_err("can't setup application helper.\n");
1800                 goto cleanup_protocol;
1801         }
1802
1803         ret = ip_vs_conn_init();
1804         if (ret < 0) {
1805                 pr_err("can't setup connection table.\n");
1806                 goto cleanup_app;
1807         }
1808
1809         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1810         if (ret < 0) {
1811                 pr_err("can't register hooks.\n");
1812                 goto cleanup_conn;
1813         }
1814
1815         pr_info("ipvs loaded.\n");
1816         return ret;
1817
1818   cleanup_conn:
1819         ip_vs_conn_cleanup();
1820   cleanup_app:
1821         ip_vs_app_cleanup();
1822   cleanup_protocol:
1823         ip_vs_protocol_cleanup();
1824         ip_vs_control_cleanup();
1825   cleanup_estimator:
1826         ip_vs_estimator_cleanup();
1827         return ret;
1828 }
1829
1830 static void __exit ip_vs_cleanup(void)
1831 {
1832         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1833         ip_vs_conn_cleanup();
1834         ip_vs_app_cleanup();
1835         ip_vs_protocol_cleanup();
1836         ip_vs_control_cleanup();
1837         ip_vs_estimator_cleanup();
1838         pr_info("ipvs unloaded.\n");
1839 }
1840
1841 module_init(ip_vs_init);
1842 module_exit(ip_vs_cleanup);
1843 MODULE_LICENSE("GPL");