net/netfilter/ipvs/ip_vs_core.c

   1 /*
   2  * IPVS         An implementation of the IP virtual server support for the
   3  *              LINUX operating system.  IPVS is now implemented as a module
   4  *              over the Netfilter framework. IPVS can be used to build a
   5  *              high-performance and highly available server based on a
   6  *              cluster of servers.
   7  *
   8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   9  *              Peter Kese <peter.kese@ijs.si>
  10  *              Julian Anastasov <ja@ssi.bg>
  11  *
  12  *              This program is free software; you can redistribute it and/or
  13  *              modify it under the terms of the GNU General Public License
  14  *              as published by the Free Software Foundation; either version
  15  *              2 of the License, or (at your option) any later version.
  16  *
  17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
  19  * and others.
  20  *
  21  * Changes:
  22  *      Paul `Rusty' Russell            properly handle non-linear skbs
  23  *      Harald Welte                    don't use nfcache
  24  *
  25  */
  26
  27 #define KMSG_COMPONENT "IPVS"
  28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  29
  30 #include <linux/module.h>
  31 #include <linux/kernel.h>
  32 #include <linux/ip.h>
  33 #include <linux/tcp.h>
  34 #include <linux/sctp.h>
  35 #include <linux/icmp.h>
  36 #include <linux/slab.h>
  37
  38 #include <net/ip.h>
  39 #include <net/tcp.h>
  40 #include <net/udp.h>
  41 #include <net/icmp.h>                   /* for icmp_send */
  42 #include <net/route.h>
  43 #include <net/ip6_checksum.h>
  44
  45 #include <linux/netfilter.h>
  46 #include <linux/netfilter_ipv4.h>
  47
  48 #ifdef CONFIG_IP_VS_IPV6
  49 #include <net/ipv6.h>
  50 #include <linux/netfilter_ipv6.h>
  51 #endif
  52
  53 #include <net/ip_vs.h>
  54
  55
  56 EXPORT_SYMBOL(register_ip_vs_scheduler);
  57 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
  58 EXPORT_SYMBOL(ip_vs_proto_name);
  59 EXPORT_SYMBOL(ip_vs_conn_new);
  60 EXPORT_SYMBOL(ip_vs_conn_in_get);
  61 EXPORT_SYMBOL(ip_vs_conn_out_get);
  62 #ifdef CONFIG_IP_VS_PROTO_TCP
  63 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
  64 #endif
  65 EXPORT_SYMBOL(ip_vs_conn_put);
  66 #ifdef CONFIG_IP_VS_DEBUG
  67 EXPORT_SYMBOL(ip_vs_get_debug_level);
  68 #endif
  69
  70
  71 /* ID used in ICMP lookups */
  72 #define icmp_id(icmph)          (((icmph)->un).echo.id)
  73 #define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
  74
  75 const char *ip_vs_proto_name(unsigned proto)
  76 {
  77         static char buf[20];
  78
  79         switch (proto) {
  80         case IPPROTO_IP:
  81                 return "IP";
  82         case IPPROTO_UDP:
  83                 return "UDP";
  84         case IPPROTO_TCP:
  85                 return "TCP";
  86         case IPPROTO_SCTP:
  87                 return "SCTP";
  88         case IPPROTO_ICMP:
  89                 return "ICMP";
  90 #ifdef CONFIG_IP_VS_IPV6
  91         case IPPROTO_ICMPV6:
  92                 return "ICMPv6";
  93 #endif
  94         default:
  95                 sprintf(buf, "IP_%d", proto);
  96                 return buf;
  97         }
  98 }
  99
 100 void ip_vs_init_hash_table(struct list_head *table, int rows)
 101 {
 102         while (--rows >= 0)
 103                 INIT_LIST_HEAD(&table[rows]);
 104 }
 105
 106 static inline void
 107 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 108 {
 109         struct ip_vs_dest *dest = cp->dest;
 110         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 111                 spin_lock(&dest->stats.lock);
 112                 dest->stats.ustats.inpkts++;
 113                 dest->stats.ustats.inbytes += skb->len;
 114                 spin_unlock(&dest->stats.lock);
 115
 116                 spin_lock(&dest->svc->stats.lock);
 117                 dest->svc->stats.ustats.inpkts++;
 118                 dest->svc->stats.ustats.inbytes += skb->len;
 119                 spin_unlock(&dest->svc->stats.lock);
 120
 121                 spin_lock(&ip_vs_stats.lock);
 122                 ip_vs_stats.ustats.inpkts++;
 123                 ip_vs_stats.ustats.inbytes += skb->len;
 124                 spin_unlock(&ip_vs_stats.lock);
 125         }
 126 }
 127
 128
 129 static inline void
 130 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 131 {
 132         struct ip_vs_dest *dest = cp->dest;
 133         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 134                 spin_lock(&dest->stats.lock);
 135                 dest->stats.ustats.outpkts++;
 136                 dest->stats.ustats.outbytes += skb->len;
 137                 spin_unlock(&dest->stats.lock);
 138
 139                 spin_lock(&dest->svc->stats.lock);
 140                 dest->svc->stats.ustats.outpkts++;
 141                 dest->svc->stats.ustats.outbytes += skb->len;
 142                 spin_unlock(&dest->svc->stats.lock);
 143
 144                 spin_lock(&ip_vs_stats.lock);
 145                 ip_vs_stats.ustats.outpkts++;
 146                 ip_vs_stats.ustats.outbytes += skb->len;
 147                 spin_unlock(&ip_vs_stats.lock);
 148         }
 149 }
 150
 151
 152 static inline void
 153 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 154 {
 155         spin_lock(&cp->dest->stats.lock);
 156         cp->dest->stats.ustats.conns++;
 157         spin_unlock(&cp->dest->stats.lock);
 158
 159         spin_lock(&svc->stats.lock);
 160         svc->stats.ustats.conns++;
 161         spin_unlock(&svc->stats.lock);
 162
 163         spin_lock(&ip_vs_stats.lock);
 164         ip_vs_stats.ustats.conns++;
 165         spin_unlock(&ip_vs_stats.lock);
 166 }
 167
 168
 169 static inline int
 170 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 171                 const struct sk_buff *skb,
 172                 struct ip_vs_protocol *pp)
 173 {
 174         if (unlikely(!pp->state_transition))
 175                 return 0;
 176         return pp->state_transition(cp, direction, skb, pp);
 177 }
 178
 179
 180 /*
 181  *  IPVS persistent scheduling function
 182  *  It creates a connection entry according to its template if exists,
 183  *  or selects a server and creates a connection entry plus a template.
 184  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 185  *  Protocols supported: TCP, UDP
 186  */
 187 static struct ip_vs_conn *
 188 ip_vs_sched_persist(struct ip_vs_service *svc,
 189                     const struct sk_buff *skb,
 190                     __be16 ports[2])
 191 {
 192         struct ip_vs_conn *cp = NULL;
 193         struct ip_vs_iphdr iph;
 194         struct ip_vs_dest *dest;
 195         struct ip_vs_conn *ct;
 196         __be16  dport;                  /* destination port to forward */
 197         __be16  flags;
 198         union nf_inet_addr snet;        /* source network of the client,
 199                                            after masking */
 200
 201         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 202
 203         /* Mask saddr with the netmask to adjust template granularity */
 204 #ifdef CONFIG_IP_VS_IPV6
 205         if (svc->af == AF_INET6)
 206                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
 207         else
 208 #endif
 209                 snet.ip = iph.saddr.ip & svc->netmask;
 210
 211         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
 212                       "mnet %s\n",
 213                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
 214                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
 215                       IP_VS_DBG_ADDR(svc->af, &snet));
 216
 217         /*
 218          * As far as we know, FTP is a very complicated network protocol, and
 219          * it uses control connection and data connections. For active FTP,
 220          * FTP server initialize data connection to the client, its source port
 221          * is often 20. For passive FTP, FTP server tells the clients the port
 222          * that it passively listens to,  and the client issues the data
 223          * connection. In the tunneling or direct routing mode, the load
 224          * balancer is on the client-to-server half of connection, the port
 225          * number is unknown to the load balancer. So, a conn template like
 226          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
 227          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 228          * is created for other persistent services.
 229          */
 230         if (ports[1] == svc->port) {
 231                 /* Check if a template already exists */
 232                 if (svc->port != FTPPORT)
 233                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 234                                              &iph.daddr, ports[1]);
 235                 else
 236                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 237                                              &iph.daddr, 0);
 238
 239                 if (!ct || !ip_vs_check_template(ct)) {
 240                         /*
 241                          * No template found or the dest of the connection
 242                          * template is not available.
 243                          */
 244                         dest = svc->scheduler->schedule(svc, skb);
 245                         if (dest == NULL) {
 246                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
 247                                 return NULL;
 248                         }
 249
 250                         /*
 251                          * Create a template like <protocol,caddr,0,
 252                          * vaddr,vport,daddr,dport> for non-ftp service,
 253                          * and <protocol,caddr,0,vaddr,0,daddr,0>
 254                          * for ftp service.
 255                          */
 256                         if (svc->port != FTPPORT)
 257                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
 258                                                     &snet, 0,
 259                                                     &iph.daddr,
 260                                                     ports[1],
 261                                                     &dest->addr, dest->port,
 262                                                     IP_VS_CONN_F_TEMPLATE,
 263                                                     dest);
 264                         else
 265                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
 266                                                     &snet, 0,
 267                                                     &iph.daddr, 0,
 268                                                     &dest->addr, 0,
 269                                                     IP_VS_CONN_F_TEMPLATE,
 270                                                     dest);
 271                         if (ct == NULL)
 272                                 return NULL;
 273
 274                         ct->timeout = svc->timeout;
 275                 } else {
 276                         /* set destination with the found template */
 277                         dest = ct->dest;
 278                 }
 279                 dport = dest->port;
 280         } else {
 281                 /*
 282                  * Note: persistent fwmark-based services and persistent
 283                  * port zero service are handled here.
 284                  * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
 285                  * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
 286                  */
 287                 if (svc->fwmark) {
 288                         union nf_inet_addr fwmark = {
 289                                 .ip = htonl(svc->fwmark)
 290                         };
 291
 292                         ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
 293                                              &fwmark, 0);
 294                 } else
 295                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
 296                                              &iph.daddr, 0);
 297
 298                 if (!ct || !ip_vs_check_template(ct)) {
 299                         /*
 300                          * If it is not persistent port zero, return NULL,
 301                          * otherwise create a connection template.
 302                          */
 303                         if (svc->port)
 304                                 return NULL;
 305
 306                         dest = svc->scheduler->schedule(svc, skb);
 307                         if (dest == NULL) {
 308                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
 309                                 return NULL;
 310                         }
 311
 312                         /*
 313                          * Create a template according to the service
 314                          */
 315                         if (svc->fwmark) {
 316                                 union nf_inet_addr fwmark = {
 317                                         .ip = htonl(svc->fwmark)
 318                                 };
 319
 320                                 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
 321                                                     &snet, 0,
 322                                                     &fwmark, 0,
 323                                                     &dest->addr, 0,
 324                                                     IP_VS_CONN_F_TEMPLATE,
 325                                                     dest);
 326                         } else
 327                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
 328                                                     &snet, 0,
 329                                                     &iph.daddr, 0,
 330                                                     &dest->addr, 0,
 331                                                     IP_VS_CONN_F_TEMPLATE,
 332                                                     dest);
 333                         if (ct == NULL)
 334                                 return NULL;
 335
 336                         ct->timeout = svc->timeout;
 337                 } else {
 338                         /* set destination with the found template */
 339                         dest = ct->dest;
 340                 }
 341                 dport = ports[1];
 342         }
 343
 344         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 345                  && iph.protocol == IPPROTO_UDP)?
 346                 IP_VS_CONN_F_ONE_PACKET : 0;
 347
 348         /*
 349          *    Create a new connection according to the template
 350          */
 351         cp = ip_vs_conn_new(svc->af, iph.protocol,
 352                             &iph.saddr, ports[0],
 353                             &iph.daddr, ports[1],
 354                             &dest->addr, dport,
 355                             flags,
 356                             dest);
 357         if (cp == NULL) {
 358                 ip_vs_conn_put(ct);
 359                 return NULL;
 360         }
 361
 362         /*
 363          *    Add its control
 364          */
 365         ip_vs_control_add(cp, ct);
 366         ip_vs_conn_put(ct);
 367
 368         ip_vs_conn_stats(cp, svc);
 369         return cp;
 370 }
 371
 372
 373 /*
 374  *  IPVS main scheduling function
 375  *  It selects a server according to the virtual service, and
 376  *  creates a connection entry.
 377  *  Protocols supported: TCP, UDP
 378  */
 379 struct ip_vs_conn *
 380 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
 381 {
 382         struct ip_vs_conn *cp = NULL;
 383         struct ip_vs_iphdr iph;
 384         struct ip_vs_dest *dest;
 385         __be16 _ports[2], *pptr, flags;
 386
 387         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 388         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 389         if (pptr == NULL)
 390                 return NULL;
 391
 392         /*
 393          *    Persistent service
 394          */
 395         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
 396                 return ip_vs_sched_persist(svc, skb, pptr);
 397
 398         /*
 399          *    Non-persistent service
 400          */
 401         if (!svc->fwmark && pptr[1] != svc->port) {
 402                 if (!svc->port)
 403                         pr_err("Schedule: port zero only supported "
 404                                "in persistent services, "
 405                                "check your ipvs configuration\n");
 406                 return NULL;
 407         }
 408
 409         dest = svc->scheduler->schedule(svc, skb);
 410         if (dest == NULL) {
 411                 IP_VS_DBG(1, "Schedule: no dest found.\n");
 412                 return NULL;
 413         }
 414
 415         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 416                  && iph.protocol == IPPROTO_UDP)?
 417                 IP_VS_CONN_F_ONE_PACKET : 0;
 418
 419         /*
 420          *    Create a connection entry.
 421          */
 422         cp = ip_vs_conn_new(svc->af, iph.protocol,
 423                             &iph.saddr, pptr[0],
 424                             &iph.daddr, pptr[1],
 425                             &dest->addr, dest->port ? dest->port : pptr[1],
 426                             flags,
 427                             dest);
 428         if (cp == NULL)
 429                 return NULL;
 430
 431         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 432                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
 433                       ip_vs_fwd_tag(cp),
 434                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
 435                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
 436                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
 437                       cp->flags, atomic_read(&cp->refcnt));
 438
 439         ip_vs_conn_stats(cp, svc);
 440         return cp;
 441 }
 442
 443
 444 /*
 445  *  Pass or drop the packet.
 446  *  Called by ip_vs_in, when the virtual service is available but
 447  *  no destination is available for a new connection.
 448  */
 449 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 450                 struct ip_vs_protocol *pp)
 451 {
 452         __be16 _ports[2], *pptr;
 453         struct ip_vs_iphdr iph;
 454         int unicast;
 455         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
 456
 457         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
 458         if (pptr == NULL) {
 459                 ip_vs_service_put(svc);
 460                 return NF_DROP;
 461         }
 462
 463 #ifdef CONFIG_IP_VS_IPV6
 464         if (svc->af == AF_INET6)
 465                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
 466         else
 467 #endif
 468                 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
 469
 470         /* if it is fwmark-based service, the cache_bypass sysctl is up
 471            and the destination is a non-local unicast, then create
 472            a cache_bypass connection entry */
 473         if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
 474                 int ret, cs;
 475                 struct ip_vs_conn *cp;
 476                 __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
 477                                 iph.protocol == IPPROTO_UDP)?
 478                                 IP_VS_CONN_F_ONE_PACKET : 0;
 479                 union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
 480
 481                 ip_vs_service_put(svc);
 482
 483                 /* create a new connection entry */
 484                 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
 485                 cp = ip_vs_conn_new(svc->af, iph.protocol,
 486                                     &iph.saddr, pptr[0],
 487                                     &iph.daddr, pptr[1],
 488                                     &daddr, 0,
 489                                     IP_VS_CONN_F_BYPASS | flags,
 490                                     NULL);
 491                 if (cp == NULL)
 492                         return NF_DROP;
 493
 494                 /* statistics */
 495                 ip_vs_in_stats(cp, skb);
 496
 497                 /* set state */
 498                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
 499
 500                 /* transmit the first SYN packet */
 501                 ret = cp->packet_xmit(skb, cp, pp);
 502                 /* do not touch skb anymore */
 503
 504                 atomic_inc(&cp->in_pkts);
 505                 ip_vs_conn_put(cp);
 506                 return ret;
 507         }
 508
 509         /*
 510          * When the virtual ftp service is presented, packets destined
 511          * for other services on the VIP may get here (except services
 512          * listed in the ipvs table), pass the packets, because it is
 513          * not ipvs job to decide to drop the packets.
 514          */
 515         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
 516                 ip_vs_service_put(svc);
 517                 return NF_ACCEPT;
 518         }
 519
 520         ip_vs_service_put(svc);
 521
 522         /*
 523          * Notify the client that the destination is unreachable, and
 524          * release the socket buffer.
 525          * Since it is in IP layer, the TCP socket is not actually
 526          * created, the TCP RST packet cannot be sent, instead that
 527          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 528          */
 529 #ifdef CONFIG_IP_VS_IPV6
 530         if (svc->af == AF_INET6)
 531                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
 532         else
 533 #endif
 534                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 535
 536         return NF_DROP;
 537 }
 538
 539 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 540 {
 541         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 542 }
 543
 544 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
 545 {
 546         int err = ip_defrag(skb, user);
 547
 548         if (!err)
 549                 ip_send_check(ip_hdr(skb));
 550
 551         return err;
 552 }
 553
 554 #ifdef CONFIG_IP_VS_IPV6
 555 static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
 556 {
 557         /* TODO IPv6: Find out what to do here for IPv6 */
 558         return 0;
 559 }
 560 #endif
 561
 562 /*
 563  * Packet has been made sufficiently writable in caller
 564  * - inout: 1=in->out, 0=out->in
 565  */
 566 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 567                     struct ip_vs_conn *cp, int inout)
 568 {
 569         struct iphdr *iph        = ip_hdr(skb);
 570         unsigned int icmp_offset = iph->ihl*4;
 571         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
 572                                                       icmp_offset);
 573         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
 574
 575         if (inout) {
 576                 iph->saddr = cp->vaddr.ip;
 577                 ip_send_check(iph);
 578                 ciph->daddr = cp->vaddr.ip;
 579                 ip_send_check(ciph);
 580         } else {
 581                 iph->daddr = cp->daddr.ip;
 582                 ip_send_check(iph);
 583                 ciph->saddr = cp->daddr.ip;
 584                 ip_send_check(ciph);
 585         }
 586
 587         /* the TCP/UDP/SCTP port */
 588         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
 589             IPPROTO_SCTP == ciph->protocol) {
 590                 __be16 *ports = (void *)ciph + ciph->ihl*4;
 591
 592                 if (inout)
 593                         ports[1] = cp->vport;
 594                 else
 595                         ports[0] = cp->dport;
 596         }
 597
 598         /* And finally the ICMP checksum */
 599         icmph->checksum = 0;
 600         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
 601         skb->ip_summed = CHECKSUM_UNNECESSARY;
 602
 603         if (inout)
 604                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 605                         "Forwarding altered outgoing ICMP");
 606         else
 607                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 608                         "Forwarding altered incoming ICMP");
 609 }
 610
 611 #ifdef CONFIG_IP_VS_IPV6
 612 void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 613                     struct ip_vs_conn *cp, int inout)
 614 {
 615         struct ipv6hdr *iph      = ipv6_hdr(skb);
 616         unsigned int icmp_offset = sizeof(struct ipv6hdr);
 617         struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
 618                                                       icmp_offset);
 619         struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);
 620
 621         if (inout) {
 622                 iph->saddr = cp->vaddr.in6;
 623                 ciph->daddr = cp->vaddr.in6;
 624         } else {
 625                 iph->daddr = cp->daddr.in6;
 626                 ciph->saddr = cp->daddr.in6;
 627         }
 628
 629         /* the TCP/UDP/SCTP port */
 630         if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
 631             IPPROTO_SCTP == ciph->nexthdr) {
 632                 __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);
 633
 634                 if (inout)
 635                         ports[1] = cp->vport;
 636                 else
 637                         ports[0] = cp->dport;
 638         }
 639
 640         /* And finally the ICMP checksum */
 641         icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
 642                                               skb->len - icmp_offset,
 643                                               IPPROTO_ICMPV6, 0);
 644         skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
 645         skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
 646         skb->ip_summed = CHECKSUM_PARTIAL;
 647
 648         if (inout)
 649                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 650                         "Forwarding altered outgoing ICMPv6");
 651         else
 652                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
 653                         "Forwarding altered incoming ICMPv6");
 654 }
 655 #endif
 656
 657 /* Handle relevant response ICMP messages - forward to the right
 658  * destination host. Used for NAT and local client.
 659  */
 660 static int handle_response_icmp(int af, struct sk_buff *skb,
 661                                 union nf_inet_addr *snet,
 662                                 __u8 protocol, struct ip_vs_conn *cp,
 663                                 struct ip_vs_protocol *pp,
 664                                 unsigned int offset, unsigned int ihl)
 665 {
 666         unsigned int verdict = NF_DROP;
 667
 668         if (IP_VS_FWD_METHOD(cp) != 0) {
 669                 pr_err("shouldn't reach here, because the box is on the "
 670                        "half connection in the tun/dr module.\n");
 671         }
 672
 673         /* Ensure the checksum is correct */
 674         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 675                 /* Failed checksum! */
 676                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
 677                               IP_VS_DBG_ADDR(af, snet));
 678                 goto out;
 679         }
 680
 681         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
 682             IPPROTO_SCTP == protocol)
 683                 offset += 2 * sizeof(__u16);
 684         if (!skb_make_writable(skb, offset))
 685                 goto out;
 686
 687 #ifdef CONFIG_IP_VS_IPV6
 688         if (af == AF_INET6)
 689                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
 690         else
 691 #endif
 692                 ip_vs_nat_icmp(skb, pp, cp, 1);
 693
 694         /* do the statistics and put it back */
 695         ip_vs_out_stats(cp, skb);
 696
 697         skb->ipvs_property = 1;
 698         verdict = NF_ACCEPT;
 699
 700 out:
 701         __ip_vs_conn_put(cp);
 702
 703         return verdict;
 704 }
 705
 706 /*
 707  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
 708  *      Find any that might be relevant, check against existing connections.
 709  *      Currently handles error types - unreachable, quench, ttl exceeded.
 710  */
 711 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
 712 {
 713         struct iphdr *iph;
 714         struct icmphdr  _icmph, *ic;
 715         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
 716         struct ip_vs_iphdr ciph;
 717         struct ip_vs_conn *cp;
 718         struct ip_vs_protocol *pp;
 719         unsigned int offset, ihl;
 720         union nf_inet_addr snet;
 721
 722         *related = 1;
 723
 724         /* reassemble IP fragments */
 725         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 726                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
 727                         return NF_STOLEN;
 728         }
 729
 730         iph = ip_hdr(skb);
 731         offset = ihl = iph->ihl * 4;
 732         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 733         if (ic == NULL)
 734                 return NF_DROP;
 735
 736         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
 737                   ic->type, ntohs(icmp_id(ic)),
 738                   &iph->saddr, &iph->daddr);
 739
 740         /*
 741          * Work through seeing if this is for us.
 742          * These checks are supposed to be in an order that means easy
 743          * things are checked first to speed up processing.... however
 744          * this means that some packets will manage to get a long way
 745          * down this stack and then be rejected, but that's life.
 746          */
 747         if ((ic->type != ICMP_DEST_UNREACH) &&
 748             (ic->type != ICMP_SOURCE_QUENCH) &&
 749             (ic->type != ICMP_TIME_EXCEEDED)) {
 750                 *related = 0;
 751                 return NF_ACCEPT;
 752         }
 753
 754         /* Now find the contained IP header */
 755         offset += sizeof(_icmph);
 756         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 757         if (cih == NULL)
 758                 return NF_ACCEPT; /* The packet looks wrong, ignore */
 759
 760         pp = ip_vs_proto_get(cih->protocol);
 761         if (!pp)
 762                 return NF_ACCEPT;
 763
 764         /* Is the embedded protocol header present? */
 765         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
 766                      pp->dont_defrag))
 767                 return NF_ACCEPT;
 768
 769         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
 770
 771         offset += cih->ihl * 4;
 772
 773         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
 774         /* The embedded headers contain source and dest in reverse order */
 775         cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
 776         if (!cp)
 777                 return NF_ACCEPT;
 778
 779         snet.ip = iph->saddr;
 780         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
 781                                     pp, offset, ihl);
 782 }
 783
 784 #ifdef CONFIG_IP_VS_IPV6
 785 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
 786 {
 787         struct ipv6hdr *iph;
 788         struct icmp6hdr _icmph, *ic;
 789         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
 790                                            within the ICMP */
 791         struct ip_vs_iphdr ciph;
 792         struct ip_vs_conn *cp;
 793         struct ip_vs_protocol *pp;
 794         unsigned int offset;
 795         union nf_inet_addr snet;
 796
 797         *related = 1;
 798
 799         /* reassemble IP fragments */
 800         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
 801                 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
 802                         return NF_STOLEN;
 803         }
 804
 805         iph = ipv6_hdr(skb);
 806         offset = sizeof(struct ipv6hdr);
 807         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 808         if (ic == NULL)
 809                 return NF_DROP;
 810
 811         IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
 812                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
 813                   &iph->saddr, &iph->daddr);
 814
 815         /*
 816          * Work through seeing if this is for us.
 817          * These checks are supposed to be in an order that means easy
 818          * things are checked first to speed up processing.... however
 819          * this means that some packets will manage to get a long way
 820          * down this stack and then be rejected, but that's life.
 821          */
 822         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
 823             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
 824             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
 825                 *related = 0;
 826                 return NF_ACCEPT;
 827         }
 828
 829         /* Now find the contained IP header */
 830         offset += sizeof(_icmph);
 831         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 832         if (cih == NULL)
 833                 return NF_ACCEPT; /* The packet looks wrong, ignore */
 834
 835         pp = ip_vs_proto_get(cih->nexthdr);
 836         if (!pp)
 837                 return NF_ACCEPT;
 838
 839         /* Is the embedded protocol header present? */
 840         /* TODO: we don't support fragmentation at the moment anyways */
 841         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
 842                 return NF_ACCEPT;
 843
 844         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
 845
 846         offset += sizeof(struct ipv6hdr);
 847
 848         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
 849         /* The embedded headers contain source and dest in reverse order */
 850         cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
 851         if (!cp)
 852                 return NF_ACCEPT;
 853
 854         ipv6_addr_copy(&snet.in6, &iph->saddr);
 855         return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
 856                                     pp, offset, sizeof(struct ipv6hdr));
 857 }
 858 #endif
 859
 860 /*
 861  * Check if sctp chunc is ABORT chunk
 862  */
 863 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
 864 {
 865         sctp_chunkhdr_t *sch, schunk;
 866         sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
 867                         sizeof(schunk), &schunk);
 868         if (sch == NULL)
 869                 return 0;
 870         if (sch->type == SCTP_CID_ABORT)
 871                 return 1;
 872         return 0;
 873 }
 874
 875 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
 876 {
 877         struct tcphdr _tcph, *th;
 878
 879         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
 880         if (th == NULL)
 881                 return 0;
 882         return th->rst;
 883 }
 884
 885 /* Handle response packets: rewrite addresses and send away...
 886  * Used for NAT and local client.
 887  */
 888 static unsigned int
 889 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 890                 struct ip_vs_conn *cp, int ihl)
 891 {
 892         IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
 893
 894         if (!skb_make_writable(skb, ihl))
 895                 goto drop;
 896
 897         /* mangle the packet */
 898         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
 899                 goto drop;
 900
 901 #ifdef CONFIG_IP_VS_IPV6
 902         if (af == AF_INET6)
 903                 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
 904         else
 905 #endif
 906         {
 907                 ip_hdr(skb)->saddr = cp->vaddr.ip;
 908                 ip_send_check(ip_hdr(skb));
 909         }
 910
 911         /* For policy routing, packets originating from this
 912          * machine itself may be routed differently to packets
 913          * passing through.  We want this packet to be routed as
 914          * if it came from this machine itself.  So re-compute
 915          * the routing information.
 916          */
 917 #ifdef CONFIG_IP_VS_IPV6
 918         if (af == AF_INET6) {
 919                 if (ip6_route_me_harder(skb) != 0)
 920                         goto drop;
 921         } else
 922 #endif
 923                 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
 924                         goto drop;
 925
 926         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
 927
 928         ip_vs_out_stats(cp, skb);
 929         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
 930         ip_vs_update_conntrack(skb, cp, 0);
 931         ip_vs_conn_put(cp);
 932
 933         skb->ipvs_property = 1;
 934
 935         LeaveFunction(11);
 936         return NF_ACCEPT;
 937
 938 drop:
 939         ip_vs_conn_put(cp);
 940         kfree_skb(skb);
 941         return NF_STOLEN;
 942 }
 943
 944 /*
 945  *      It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
 946  *      Check if outgoing packet belongs to the established ip_vs_conn.
 947  */
 948 static unsigned int
 949 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
 950           const struct net_device *in, const struct net_device *out,
 951           int (*okfn)(struct sk_buff *))
 952 {
 953         struct ip_vs_iphdr iph;
 954         struct ip_vs_protocol *pp;
 955         struct ip_vs_conn *cp;
 956         int af;
 957
 958         EnterFunction(11);
 959
 960         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
 961
 962         if (skb->ipvs_property)
 963                 return NF_ACCEPT;
 964
 965         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 966 #ifdef CONFIG_IP_VS_IPV6
 967         if (af == AF_INET6) {
 968                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
 969                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
 970
 971                         if (related)
 972                                 return verdict;
 973                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 974                 }
 975         } else
 976 #endif
 977                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
 978                         int related, verdict = ip_vs_out_icmp(skb, &related);
 979
 980                         if (related)
 981                                 return verdict;
 982                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 983                 }
 984
 985         pp = ip_vs_proto_get(iph.protocol);
 986         if (unlikely(!pp))
 987                 return NF_ACCEPT;
 988
 989         /* reassemble IP fragments */
 990 #ifdef CONFIG_IP_VS_IPV6
 991         if (af == AF_INET6) {
 992                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
 993                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
 994
 995                         if (related)
 996                                 return verdict;
 997
 998                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 999                 }
1000         } else
1001 #endif
1002                 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1003                              !pp->dont_defrag)) {
1004                         if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
1005                                 return NF_STOLEN;
1006
1007                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1008                 }
1009
1010         /*
1011          * Check if the packet belongs to an existing entry
1012          */
1013         cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1014
1015         if (unlikely(!cp)) {
1016                 if (sysctl_ip_vs_nat_icmp_send &&
1017                     (pp->protocol == IPPROTO_TCP ||
1018                      pp->protocol == IPPROTO_UDP ||
1019                      pp->protocol == IPPROTO_SCTP)) {
1020                         __be16 _ports[2], *pptr;
1021
1022                         pptr = skb_header_pointer(skb, iph.len,
1023                                                   sizeof(_ports), _ports);
1024                         if (pptr == NULL)
1025                                 return NF_ACCEPT;       /* Not for me */
1026                         if (ip_vs_lookup_real_service(af, iph.protocol,
1027                                                       &iph.saddr,
1028                                                       pptr[0])) {
1029                                 /*
1030                                  * Notify the real server: there is no
1031                                  * existing entry if it is not RST
1032                                  * packet or not TCP packet.
1033                                  */
1034                                 if ((iph.protocol != IPPROTO_TCP &&
1035                                      iph.protocol != IPPROTO_SCTP)
1036                                      || ((iph.protocol == IPPROTO_TCP
1037                                           && !is_tcp_reset(skb, iph.len))
1038                                          || (iph.protocol == IPPROTO_SCTP
1039                                                 && !is_sctp_abort(skb,
1040                                                         iph.len)))) {
1041 #ifdef CONFIG_IP_VS_IPV6
1042                                         if (af == AF_INET6)
1043                                                 icmpv6_send(skb,
1044                                                             ICMPV6_DEST_UNREACH,
1045                                                             ICMPV6_PORT_UNREACH,
1046                                                             0);
1047                                         else
1048 #endif
1049                                                 icmp_send(skb,
1050                                                           ICMP_DEST_UNREACH,
1051                                                           ICMP_PORT_UNREACH, 0);
1052                                         return NF_DROP;
1053                                 }
1054                         }
1055                 }
1056                 IP_VS_DBG_PKT(12, pp, skb, 0,
1057                               "packet continues traversal as normal");
1058                 return NF_ACCEPT;
1059         }
1060
1061         return handle_response(af, skb, pp, cp, iph.len);
1062 }
1063
1064
1065 /*
1066  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1067  *      Find any that might be relevant, check against existing connections,
1068  *      forward to the right destination host if relevant.
1069  *      Currently handles error types - unreachable, quench, ttl exceeded.
1070  */
1071 static int
1072 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1073 {
1074         struct iphdr *iph;
1075         struct icmphdr  _icmph, *ic;
1076         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1077         struct ip_vs_iphdr ciph;
1078         struct ip_vs_conn *cp;
1079         struct ip_vs_protocol *pp;
1080         unsigned int offset, ihl, verdict;
1081         union nf_inet_addr snet;
1082
1083         *related = 1;
1084
1085         /* reassemble IP fragments */
1086         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1087                 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1088                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1089                         return NF_STOLEN;
1090         }
1091
1092         iph = ip_hdr(skb);
1093         offset = ihl = iph->ihl * 4;
1094         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1095         if (ic == NULL)
1096                 return NF_DROP;
1097
1098         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1099                   ic->type, ntohs(icmp_id(ic)),
1100                   &iph->saddr, &iph->daddr);
1101
1102         /*
1103          * Work through seeing if this is for us.
1104          * These checks are supposed to be in an order that means easy
1105          * things are checked first to speed up processing.... however
1106          * this means that some packets will manage to get a long way
1107          * down this stack and then be rejected, but that's life.
1108          */
1109         if ((ic->type != ICMP_DEST_UNREACH) &&
1110             (ic->type != ICMP_SOURCE_QUENCH) &&
1111             (ic->type != ICMP_TIME_EXCEEDED)) {
1112                 *related = 0;
1113                 return NF_ACCEPT;
1114         }
1115
1116         /* Now find the contained IP header */
1117         offset += sizeof(_icmph);
1118         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1119         if (cih == NULL)
1120                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1121
1122         pp = ip_vs_proto_get(cih->protocol);
1123         if (!pp)
1124                 return NF_ACCEPT;
1125
1126         /* Is the embedded protocol header present? */
1127         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1128                      pp->dont_defrag))
1129                 return NF_ACCEPT;
1130
1131         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1132
1133         offset += cih->ihl * 4;
1134
1135         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1136         /* The embedded headers contain source and dest in reverse order */
1137         cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1138         if (!cp) {
1139                 /* The packet could also belong to a local client */
1140                 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1141                 if (cp) {
1142                         snet.ip = iph->saddr;
1143                         return handle_response_icmp(AF_INET, skb, &snet,
1144                                                     cih->protocol, cp, pp,
1145                                                     offset, ihl);
1146                 }
1147                 return NF_ACCEPT;
1148         }
1149
1150         verdict = NF_DROP;
1151
1152         /* Ensure the checksum is correct */
1153         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1154                 /* Failed checksum! */
1155                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1156                           &iph->saddr);
1157                 goto out;
1158         }
1159
1160         /* do the statistics and put it back */
1161         ip_vs_in_stats(cp, skb);
1162         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1163                 offset += 2 * sizeof(__u16);
1164         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1165         /* do not touch skb anymore */
1166
1167   out:
1168         __ip_vs_conn_put(cp);
1169
1170         return verdict;
1171 }
1172
1173 #ifdef CONFIG_IP_VS_IPV6
1174 static int
1175 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1176 {
1177         struct ipv6hdr *iph;
1178         struct icmp6hdr _icmph, *ic;
1179         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
1180                                            within the ICMP */
1181         struct ip_vs_iphdr ciph;
1182         struct ip_vs_conn *cp;
1183         struct ip_vs_protocol *pp;
1184         unsigned int offset, verdict;
1185         union nf_inet_addr snet;
1186
1187         *related = 1;
1188
1189         /* reassemble IP fragments */
1190         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1191                 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1192                                                IP_DEFRAG_VS_IN :
1193                                                IP_DEFRAG_VS_FWD))
1194                         return NF_STOLEN;
1195         }
1196
1197         iph = ipv6_hdr(skb);
1198         offset = sizeof(struct ipv6hdr);
1199         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1200         if (ic == NULL)
1201                 return NF_DROP;
1202
1203         IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1204                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
1205                   &iph->saddr, &iph->daddr);
1206
1207         /*
1208          * Work through seeing if this is for us.
1209          * These checks are supposed to be in an order that means easy
1210          * things are checked first to speed up processing.... however
1211          * this means that some packets will manage to get a long way
1212          * down this stack and then be rejected, but that's life.
1213          */
1214         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1215             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1216             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1217                 *related = 0;
1218                 return NF_ACCEPT;
1219         }
1220
1221         /* Now find the contained IP header */
1222         offset += sizeof(_icmph);
1223         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1224         if (cih == NULL)
1225                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1226
1227         pp = ip_vs_proto_get(cih->nexthdr);
1228         if (!pp)
1229                 return NF_ACCEPT;
1230
1231         /* Is the embedded protocol header present? */
1232         /* TODO: we don't support fragmentation at the moment anyways */
1233         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1234                 return NF_ACCEPT;
1235
1236         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1237
1238         offset += sizeof(struct ipv6hdr);
1239
1240         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1241         /* The embedded headers contain source and dest in reverse order */
1242         cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1243         if (!cp) {
1244                 /* The packet could also belong to a local client */
1245                 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1246                 if (cp) {
1247                         ipv6_addr_copy(&snet.in6, &iph->saddr);
1248                         return handle_response_icmp(AF_INET6, skb, &snet,
1249                                                     cih->nexthdr,
1250                                                     cp, pp, offset,
1251                                                     sizeof(struct ipv6hdr));
1252                 }
1253                 return NF_ACCEPT;
1254         }
1255
1256         verdict = NF_DROP;
1257
1258         /* do the statistics and put it back */
1259         ip_vs_in_stats(cp, skb);
1260         if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1261             IPPROTO_SCTP == cih->nexthdr)
1262                 offset += 2 * sizeof(__u16);
1263         verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1264         /* do not touch skb anymore */
1265
1266         __ip_vs_conn_put(cp);
1267
1268         return verdict;
1269 }
1270 #endif
1271
1272
1273 /*
1274  *      Check if it's for virtual services, look it up,
1275  *      and send it on its way...
1276  */
1277 static unsigned int
1278 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1279          const struct net_device *in, const struct net_device *out,
1280          int (*okfn)(struct sk_buff *))
1281 {
1282         struct ip_vs_iphdr iph;
1283         struct ip_vs_protocol *pp;
1284         struct ip_vs_conn *cp;
1285         int ret, restart, af, pkts;
1286
1287         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1288
1289         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1290
1291         /*
1292          *      Big tappo: only PACKET_HOST, including loopback for local client
1293          *      Don't handle local packets on IPv6 for now
1294          */
1295         if (unlikely(skb->pkt_type != PACKET_HOST)) {
1296                 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1297                               skb->pkt_type,
1298                               iph.protocol,
1299                               IP_VS_DBG_ADDR(af, &iph.daddr));
1300                 return NF_ACCEPT;
1301         }
1302
1303 #ifdef CONFIG_IP_VS_IPV6
1304         if (af == AF_INET6) {
1305                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1306                         int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1307
1308                         if (related)
1309                                 return verdict;
1310                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1311                 }
1312         } else
1313 #endif
1314                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1315                         int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1316
1317                         if (related)
1318                                 return verdict;
1319                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1320                 }
1321
1322         /* Protocol supported? */
1323         pp = ip_vs_proto_get(iph.protocol);
1324         if (unlikely(!pp))
1325                 return NF_ACCEPT;
1326
1327         /*
1328          * Check if the packet belongs to an existing connection entry
1329          */
1330         cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1331
1332         if (unlikely(!cp)) {
1333                 int v;
1334
1335                 /* For local client packets, it could be a response */
1336                 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1337                 if (cp)
1338                         return handle_response(af, skb, pp, cp, iph.len);
1339
1340                 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1341                         return v;
1342         }
1343
1344         if (unlikely(!cp)) {
1345                 /* sorry, all this trouble for a no-hit :) */
1346                 IP_VS_DBG_PKT(12, pp, skb, 0,
1347                               "packet continues traversal as normal");
1348                 return NF_ACCEPT;
1349         }
1350
1351         IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1352
1353         /* Check the server status */
1354         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1355                 /* the destination server is not available */
1356
1357                 if (sysctl_ip_vs_expire_nodest_conn) {
1358                         /* try to expire the connection immediately */
1359                         ip_vs_conn_expire_now(cp);
1360                 }
1361                 /* don't restart its timer, and silently
1362                    drop the packet. */
1363                 __ip_vs_conn_put(cp);
1364                 return NF_DROP;
1365         }
1366
1367         ip_vs_in_stats(cp, skb);
1368         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1369         if (cp->packet_xmit)
1370                 ret = cp->packet_xmit(skb, cp, pp);
1371                 /* do not touch skb anymore */
1372         else {
1373                 IP_VS_DBG_RL("warning: packet_xmit is null");
1374                 ret = NF_ACCEPT;
1375         }
1376
1377         /* Increase its packet counter and check if it is needed
1378          * to be synchronized
1379          *
1380          * Sync connection if it is about to close to
1381          * encorage the standby servers to update the connections timeout
1382          */
1383         pkts = atomic_add_return(1, &cp->in_pkts);
1384         if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1385             cp->protocol == IPPROTO_SCTP) {
1386                 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1387                         (pkts % sysctl_ip_vs_sync_threshold[1]
1388                          == sysctl_ip_vs_sync_threshold[0])) ||
1389                                 (cp->old_state != cp->state &&
1390                                  ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1391                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1392                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1393                         ip_vs_sync_conn(cp);
1394                         goto out;
1395                 }
1396         }
1397
1398         /* Keep this block last: TCP and others with pp->num_states <= 1 */
1399         else if (af == AF_INET &&
1400             (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1401             (((cp->protocol != IPPROTO_TCP ||
1402                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1403               (pkts % sysctl_ip_vs_sync_threshold[1]
1404                == sysctl_ip_vs_sync_threshold[0])) ||
1405              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1406               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1407                (cp->state == IP_VS_TCP_S_CLOSE) ||
1408                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1409                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1410                 ip_vs_sync_conn(cp);
1411 out:
1412         cp->old_state = cp->state;
1413
1414         ip_vs_conn_put(cp);
1415         return ret;
1416 }
1417
1418
1419 /*
1420  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1421  *      related packets destined for 0.0.0.0/0.
1422  *      When fwmark-based virtual service is used, such as transparent
1423  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1424  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1425  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1426  *      and send them to ip_vs_in_icmp.
1427  */
1428 static unsigned int
1429 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1430                    const struct net_device *in, const struct net_device *out,
1431                    int (*okfn)(struct sk_buff *))
1432 {
1433         int r;
1434
1435         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1436                 return NF_ACCEPT;
1437
1438         return ip_vs_in_icmp(skb, &r, hooknum);
1439 }
1440
1441 #ifdef CONFIG_IP_VS_IPV6
1442 static unsigned int
1443 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1444                       const struct net_device *in, const struct net_device *out,
1445                       int (*okfn)(struct sk_buff *))
1446 {
1447         int r;
1448
1449         if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1450                 return NF_ACCEPT;
1451
1452         return ip_vs_in_icmp_v6(skb, &r, hooknum);
1453 }
1454 #endif
1455
1456
1457 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1458         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1459          * or VS/NAT(change destination), so that filtering rules can be
1460          * applied to IPVS. */
1461         {
1462                 .hook           = ip_vs_in,
1463                 .owner          = THIS_MODULE,
1464                 .pf             = PF_INET,
1465                 .hooknum        = NF_INET_LOCAL_IN,
1466                 .priority       = 100,
1467         },
1468         /* After packet filtering, change source only for VS/NAT */
1469         {
1470                 .hook           = ip_vs_out,
1471                 .owner          = THIS_MODULE,
1472                 .pf             = PF_INET,
1473                 .hooknum        = NF_INET_FORWARD,
1474                 .priority       = 100,
1475         },
1476         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1477          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1478         {
1479                 .hook           = ip_vs_forward_icmp,
1480                 .owner          = THIS_MODULE,
1481                 .pf             = PF_INET,
1482                 .hooknum        = NF_INET_FORWARD,
1483                 .priority       = 99,
1484         },
1485 #ifdef CONFIG_IP_VS_IPV6
1486         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1487          * or VS/NAT(change destination), so that filtering rules can be
1488          * applied to IPVS. */
1489         {
1490                 .hook           = ip_vs_in,
1491                 .owner          = THIS_MODULE,
1492                 .pf             = PF_INET6,
1493                 .hooknum        = NF_INET_LOCAL_IN,
1494                 .priority       = 100,
1495         },
1496         /* After packet filtering, change source only for VS/NAT */
1497         {
1498                 .hook           = ip_vs_out,
1499                 .owner          = THIS_MODULE,
1500                 .pf             = PF_INET6,
1501                 .hooknum        = NF_INET_FORWARD,
1502                 .priority       = 100,
1503         },
1504         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1505          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1506         {
1507                 .hook           = ip_vs_forward_icmp_v6,
1508                 .owner          = THIS_MODULE,
1509                 .pf             = PF_INET6,
1510                 .hooknum        = NF_INET_FORWARD,
1511                 .priority       = 99,
1512         },
1513 #endif
1514 };
1515
1516
1517 /*
1518  *      Initialize IP Virtual Server
1519  */
1520 static int __init ip_vs_init(void)
1521 {
1522         int ret;
1523
1524         ip_vs_estimator_init();
1525
1526         ret = ip_vs_control_init();
1527         if (ret < 0) {
1528                 pr_err("can't setup control.\n");
1529                 goto cleanup_estimator;
1530         }
1531
1532         ip_vs_protocol_init();
1533
1534         ret = ip_vs_app_init();
1535         if (ret < 0) {
1536                 pr_err("can't setup application helper.\n");
1537                 goto cleanup_protocol;
1538         }
1539
1540         ret = ip_vs_conn_init();
1541         if (ret < 0) {
1542                 pr_err("can't setup connection table.\n");
1543                 goto cleanup_app;
1544         }
1545
1546         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1547         if (ret < 0) {
1548                 pr_err("can't register hooks.\n");
1549                 goto cleanup_conn;
1550         }
1551
1552         pr_info("ipvs loaded.\n");
1553         return ret;
1554
1555   cleanup_conn:
1556         ip_vs_conn_cleanup();
1557   cleanup_app:
1558         ip_vs_app_cleanup();
1559   cleanup_protocol:
1560         ip_vs_protocol_cleanup();
1561         ip_vs_control_cleanup();
1562   cleanup_estimator:
1563         ip_vs_estimator_cleanup();
1564         return ret;
1565 }
1566
1567 static void __exit ip_vs_cleanup(void)
1568 {
1569         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1570         ip_vs_conn_cleanup();
1571         ip_vs_app_cleanup();
1572         ip_vs_protocol_cleanup();
1573         ip_vs_control_cleanup();
1574         ip_vs_estimator_cleanup();
1575         pr_info("ipvs unloaded.\n");
1576 }
1577
1578 module_init(ip_vs_init);
1579 module_exit(ip_vs_cleanup);
1580 MODULE_LICENSE("GPL");