ipvs: netfilter connection tracking changes

author Julian Anastasov <ja@ssi.bg>

Tue, 21 Sep 2010 15:35:41 +0000 (17:35 +0200)

committer Patrick McHardy <kaber@trash.net>

Tue, 21 Sep 2010 15:35:41 +0000 (17:35 +0200)
author Julian Anastasov <ja@ssi.bg>
Tue, 21 Sep 2010 15:35:41 +0000 (17:35 +0200)
committer Patrick McHardy <kaber@trash.net>
Tue, 21 Sep 2010 15:35:41 +0000 (17:35 +0200)
diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h

index 003d75f..df77286 100644 (file)
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -90,10 +90,12 @@
  #define IP_VS_CONN_F_ONE_PACKET        0x2000          /* forward only one packet */
  
  /* Flags that are not sent to backup server start from bit 16 */
+#define IP_VS_CONN_F_NFCT      (1 << 16)       /* use netfilter conntrack */
  
  /* Connection flags from destination that can be changed by user space */
  #define IP_VS_CONN_F_DEST_MASK (IP_VS_CONN_F_FWD_MASK | \
                                 IP_VS_CONN_F_ONE_PACKET | \
+                               IP_VS_CONN_F_NFCT | \
                                 0)
  
  #define IP_VS_SCHEDNAME_MAXLEN 16
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h

index 62698a9..e8ec523 100644 (file)
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -25,7 +25,9 @@
  #include <linux/ip.h>
  #include <linux/ipv6.h>                        /* for struct ipv6hdr */
  #include <net/ipv6.h>                  /* for ipv6_addr_copy */
-
+#ifdef CONFIG_IP_VS_NFCT
+#include <net/netfilter/nf_conntrack.h>
+#endif
  
  /* Connections' size value needed by ip_vs_ctl.c */
  extern int ip_vs_conn_tab_size;
@@ -798,6 +800,7 @@ extern int sysctl_ip_vs_expire_nodest_conn;
  extern int sysctl_ip_vs_expire_quiescent_template;
  extern int sysctl_ip_vs_sync_threshold[2];
  extern int sysctl_ip_vs_nat_icmp_send;
+extern int sysctl_ip_vs_conntrack;
  extern struct ip_vs_stats ip_vs_stats;
  extern const struct ctl_path net_vs_ctl_path[];
  
@@ -955,8 +958,47 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum)
         return csum_partial(diff, sizeof(diff), oldsum);
  }
  
+#ifdef CONFIG_IP_VS_NFCT
+/*
+ *      Netfilter connection tracking
+ *      (from ip_vs_nfct.c)
+ */
+static inline int ip_vs_conntrack_enabled(void)
+{
+       return sysctl_ip_vs_conntrack;
+}
+
  extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp,
                                    int outin);
+extern int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp);
+extern void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+                                     struct ip_vs_conn *cp, u_int8_t proto,
+                                     const __be16 port, int from_rs);
+extern void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp);
+
+#else  /* !CONFIG_IP_VS_NFCT: stubs so that callers need no #ifdef */
+
+static inline int ip_vs_conntrack_enabled(void)
+{
+       return 0;       /* the conntrack sysctl is not built in this case */
+}
+
+static inline void ip_vs_update_conntrack(struct sk_buff *skb,
+                                         struct ip_vs_conn *cp, int outin)
+{
+}
+
+static inline int ip_vs_confirm_conntrack(struct sk_buff *skb,
+                                         struct ip_vs_conn *cp)
+{
+       return NF_ACCEPT;       /* nothing to confirm, let the packet pass */
+}
+
+static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+}
+
+#endif /* CONFIG_IP_VS_NFCT */
  
  #endif /* __KERNEL__ */
  
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig

index 46a77d5..af3c9f4 100644 (file)
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -3,7 +3,7 @@
  #
  menuconfig IP_VS
         tristate "IP virtual server support"
-       depends on NET && INET && NETFILTER && NF_CONNTRACK
+       depends on NET && INET && NETFILTER
         ---help---
           IP Virtual Server support will let you build a high-performance
           virtual server based on cluster of two or more real servers. This
@@ -235,7 +235,8 @@ comment 'IPVS application helper'
  
  config IP_VS_FTP
         tristate "FTP protocol helper"
-        depends on IP_VS_PROTO_TCP && NF_NAT
+        depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+       select IP_VS_NFCT
         ---help---
           FTP is a protocol that transfers IP address and/or port number in
           the payload. In the virtual server via Network Address Translation,
@@ -247,4 +248,12 @@ config     IP_VS_FTP
           If you want to compile it in kernel, say Y. To compile it as a
           module, choose M here. If unsure, say N.
  
+config IP_VS_NFCT
+       bool "Netfilter connection tracking"
+       depends on NF_CONNTRACK
+       ---help---
+         The Netfilter connection tracking support allows the IPVS
+         connection state to be exported to the Netfilter framework
+         for filtering purposes.
+
  endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile

index e3baefd..349fe88 100644 (file)
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
  ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
  ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
  
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
  ip_vs-objs :=  ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o        \
                 ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o                      \
                 ip_vs_est.o ip_vs_proto.o                                  \
-               $(ip_vs_proto-objs-y)
+               $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
  
  
  # IPVS core
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c

index 9fe1da7..a970d96 100644 (file)
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -721,6 +721,9 @@ static void ip_vs_conn_expire(unsigned long data)
                 if (cp->control)
                         ip_vs_control_del(cp);
  
+               if (cp->flags & IP_VS_CONN_F_NFCT)
+                       ip_vs_conn_drop_conntrack(cp);
+
                 if (unlikely(cp->app != NULL))
                         ip_vs_unbind_app(cp);
                 ip_vs_unbind_dest(cp);
@@ -816,6 +819,16 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
         if (unlikely(pp && atomic_read(&pp->appcnt)))
                 ip_vs_bind_app(cp, pp);
  
+       /*
+        * Allow conntrack to be preserved. By default, conntrack
+        * is created and destroyed for every packet.
+        * Sometimes keeping conntrack can be useful for
+        * IP_VS_CONN_F_ONE_PACKET too.
+        */
+
+       if (ip_vs_conntrack_enabled())
+               cp->flags |= IP_VS_CONN_F_NFCT;
+
         /* Hash it in the ip_vs_conn_tab finally */
         ip_vs_conn_hash(cp);
  
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c

index 319991d..7fbc80d 100644 (file)
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -537,6 +537,23 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
         return NF_DROP;
  }
  
+/*
+ * Registered at NF_INET_POST_ROUTING just ahead of NF_IP_PRI_NAT_SRC.
+ * Packets marked by IPVS stop traversing the chain here, which avoids
+ * double NAT and confirmation when the conntrack is not to be kept.
+ */
+static unsigned int ip_vs_post_routing(unsigned int hooknum,
+                                      struct sk_buff *skb,
+                                      const struct net_device *in,
+                                      const struct net_device *out,
+                                      int (*okfn)(struct sk_buff *))
+{
+       /* Sent from IPVS: exit this chain; anything else continues */
+       if (skb->ipvs_property)
+               return NF_STOP;
+       return NF_ACCEPT;
+}
+
  __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
  {
         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -695,7 +712,10 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
         /* do the statistics and put it back */
         ip_vs_out_stats(cp, skb);
  
-       skb->ipvs_property = 1;
+       if (!(cp->flags & IP_VS_CONN_F_NFCT))
+               skb->ipvs_property = 1;
+       else
+               ip_vs_update_conntrack(skb, cp, 0);
         verdict = NF_ACCEPT;
  
  out:
@@ -928,17 +948,19 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
  
         ip_vs_out_stats(cp, skb);
         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
-       ip_vs_update_conntrack(skb, cp, 0);
+       if (!(cp->flags & IP_VS_CONN_F_NFCT))
+               skb->ipvs_property = 1;
+       else
+               ip_vs_update_conntrack(skb, cp, 0);
         ip_vs_conn_put(cp);
  
-       skb->ipvs_property = 1;
-
         LeaveFunction(11);
         return NF_ACCEPT;
  
  drop:
         ip_vs_conn_put(cp);
         kfree_skb(skb);
+       LeaveFunction(11);
         return NF_STOLEN;
  }
  
@@ -1483,6 +1505,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                 .hooknum        = NF_INET_FORWARD,
                 .priority       = 99,
         },
+       /* Before the netfilter connection tracking, exit from POST_ROUTING */
+       {
+               .hook           = ip_vs_post_routing,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_POST_ROUTING,
+               .priority       = NF_IP_PRI_NAT_SRC-1,
+       },
  #ifdef CONFIG_IP_VS_IPV6
         /* After packet filtering, forward packet through VS/DR, VS/TUN,
          * or VS/NAT(change destination), so that filtering rules can be
@@ -1511,6 +1541,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                 .hooknum        = NF_INET_FORWARD,
                 .priority       = 99,
         },
+       /* Before the netfilter connection tracking, exit from POST_ROUTING */
+       {
+               .hook           = ip_vs_post_routing,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_POST_ROUTING,
+               .priority       = NF_IP6_PRI_NAT_SRC-1,
+       },
  #endif
  };
  
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c

index 7bd41d2..d2d842f 100644 (file)
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -88,6 +88,9 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
  int sysctl_ip_vs_expire_quiescent_template = 0;
  int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
  int sysctl_ip_vs_nat_icmp_send = 0;
+#ifdef CONFIG_IP_VS_NFCT
+int sysctl_ip_vs_conntrack;
+#endif
  
  
  #ifdef CONFIG_IP_VS_DEBUG
@@ -1580,6 +1583,15 @@ static struct ctl_table vs_vars[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_do_defense_mode,
         },
+#ifdef CONFIG_IP_VS_NFCT
+       {
+               .procname       = "conntrack",
+               .data           = &sysctl_ip_vs_conntrack,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
         {
                 .procname       = "secure_tcp",
                 .data           = &sysctl_ip_vs_secure_tcp,
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c

index 7e9af5b..9cd375f 100644 (file)
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -20,17 +20,6 @@
   *
   * Author:     Wouter Gadeyne
   *
- *
- * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
- * http://www.ssi.bg/~ja/nfct/:
- *
- * ip_vs_nfct.c:       Netfilter connection tracking support for IPVS
- *
- * Portions Copyright (C) 2001-2002
- * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
- *
- * Portions Copyright (C) 2003-2008
- * Julian Anastasov
   */
  
  #define KMSG_COMPONENT "IPVS"
@@ -58,16 +47,6 @@
  #define SERVER_STRING "227 Entering Passive Mode ("
  #define CLIENT_STRING "PORT "
  
-#define FMT_TUPLE      "%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T)   &(T)->src.u3.ip, ntohs((T)->src.u.all), \
-                       &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
-                       (T)->dst.protonum
-
-#define FMT_CONN       "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C)    &((C)->caddr.ip), ntohs((C)->cport), \
-                       &((C)->vaddr.ip), ntohs((C)->vport), \
-                       &((C)->daddr.ip), ntohs((C)->dport), \
-                       (C)->protocol, (C)->state
  
  /*
   * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -85,6 +64,8 @@ static int ip_vs_ftp_pasv;
  static int
  ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
  {
+       /* We use connection tracking for the command connection */
+       cp->flags |= IP_VS_CONN_F_NFCT;
         return 0;
  }
  
@@ -148,120 +129,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
         return 1;
  }
  
-/*
- * Called from init_conntrack() as expectfn handler.
- */
-static void
-ip_vs_expect_callback(struct nf_conn *ct,
-                     struct nf_conntrack_expect *exp)
-{
-       struct nf_conntrack_tuple *orig, new_reply;
-       struct ip_vs_conn *cp;
-
-       if (exp->tuple.src.l3num != PF_INET)
-               return;
-
-       /*
-        * We assume that no NF locks are held before this callback.
-        * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
-        * expectations even if they use wildcard values, now we provide the
-        * actual values from the newly created original conntrack direction.
-        * The conntrack is confirmed when packet reaches IPVS hooks.
-        */
-
-       /* RS->CLIENT */
-       orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
-       cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
-                               &orig->src.u3, orig->src.u.tcp.port,
-                               &orig->dst.u3, orig->dst.u.tcp.port);
-       if (cp) {
-               /* Change reply CLIENT->RS to CLIENT->VS */
-               new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-               IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
-                         __func__, ct, ct->status,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
-               new_reply.dst.u3 = cp->vaddr;
-               new_reply.dst.u.tcp.port = cp->vport;
-               IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
-                         ", inout cp=" FMT_CONN "\n",
-                         __func__, ct,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
-               goto alter;
-       }
-
-       /* CLIENT->VS */
-       cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
-                              &orig->src.u3, orig->src.u.tcp.port,
-                              &orig->dst.u3, orig->dst.u.tcp.port);
-       if (cp) {
-               /* Change reply VS->CLIENT to RS->CLIENT */
-               new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-               IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
-                         __func__, ct, ct->status,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
-               new_reply.src.u3 = cp->daddr;
-               new_reply.src.u.tcp.port = cp->dport;
-               IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
-                         FMT_TUPLE ", outin cp=" FMT_CONN "\n",
-                         __func__, ct,
-                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
-                         ARG_CONN(cp));
-               goto alter;
-       }
-
-       IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
-                 " - unknown expect\n",
-                 __func__, ct, ct->status, ARG_TUPLE(orig));
-       return;
-
-alter:
-       /* Never alter conntrack for non-NAT conns */
-       if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
-               nf_conntrack_alter_reply(ct, &new_reply);
-       ip_vs_conn_put(cp);
-       return;
-}
-
-/*
- * Create NF conntrack expectation with wildcard (optional) source port.
- * Then the default callback function will alter the reply and will confirm
- * the conntrack entry when the first packet comes.
- */
-static void
-ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
-                    struct ip_vs_conn *cp, u_int8_t proto,
-                    const __be16 *port, int from_rs)
-{
-       struct nf_conntrack_expect *exp;
-
-       BUG_ON(!ct || ct == &nf_conntrack_untracked);
-
-       exp = nf_ct_expect_alloc(ct);
-       if (!exp)
-               return;
-
-       if (from_rs)
-               nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
-                                 nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
-                                 proto, port, &cp->cport);
-       else
-               nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
-                                 nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
-                                 proto, port, &cp->vport);
-
-       exp->expectfn = ip_vs_expect_callback;
-
-       IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
-                 __func__, ct, ARG_TUPLE(&exp->tuple));
-       nf_ct_expect_related(exp);
-       nf_ct_expect_put(exp);
-}
-
  /*
   * Look at outgoing ftp packets to catch the response to a PASV command
   * from the server (inside-to-outside).
@@ -335,7 +202,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
                                               &cp->caddr, 0,
                                               &cp->vaddr, port,
                                               &from, port,
-                                             IP_VS_CONN_F_NO_CPORT,
+                                             IP_VS_CONN_F_NO_CPORT |
+                                             IP_VS_CONN_F_NFCT,
                                               cp->dest);
                         if (!n_cp)
                                 return 0;
@@ -371,8 +239,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
                                                        start-data, end-start,
                                                        buf, buf_len);
                         if (ret)
-                               ip_vs_expect_related(skb, ct, n_cp,
-                                                    IPPROTO_TCP, NULL, 0);
+                               ip_vs_nfct_expect_related(skb, ct, n_cp,
+                                                         IPPROTO_TCP, 0, 0);
                 }
  
                 /*
@@ -487,7 +355,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
                                       &to, port,
                                       &cp->vaddr, htons(ntohs(cp->vport)-1),
                                       &cp->daddr, htons(ntohs(cp->dport)-1),
-                                     0,
+                                     IP_VS_CONN_F_NFCT,
                                       cp->dest);
                 if (!n_cp)
                         return 0;
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c

new file mode 100644 (file)

index 0000000..c038458
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,292 @@
+/*
+ * ip_vs_nfct.c:       Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg>                Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com>      Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, so
+ * that their proper conntrack state can be seen in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE      "%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T)   &(T)->src.u3.ip, ntohs((T)->src.u.all), \
+                       &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+                       (T)->dst.protonum
+
+#define FMT_CONN       "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C)    &((C)->caddr.ip), ntohs((C)->cport), \
+                       &((C)->vaddr.ip), ntohs((C)->vport), \
+                       &((C)->daddr.ip), ntohs((C)->dport), \
+                       (C)->protocol, (C)->state
+
+/* Alter the reply tuple of an unconfirmed conntrack for NAT (MASQ) conns.
+ * outin != 0: reply source = daddr[:dport], else reply dest = vaddr[:vport].
+ */
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+       enum ip_conntrack_info ctinfo;
+       struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+       struct nf_conntrack_tuple new_tuple;
+
+       if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+           nf_ct_is_dying(ct))
+               return;
+
+       /* Never alter conntrack for non-NAT conns */
+       if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+               return;
+       /* Alter reply only in original direction */
+       if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+               return;
+
+       /*
+        * The connection is not yet in the hashtable, so we update it.
+        * CIP->VIP will remain the same, so leave the tuple in
+        * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
+        * real-server we will see RIP->DIP.
+        */
+       new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+       /* This will also take care of UDP and other protocols. */
+       if (outin) {
+               new_tuple.src.u3 = cp->daddr;
+               if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+                   new_tuple.dst.protonum != IPPROTO_ICMPV6)
+                       new_tuple.src.u.tcp.port = cp->dport;
+       } else {
+               new_tuple.dst.u3 = cp->vaddr;
+               if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+                   new_tuple.dst.protonum != IPPROTO_ICMPV6)
+                       new_tuple.dst.u.tcp.port = cp->vport;
+       }
+       IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+                 "ctinfo=%d, old reply=" FMT_TUPLE
+                 ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+                 __func__, ct, ct->status, ctinfo,
+                 ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+                 ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+       nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+       return nf_conntrack_confirm(skb);       /* verdict; cp is unused */
+}
+
+/* Called from init_conntrack() as expectfn handler.  Looks up the IPVS conn
+ * for the new conntrack and alters the reply tuple for NAT (MASQ) conns.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+       struct nf_conntrack_expect *exp)
+{
+       struct nf_conntrack_tuple *orig, new_reply;
+       struct ip_vs_conn *cp;
+
+       if (exp->tuple.src.l3num != PF_INET)    /* IPv4 expectations only */
+               return;
+
+       /*
+        * We assume that no NF locks are held before this callback.
+        * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+        * expectations even if they use wildcard values, now we provide the
+        * actual values from the newly created original conntrack direction.
+        * The conntrack is confirmed when packet reaches IPVS hooks.
+        */
+
+       /* RS->CLIENT */
+       orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+       cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
+                               &orig->src.u3, orig->src.u.tcp.port,
+                               &orig->dst.u3, orig->dst.u.tcp.port);
+       if (cp) {
+               /* Change reply CLIENT->RS to CLIENT->VS */
+               new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+               IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+                         FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+                         __func__, ct, ct->status,
+                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+                         ARG_CONN(cp));
+               new_reply.dst.u3 = cp->vaddr;
+               new_reply.dst.u.tcp.port = cp->vport;
+               IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+                         ", inout cp=" FMT_CONN "\n",
+                         __func__, ct,
+                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+                         ARG_CONN(cp));
+               goto alter;
+       }
+
+       /* CLIENT->VS */
+       cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
+                              &orig->src.u3, orig->src.u.tcp.port,
+                              &orig->dst.u3, orig->dst.u.tcp.port);
+       if (cp) {
+               /* Change reply VS->CLIENT to RS->CLIENT */
+               new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+               IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+                         FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+                         __func__, ct, ct->status,
+                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+                         ARG_CONN(cp));
+               new_reply.src.u3 = cp->daddr;
+               new_reply.src.u.tcp.port = cp->dport;
+               IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+                         FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+                         __func__, ct,
+                         ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+                         ARG_CONN(cp));
+               goto alter;
+       }
+
+       IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+                 " - unknown expect\n",
+                 __func__, ct, ct->status, ARG_TUPLE(orig));
+       return;
+
+alter:
+       /* Never alter conntrack for non-NAT conns */
+       if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+               nf_conntrack_alter_reply(ct, &new_reply);
+       ip_vs_conn_put(cp);
+       return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port;
+ * use port 0 to expect a connection from any port.  The expectfn callback
+ * will then alter the reply when the first packet comes.  With from_rs set
+ * the expected tuple is daddr->caddr:cport, otherwise caddr->vaddr:vport.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+                              struct ip_vs_conn *cp, u_int8_t proto,
+                              const __be16 port, int from_rs)
+{
+       struct nf_conntrack_expect *exp;
+
+       if (ct == NULL || nf_ct_is_untracked(ct))
+               return;
+
+       exp = nf_ct_expect_alloc(ct);
+       if (!exp)
+               return;
+
+       nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+                       from_rs ? &cp->daddr : &cp->caddr,
+                       from_rs ? &cp->caddr : &cp->vaddr,
+                       proto, port ? &port : NULL,
+                       from_rs ? &cp->cport : &cp->vport);
+
+       exp->expectfn = ip_vs_nfct_expect_callback;
+
+       IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+               __func__, ct, ARG_TUPLE(&exp->tuple));
+       nf_ct_expect_related(exp);
+       nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/* Our connection was terminated, try to drop the conntrack immediately:
+ * look it up by the caddr:cport->vaddr:vport tuple and fire its timer early.
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+       struct nf_conntrack_tuple_hash *h;
+       struct nf_conn *ct;
+       struct nf_conntrack_tuple tuple;
+
+       if (!cp->cport) /* no client port to build the lookup tuple from */
+               return;
+
+       tuple = (struct nf_conntrack_tuple) {
+               .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+       tuple.src.u3 = cp->caddr;
+       tuple.src.u.all = cp->cport;
+       tuple.src.l3num = cp->af;
+       tuple.dst.u3 = cp->vaddr;
+       tuple.dst.u.all = cp->vport;
+
+       IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+               " for conn " FMT_CONN "\n",
+               __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+       h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+       if (h) {
+               ct = nf_ct_tuplehash_to_ctrack(h);
+               /* Show what happens instead of calling nf_ct_kill() */
+               if (del_timer(&ct->timeout)) {
+                       IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+                               FMT_TUPLE "\n",
+                               __func__, ct, ARG_TUPLE(&tuple));
+                       if (ct->timeout.function)
+                               ct->timeout.function(ct->timeout.data);
+               } else {
+                       IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+                               FMT_TUPLE "\n",
+                               __func__, ct, ARG_TUPLE(&tuple));
+               }
+               nf_ct_put(ct);
+       } else {
+               IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+                       __func__, ARG_TUPLE(&tuple));
+       }
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c

index 49df6be..8817afa 100644 (file)
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -28,7 +28,6 @@
  #include <net/ip6_route.h>
  #include <linux/icmpv6.h>
  #include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack.h>
  #include <linux/netfilter_ipv4.h>
  
  #include <net/ip_vs.h>
@@ -194,12 +193,37 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
         dst_release(old_dst);
  }
  
-#define IP_VS_XMIT(pf, skb, rt)                                \
+#define IP_VS_XMIT_TUNNEL(skb, cp)                             \
+({                                                             \
+       int __ret = NF_ACCEPT;                                  \
+                                                               \
+       if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT))          \
+               __ret = ip_vs_confirm_conntrack(skb, cp);       \
+       if (__ret == NF_ACCEPT) {                               \
+               nf_reset(skb);                                  \
+               (skb)->ip_summed = CHECKSUM_NONE;               \
+       }                                                       \
+       __ret;                                                  \
+})
+
+#define IP_VS_XMIT_NAT(pf, skb, cp)                            \
  do {                                                   \
-       (skb)->ipvs_property = 1;                       \
+       if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+               (skb)->ipvs_property = 1;               \
+       else                                            \
+               ip_vs_update_conntrack(skb, cp, 1);     \
         skb_forward_csum(skb);                          \
         NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
-               (rt)->dst.dev, dst_output);             \
+               skb_dst(skb)->dev, dst_output);         \
+} while (0)
+
+#define IP_VS_XMIT(pf, skb, cp)                                \
+do {                                                   \
+       if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+               (skb)->ipvs_property = 1;               \
+       skb_forward_csum(skb);                          \
+       NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL,     \
+               skb_dst(skb)->dev, dst_output);         \
  } while (0)
  
  
@@ -271,7 +295,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
  
         LeaveFunction(10);
         return NF_STOLEN;
@@ -335,7 +359,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
  
         LeaveFunction(10);
         return NF_STOLEN;
@@ -349,36 +373,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
  }
  #endif
  
-void
-ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
-{
-       struct nf_conn *ct = (struct nf_conn *)skb->nfct;
-       struct nf_conntrack_tuple new_tuple;
-
-       if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
-               return;
-
-       /*
-        * The connection is not yet in the hashtable, so we update it.
-        * CIP->VIP will remain the same, so leave the tuple in
-        * IP_CT_DIR_ORIGINAL untouched.  When the reply comes back from the
-        * real-server we will see RIP->DIP.
-        */
-       new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-       if (outin)
-               new_tuple.src.u3 = cp->daddr;
-       else
-               new_tuple.dst.u3 = cp->vaddr;
-       /*
-        * This will also take care of UDP and other protocols.
-        */
-       if (outin)
-               new_tuple.src.u.tcp.port = cp->dport;
-       else
-               new_tuple.dst.u.tcp.port = cp->vport;
-       nf_conntrack_alter_reply(ct, &new_tuple);
-}
-
  /*
   *      NAT transmitter (only for outside-to-inside nat forwarding)
   *      Not used for related ICMP
@@ -434,8 +428,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
  
         IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
  
-       ip_vs_update_conntrack(skb, cp, 1);
-
         /* FIXME: when application helper enlarges the packet and the length
            is larger than the MTU of outgoing device, there will be still
            MTU problem. */
@@ -443,7 +435,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+       IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp);
  
         LeaveFunction(10);
         return NF_STOLEN;
@@ -451,8 +443,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
    tx_error_icmp:
         dst_link_failure(skb);
    tx_error:
-       LeaveFunction(10);
         kfree_skb(skb);
+       LeaveFunction(10);
         return NF_STOLEN;
    tx_error_put:
         ip_rt_put(rt);
@@ -512,8 +504,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
  
         IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
  
-       ip_vs_update_conntrack(skb, cp, 1);
-
         /* FIXME: when application helper enlarges the packet and the length
            is larger than the MTU of outgoing device, there will be still
            MTU problem. */
@@ -521,7 +511,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+       IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp);
  
         LeaveFunction(10);
         return NF_STOLEN;
@@ -571,6 +561,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
         struct iphdr  *iph;                     /* Our new IP header */
         unsigned int max_headroom;              /* The extra header space needed */
         int    mtu;
+       int ret;
  
         EnterFunction(10);
  
@@ -655,7 +646,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       ip_local_out(skb);
+       ret = IP_VS_XMIT_TUNNEL(skb, cp);
+       if (ret == NF_ACCEPT)
+               ip_local_out(skb);
+       else if (ret == NF_DROP)
+               kfree_skb(skb);
  
         LeaveFunction(10);
  
@@ -681,6 +676,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
         struct ipv6hdr  *iph;           /* Our new IP header */
         unsigned int max_headroom;      /* The extra header space needed */
         int    mtu;
+       int ret;
  
         EnterFunction(10);
  
@@ -761,7 +757,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       ip6_local_out(skb);
+       ret = IP_VS_XMIT_TUNNEL(skb, cp);
+       if (ret == NF_ACCEPT)
+               ip6_local_out(skb);
+       else if (ret == NF_DROP)
+               kfree_skb(skb);
  
         LeaveFunction(10);
  
@@ -820,7 +820,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
  
         LeaveFunction(10);
         return NF_STOLEN;
@@ -873,7 +873,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
  
         LeaveFunction(10);
         return NF_STOLEN;
@@ -947,7 +947,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
  
         rc = NF_STOLEN;
         goto out;
@@ -1022,7 +1022,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
         /* Another hack: avoid icmp_send in ip_fragment */
         skb->local_df = 1;
  
-       IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+       IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
  
         rc = NF_STOLEN;
         goto out;
author	Julian Anastasov <ja@ssi.bg>
	Tue, 21 Sep 2010 15:35:41 +0000 (17:35 +0200)
committer	Patrick McHardy <kaber@trash.net>
	Tue, 21 Sep 2010 15:35:41 +0000 (17:35 +0200)
include/linux/ip_vs.h		patch \| blob \| history
include/net/ip_vs.h		patch \| blob \| history
net/netfilter/ipvs/Kconfig		patch \| blob \| history
net/netfilter/ipvs/Makefile		patch \| blob \| history
net/netfilter/ipvs/ip_vs_conn.c		patch \| blob \| history
net/netfilter/ipvs/ip_vs_core.c		patch \| blob \| history
net/netfilter/ipvs/ip_vs_ctl.c		patch \| blob \| history
net/netfilter/ipvs/ip_vs_ftp.c		patch \| blob \| history
net/netfilter/ipvs/ip_vs_nfct.c	[new file with mode: 0644]	patch \| blob
net/netfilter/ipvs/ip_vs_xmit.c		patch \| blob \| history