ipvs: changes for local client
authorJulian Anastasov <ja@ssi.bg>
Sun, 17 Oct 2010 13:40:51 +0000 (16:40 +0300)
committerSimon Horman <horms@verge.net.au>
Thu, 21 Oct 2010 09:04:01 +0000 (11:04 +0200)
This patch deals with local client processing.

Prefer LOCAL_OUT hook for scheduling connections from
local clients. LOCAL_IN is still supported if the packets are
not marked as processed in LOCAL_OUT. The idea to process
requests in LOCAL_OUT is to alter conntrack reply before
it is confirmed at POST_ROUTING. If the local requests are
processed in LOCAL_IN the conntrack can not be updated
and matching by state is impossible.

Add the following handlers:

- ip_vs_reply[46] at LOCAL_IN:99 to process replies from
remote real servers to local clients. Now when both
replies from remote real servers (ip_vs_reply*) and
local real servers (ip_vs_local_reply*) are handled
it is safe to remove the conn_out_get call from ip_vs_in
because it does not support related ICMP packets.

- ip_vs_local_request[46] at LOCAL_OUT:-98 to process
requests from local client

Handling in LOCAL_OUT causes some changes:

- as skb->dev, skb->protocol and skb->pkt_type are not defined
in LOCAL_OUT make sure we set skb->dev before calling icmpv6_send,
prefer skb_dst(skb) for struct net and remove the skb->protocol
checks from TUN transmitters.

[ horms@verge.net.au: removed trailing whitespace ]
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
net/netfilter/ipvs/ip_vs_core.c
net/netfilter/ipvs/ip_vs_xmit.c

index a6c8aff..5fbcf67 100644 (file)
@@ -529,9 +529,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
         * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
         */
 #ifdef CONFIG_IP_VS_IPV6
-       if (svc->af == AF_INET6)
+       if (svc->af == AF_INET6) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
-       else
+       else
 #endif
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
@@ -1065,57 +1070,61 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
         */
        cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
 
-       if (unlikely(!cp)) {
-               if (sysctl_ip_vs_nat_icmp_send &&
-                   (pp->protocol == IPPROTO_TCP ||
-                    pp->protocol == IPPROTO_UDP ||
-                    pp->protocol == IPPROTO_SCTP)) {
-                       __be16 _ports[2], *pptr;
-
-                       pptr = skb_header_pointer(skb, iph.len,
-                                                 sizeof(_ports), _ports);
-                       if (pptr == NULL)
-                               return NF_ACCEPT;       /* Not for me */
-                       if (ip_vs_lookup_real_service(af, iph.protocol,
-                                                     &iph.saddr,
-                                                     pptr[0])) {
-                               /*
-                                * Notify the real server: there is no
-                                * existing entry if it is not RST
-                                * packet or not TCP packet.
-                                */
-                               if ((iph.protocol != IPPROTO_TCP &&
-                                    iph.protocol != IPPROTO_SCTP)
-                                    || ((iph.protocol == IPPROTO_TCP
-                                         && !is_tcp_reset(skb, iph.len))
-                                        || (iph.protocol == IPPROTO_SCTP
-                                               && !is_sctp_abort(skb,
-                                                       iph.len)))) {
+       if (likely(cp))
+               return handle_response(af, skb, pp, cp, iph.len);
+       if (sysctl_ip_vs_nat_icmp_send &&
+           (pp->protocol == IPPROTO_TCP ||
+            pp->protocol == IPPROTO_UDP ||
+            pp->protocol == IPPROTO_SCTP)) {
+               __be16 _ports[2], *pptr;
+
+               pptr = skb_header_pointer(skb, iph.len,
+                                         sizeof(_ports), _ports);
+               if (pptr == NULL)
+                       return NF_ACCEPT;       /* Not for me */
+               if (ip_vs_lookup_real_service(af, iph.protocol,
+                                             &iph.saddr,
+                                             pptr[0])) {
+                       /*
+                        * Notify the real server: there is no
+                        * existing entry if it is not RST
+                        * packet or not TCP packet.
+                        */
+                       if ((iph.protocol != IPPROTO_TCP &&
+                            iph.protocol != IPPROTO_SCTP)
+                            || ((iph.protocol == IPPROTO_TCP
+                                 && !is_tcp_reset(skb, iph.len))
+                                || (iph.protocol == IPPROTO_SCTP
+                                       && !is_sctp_abort(skb,
+                                               iph.len)))) {
 #ifdef CONFIG_IP_VS_IPV6
-                                       if (af == AF_INET6)
-                                               icmpv6_send(skb,
-                                                           ICMPV6_DEST_UNREACH,
-                                                           ICMPV6_PORT_UNREACH,
-                                                           0);
-                                       else
+                               if (af == AF_INET6) {
+                                       struct net *net =
+                                               dev_net(skb_dst(skb)->dev);
+
+                                       if (!skb->dev)
+                                               skb->dev = net->loopback_dev;
+                                       icmpv6_send(skb,
+                                                   ICMPV6_DEST_UNREACH,
+                                                   ICMPV6_PORT_UNREACH,
+                                                   0);
+                               } else
 #endif
-                                               icmp_send(skb,
-                                                         ICMP_DEST_UNREACH,
-                                                         ICMP_PORT_UNREACH, 0);
-                                       return NF_DROP;
-                               }
+                                       icmp_send(skb,
+                                                 ICMP_DEST_UNREACH,
+                                                 ICMP_PORT_UNREACH, 0);
+                               return NF_DROP;
                        }
                }
-               IP_VS_DBG_PKT(12, pp, skb, 0,
-                             "packet continues traversal as normal");
-               return NF_ACCEPT;
        }
-
-       return handle_response(af, skb, pp, cp, iph.len);
+       IP_VS_DBG_PKT(12, pp, skb, 0,
+                     "ip_vs_out: packet continues traversal as normal");
+       return NF_ACCEPT;
 }
 
 /*
- *     It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
+ *     It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *     used only for VS/NAT.
  *     Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
@@ -1147,7 +1156,8 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
 #ifdef CONFIG_IP_VS_IPV6
 
 /*
- *     It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
+ *     It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
+ *     used only for VS/NAT.
  *     Check if packet is reply for established ip_vs_conn.
  */
 static unsigned int
@@ -1404,34 +1414,43 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
  *     and send it on its way...
  */
 static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
-        const struct net_device *in, const struct net_device *out,
-        int (*okfn)(struct sk_buff *))
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 {
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_conn *cp;
-       int ret, restart, af, pkts;
+       int ret, restart, pkts;
 
        /* Already marked as IPVS request or reply? */
        if (skb->ipvs_property)
                return NF_ACCEPT;
 
-       af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
-
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-
        /*
-        *      Big tappo: only PACKET_HOST, including loopback for local client
-        *      Don't handle local packets on IPv6 for now
+        *      Big tappo:
+        *      - remote client: only PACKET_HOST
+        *      - route: used for struct net when skb->dev is unset
         */
-       if (unlikely(skb->pkt_type != PACKET_HOST)) {
-               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
-                             skb->pkt_type,
-                             iph.protocol,
-                             IP_VS_DBG_ADDR(af, &iph.daddr));
+       if (unlikely((skb->pkt_type != PACKET_HOST &&
+                     hooknum != NF_INET_LOCAL_OUT) ||
+                    !skb_dst(skb))) {
+               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+                             " ignored in hook %u\n",
+                             skb->pkt_type, iph.protocol,
+                             IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
                return NF_ACCEPT;
        }
+       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+
+       /* Bad... Do not break raw sockets */
+       if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+                    af == AF_INET)) {
+               struct sock *sk = skb->sk;
+               struct inet_sock *inet = inet_sk(skb->sk);
+
+               if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+                       return NF_ACCEPT;
+       }
 
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
@@ -1467,11 +1486,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        if (unlikely(!cp)) {
                int v;
 
-               /* For local client packets, it could be a response */
-               cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
-               if (cp)
-                       return handle_response(af, skb, pp, cp, iph.len);
-
                if (!pp->conn_schedule(af, skb, pp, &v, &cp))
                        return v;
        }
@@ -1479,7 +1493,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
        if (unlikely(!cp)) {
                /* sorry, all this trouble for a no-hit :) */
                IP_VS_DBG_PKT(12, pp, skb, 0,
-                             "packet continues traversal as normal");
+                             "ip_vs_in: packet continues traversal as normal");
                return NF_ACCEPT;
        }
 
@@ -1550,6 +1564,72 @@ out:
        return ret;
 }
 
+/*
+ *     AF_INET handler in NF_INET_LOCAL_IN chain
+ *     Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
+                     const struct net_device *in,
+                     const struct net_device *out,
+                     int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_in(hooknum, skb, AF_INET);
+}
+
+/*
+ *     AF_INET handler in NF_INET_LOCAL_OUT chain
+ *     Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
+                    const struct net_device *in, const struct net_device *out,
+                    int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_in(hooknum, skb, AF_INET);
+       local_bh_enable();
+       return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ *     AF_INET6 handler in NF_INET_LOCAL_IN chain
+ *     Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
+                     const struct net_device *in,
+                     const struct net_device *out,
+                     int (*okfn)(struct sk_buff *))
+{
+       return ip_vs_in(hooknum, skb, AF_INET6);
+}
+
+/*
+ *     AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ *     Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
+                    const struct net_device *in, const struct net_device *out,
+                    int (*okfn)(struct sk_buff *))
+{
+       unsigned int verdict;
+
+       /* Disable BH in LOCAL_OUT until all places are fixed */
+       local_bh_disable();
+       verdict = ip_vs_in(hooknum, skb, AF_INET6);
+       local_bh_enable();
+       return verdict;
+}
+
+#endif
+
 
 /*
  *     It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
@@ -1590,15 +1670,23 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
 
 
 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_reply4,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 99,
+       },
        /* After packet filtering, forward packet through VS/DR, VS/TUN,
         * or VS/NAT(change destination), so that filtering rules can be
         * applied to IPVS. */
        {
-               .hook           = ip_vs_in,
+               .hook           = ip_vs_remote_request4,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_LOCAL_IN,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 101,
        },
        /* Before ip_vs_in, change source only for VS/NAT */
        {
@@ -1608,14 +1696,22 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = -99,
        },
+       /* After mangle, schedule and forward local requests */
+       {
+               .hook           = ip_vs_local_request4,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -98,
+       },
        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
        {
                .hook           = ip_vs_forward_icmp,
                .owner          = THIS_MODULE,
                .pf             = PF_INET,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 99,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,
        },
        /* After packet filtering, change source only for VS/NAT */
        {
@@ -1626,15 +1722,23 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                .priority       = 100,
        },
 #ifdef CONFIG_IP_VS_IPV6
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_reply6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 99,
+       },
        /* After packet filtering, forward packet through VS/DR, VS/TUN,
         * or VS/NAT(change destination), so that filtering rules can be
         * applied to IPVS. */
        {
-               .hook           = ip_vs_in,
+               .hook           = ip_vs_remote_request6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_LOCAL_IN,
-               .priority       = 100,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 101,
        },
        /* Before ip_vs_in, change source only for VS/NAT */
        {
@@ -1644,14 +1748,22 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                .hooknum        = NF_INET_LOCAL_OUT,
                .priority       = -99,
        },
+       /* After mangle, schedule and forward local requests */
+       {
+               .hook           = ip_vs_local_request6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_LOCAL_OUT,
+               .priority       = -98,
+       },
        /* After packet filtering (but before ip_vs_out_icmp), catch icmp
         * destined for 0.0.0.0/0, which is for incoming IPVS connections */
        {
                .hook           = ip_vs_forward_icmp_v6,
                .owner          = THIS_MODULE,
                .pf             = PF_INET6,
-               .hooknum        = NF_INET_FORWARD,
-               .priority       = 99,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,
        },
        /* After packet filtering, change source only for VS/NAT */
        {
index 8608882..97b5361 100644 (file)
  *
  * Changes:
  *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
+ * - not all connections have destination server, for example,
+ * connections in backup server when fwmark is used
+ * - bypass connections use daddr from packet
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -452,8 +462,13 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
-               dst_release(&rt->dst);
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+               dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error;
        }
@@ -659,6 +674,11 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP_VS_DBG_RL_PKT(0, pp, skb, 0,
                                 "ip_vs_nat_xmit_v6(): frag needed for");
@@ -748,13 +768,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
        EnterFunction(10);
 
-       if (skb->protocol != htons(ETH_P_IP)) {
-               IP_VS_DBG_RL("%s(): protocol error, "
-                            "ETH_P_IP: %d, skb protocol: %d\n",
-                            __func__, htons(ETH_P_IP), skb->protocol);
-               goto tx_error;
-       }
-
        if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
                                      RT_TOS(tos), 1|2)))
                goto tx_error_icmp;
@@ -869,13 +882,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
        EnterFunction(10);
 
-       if (skb->protocol != htons(ETH_P_IPV6)) {
-               IP_VS_DBG_RL("%s(): protocol error, "
-                            "ETH_P_IPV6: %d, skb protocol: %d\n",
-                            __func__, htons(ETH_P_IPV6), skb->protocol);
-               goto tx_error;
-       }
-
        if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
                                         &saddr, 1, 1|2)))
                goto tx_error_icmp;
@@ -896,6 +902,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
 
        if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error_put;
@@ -1053,6 +1064,11 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                dst_release(&rt->dst);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -1271,6 +1287,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
+               if (!skb->dev) {
+                       struct net *net = dev_net(skb_dst(skb)->dev);
+
+                       skb->dev = net->loopback_dev;
+               }
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP_VS_DBG_RL("%s(): frag needed\n", __func__);
                goto tx_error_put;