net: Put flowi_* prefix on AF independent members of struct flowi
[pandora-kernel.git] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
/*
 * Mutex for the IPVS sockopts. A sleeping lock is used because
 * [gs]etsockopt may sleep.
 */
static DEFINE_MUTEX(__ip_vs_mutex);

/*
 * Read/write lock for the virtual service tables: taken for reading by
 * service lookups and for writing when a service's destinations change.
 */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
#ifdef CONFIG_IP_VS_DEBUG
/* Current IPVS debug verbosity; only built with CONFIG_IP_VS_DEBUG. */
static int sysctl_ip_vs_debug_level;

/*
 *      Accessor for the debug verbosity so that code outside this file
 *      does not need direct access to the variable.
 */
int ip_vs_get_debug_level(void)
{
        int level = sysctl_ip_vs_debug_level;

        return level;
}
#endif /* CONFIG_IP_VS_DEBUG */
71
72 #ifdef CONFIG_IP_VS_IPV6
73 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
74 static int __ip_vs_addr_is_local_v6(struct net *net,
75                                     const struct in6_addr *addr)
76 {
77         struct rt6_info *rt;
78         struct flowi fl = {
79                 .flowi_oif = 0,
80                 .fl6_dst = *addr,
81                 .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
82         };
83
84         rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
85         if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
86                         return 1;
87
88         return 0;
89 }
90 #endif
/*
 *      Re-evaluate the three defense strategies (drop_entry, drop_packet,
 *      secure_tcp) of @ipvs against the currently available memory.
 *
 *      For each strategy the sysctl value selects the mode:
 *        0 - never active
 *        1 - automatic, currently inactive (switches to 2 when memory is low)
 *        2 - automatic, currently active (switches back to 1 when memory
 *            is no longer low)
 *        3 - always active
 *
 *      update_defense_level is called from keventd and from sysctl,
 *      so it needs to protect itself from softirqs
 */
static void update_defense_level(struct netns_ipvs *ipvs)
{
        struct sysinfo i;
        /*
         * NOTE(review): function-static, so this single value is shared by
         * every netns_ipvs instance passed to this function.
         */
        static int old_secure_tcp = 0;
        int availmem;
        int nomem;
        /* -1: no timeout table change needed; >= 0: change requested */
        int to_change = -1;

        /* we only count free and buffered memory (in pages) */
        si_meminfo(&i);
        availmem = i.freeram + i.bufferram;
        /* however in linux 2.5 the i.bufferram is total page cache size,
           we need adjust it */
        /* si_swapinfo(&i); */
        /* availmem = availmem - (i.totalswap - i.freeswap); */

        /* memory is considered short when below the amemthresh sysctl */
        nomem = (availmem < ipvs->sysctl_amemthresh);

        local_bh_disable();

        /* drop_entry */
        spin_lock(&ipvs->dropentry_lock);
        switch (ipvs->sysctl_drop_entry) {
        case 0:
                atomic_set(&ipvs->dropentry, 0);
                break;
        case 1:
                if (nomem) {
                        atomic_set(&ipvs->dropentry, 1);
                        ipvs->sysctl_drop_entry = 2;
                } else {
                        atomic_set(&ipvs->dropentry, 0);
                }
                break;
        case 2:
                if (nomem) {
                        atomic_set(&ipvs->dropentry, 1);
                } else {
                        atomic_set(&ipvs->dropentry, 0);
                        ipvs->sysctl_drop_entry = 1;
                };
                break;
        case 3:
                atomic_set(&ipvs->dropentry, 1);
                break;
        }
        spin_unlock(&ipvs->dropentry_lock);

        /* drop_packet */
        spin_lock(&ipvs->droppacket_lock);
        switch (ipvs->sysctl_drop_packet) {
        case 0:
                ipvs->drop_rate = 0;
                break;
        case 1:
                if (nomem) {
                        /*
                         * nomem guarantees amemthresh > availmem, so the
                         * divisor below is positive.
                         */
                        ipvs->drop_rate = ipvs->drop_counter
                                = ipvs->sysctl_amemthresh /
                                (ipvs->sysctl_amemthresh-availmem);
                        ipvs->sysctl_drop_packet = 2;
                } else {
                        ipvs->drop_rate = 0;
                }
                break;
        case 2:
                if (nomem) {
                        ipvs->drop_rate = ipvs->drop_counter
                                = ipvs->sysctl_amemthresh /
                                (ipvs->sysctl_amemthresh-availmem);
                } else {
                        ipvs->drop_rate = 0;
                        ipvs->sysctl_drop_packet = 1;
                }
                break;
        case 3:
                /* always on: use the administrator supplied rate */
                ipvs->drop_rate = ipvs->sysctl_am_droprate;
                break;
        }
        spin_unlock(&ipvs->droppacket_lock);

        /* secure_tcp */
        spin_lock(&ipvs->securetcp_lock);
        switch (ipvs->sysctl_secure_tcp) {
        case 0:
                if (old_secure_tcp >= 2)
                        to_change = 0;
                break;
        case 1:
                if (nomem) {
                        if (old_secure_tcp < 2)
                                to_change = 1;
                        ipvs->sysctl_secure_tcp = 2;
                } else {
                        if (old_secure_tcp >= 2)
                                to_change = 0;
                }
                break;
        case 2:
                if (nomem) {
                        if (old_secure_tcp < 2)
                                to_change = 1;
                } else {
                        if (old_secure_tcp >= 2)
                                to_change = 0;
                        ipvs->sysctl_secure_tcp = 1;
                }
                break;
        case 3:
                if (old_secure_tcp < 2)
                        to_change = 1;
                break;
        }
        /* remember the mode so the next run can detect a transition */
        old_secure_tcp = ipvs->sysctl_secure_tcp;
        if (to_change >= 0)
                ip_vs_protocol_timeout_change(ipvs,
                                              ipvs->sysctl_secure_tcp > 1);
        spin_unlock(&ipvs->securetcp_lock);

        local_bh_enable();
}
215
216
217 /*
218  *      Timer for checking the defense
219  */
220 #define DEFENSE_TIMER_PERIOD    1*HZ
221
222 static void defense_work_handler(struct work_struct *work)
223 {
224         struct netns_ipvs *ipvs =
225                 container_of(work, struct netns_ipvs, defense_work.work);
226
227         update_defense_level(ipvs);
228         if (atomic_read(&ipvs->dropentry))
229                 ip_vs_random_dropentry(ipvs->net);
230         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
231 }
232
233 int
234 ip_vs_use_count_inc(void)
235 {
236         return try_module_get(THIS_MODULE);
237 }
238
239 void
240 ip_vs_use_count_dec(void)
241 {
242         module_put(THIS_MODULE);
243 }
244
245
/*
 *      Hash table: for virtual service lookups
 *
 *      Both tables have IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
 *      buckets; IP_VS_SVC_TAB_MASK reduces a hash value to a bucket index.
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port> */
static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
257
258
259 /*
260  *      Returns hash value for virtual service
261  */
262 static inline unsigned
263 ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
264                   const union nf_inet_addr *addr, __be16 port)
265 {
266         register unsigned porth = ntohs(port);
267         __be32 addr_fold = addr->ip;
268
269 #ifdef CONFIG_IP_VS_IPV6
270         if (af == AF_INET6)
271                 addr_fold = addr->ip6[0]^addr->ip6[1]^
272                             addr->ip6[2]^addr->ip6[3];
273 #endif
274         addr_fold ^= ((size_t)net>>8);
275
276         return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
277                 & IP_VS_SVC_TAB_MASK;
278 }
279
280 /*
281  *      Returns hash value of fwmark for virtual service lookup
282  */
283 static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
284 {
285         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
286 }
287
288 /*
289  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
290  *      or in the ip_vs_svc_fwm_table by fwmark.
291  *      Should be called with locked tables.
292  */
293 static int ip_vs_svc_hash(struct ip_vs_service *svc)
294 {
295         unsigned hash;
296
297         if (svc->flags & IP_VS_SVC_F_HASHED) {
298                 pr_err("%s(): request for already hashed, called from %pF\n",
299                        __func__, __builtin_return_address(0));
300                 return 0;
301         }
302
303         if (svc->fwmark == 0) {
304                 /*
305                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
306                  */
307                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
308                                          &svc->addr, svc->port);
309                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
310         } else {
311                 /*
312                  *  Hash it by fwmark in svc_fwm_table
313                  */
314                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
315                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
316         }
317
318         svc->flags |= IP_VS_SVC_F_HASHED;
319         /* increase its refcnt because it is referenced by the svc table */
320         atomic_inc(&svc->refcnt);
321         return 1;
322 }
323
324
325 /*
326  *      Unhashes a service from svc_table / svc_fwm_table.
327  *      Should be called with locked tables.
328  */
329 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
330 {
331         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
332                 pr_err("%s(): request for unhash flagged, called from %pF\n",
333                        __func__, __builtin_return_address(0));
334                 return 0;
335         }
336
337         if (svc->fwmark == 0) {
338                 /* Remove it from the svc_table table */
339                 list_del(&svc->s_list);
340         } else {
341                 /* Remove it from the svc_fwm_table table */
342                 list_del(&svc->f_list);
343         }
344
345         svc->flags &= ~IP_VS_SVC_F_HASHED;
346         atomic_dec(&svc->refcnt);
347         return 1;
348 }
349
350
351 /*
352  *      Get service by {netns, proto,addr,port} in the service table.
353  */
354 static inline struct ip_vs_service *
355 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
356                      const union nf_inet_addr *vaddr, __be16 vport)
357 {
358         unsigned hash;
359         struct ip_vs_service *svc;
360
361         /* Check for "full" addressed entries */
362         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
363
364         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
365                 if ((svc->af == af)
366                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
367                     && (svc->port == vport)
368                     && (svc->protocol == protocol)
369                     && net_eq(svc->net, net)) {
370                         /* HIT */
371                         return svc;
372                 }
373         }
374
375         return NULL;
376 }
377
378
379 /*
380  *      Get service by {fwmark} in the service table.
381  */
382 static inline struct ip_vs_service *
383 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
384 {
385         unsigned hash;
386         struct ip_vs_service *svc;
387
388         /* Check for fwmark addressed entries */
389         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
390
391         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
392                 if (svc->fwmark == fwmark && svc->af == af
393                     && net_eq(svc->net, net)) {
394                         /* HIT */
395                         return svc;
396                 }
397         }
398
399         return NULL;
400 }
401
402 struct ip_vs_service *
403 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
404                   const union nf_inet_addr *vaddr, __be16 vport)
405 {
406         struct ip_vs_service *svc;
407         struct netns_ipvs *ipvs = net_ipvs(net);
408
409         read_lock(&__ip_vs_svc_lock);
410
411         /*
412          *      Check the table hashed by fwmark first
413          */
414         svc = __ip_vs_svc_fwm_find(net, af, fwmark);
415         if (fwmark && svc)
416                 goto out;
417
418         /*
419          *      Check the table hashed by <protocol,addr,port>
420          *      for "full" addressed entries
421          */
422         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
423
424         if (svc == NULL
425             && protocol == IPPROTO_TCP
426             && atomic_read(&ipvs->ftpsvc_counter)
427             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
428                 /*
429                  * Check if ftp service entry exists, the packet
430                  * might belong to FTP data connections.
431                  */
432                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
433         }
434
435         if (svc == NULL
436             && atomic_read(&ipvs->nullsvc_counter)) {
437                 /*
438                  * Check if the catch-all port (port zero) exists
439                  */
440                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
441         }
442
443   out:
444         if (svc)
445                 atomic_inc(&svc->usecnt);
446         read_unlock(&__ip_vs_svc_lock);
447
448         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
449                       fwmark, ip_vs_proto_name(protocol),
450                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
451                       svc ? "hit" : "not hit");
452
453         return svc;
454 }
455
456
457 static inline void
458 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
459 {
460         atomic_inc(&svc->refcnt);
461         dest->svc = svc;
462 }
463
464 static void
465 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
466 {
467         struct ip_vs_service *svc = dest->svc;
468
469         dest->svc = NULL;
470         if (atomic_dec_and_test(&svc->refcnt)) {
471                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
472                               svc->fwmark,
473                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
474                               ntohs(svc->port), atomic_read(&svc->usecnt));
475                 free_percpu(svc->stats.cpustats);
476                 kfree(svc);
477         }
478 }
479
480
481 /*
482  *      Returns hash value for real service
483  */
484 static inline unsigned ip_vs_rs_hashkey(int af,
485                                             const union nf_inet_addr *addr,
486                                             __be16 port)
487 {
488         register unsigned porth = ntohs(port);
489         __be32 addr_fold = addr->ip;
490
491 #ifdef CONFIG_IP_VS_IPV6
492         if (af == AF_INET6)
493                 addr_fold = addr->ip6[0]^addr->ip6[1]^
494                             addr->ip6[2]^addr->ip6[3];
495 #endif
496
497         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
498                 & IP_VS_RTAB_MASK;
499 }
500
501 /*
502  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
503  *      should be called with locked tables.
504  */
505 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
506 {
507         unsigned hash;
508
509         if (!list_empty(&dest->d_list)) {
510                 return 0;
511         }
512
513         /*
514          *      Hash by proto,addr,port,
515          *      which are the parameters of the real service.
516          */
517         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
518
519         list_add(&dest->d_list, &ipvs->rs_table[hash]);
520
521         return 1;
522 }
523
524 /*
525  *      UNhashes ip_vs_dest from rs_table.
526  *      should be called with locked tables.
527  */
528 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
529 {
530         /*
531          * Remove it from the rs_table table.
532          */
533         if (!list_empty(&dest->d_list)) {
534                 list_del(&dest->d_list);
535                 INIT_LIST_HEAD(&dest->d_list);
536         }
537
538         return 1;
539 }
540
541 /*
542  *      Lookup real service by <proto,addr,port> in the real service table.
543  */
544 struct ip_vs_dest *
545 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
546                           const union nf_inet_addr *daddr,
547                           __be16 dport)
548 {
549         struct netns_ipvs *ipvs = net_ipvs(net);
550         unsigned hash;
551         struct ip_vs_dest *dest;
552
553         /*
554          *      Check for "full" addressed entries
555          *      Return the first found entry
556          */
557         hash = ip_vs_rs_hashkey(af, daddr, dport);
558
559         read_lock(&ipvs->rs_lock);
560         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
561                 if ((dest->af == af)
562                     && ip_vs_addr_equal(af, &dest->addr, daddr)
563                     && (dest->port == dport)
564                     && ((dest->protocol == protocol) ||
565                         dest->vfwmark)) {
566                         /* HIT */
567                         read_unlock(&ipvs->rs_lock);
568                         return dest;
569                 }
570         }
571         read_unlock(&ipvs->rs_lock);
572
573         return NULL;
574 }
575
576 /*
577  *      Lookup destination by {addr,port} in the given service
578  */
579 static struct ip_vs_dest *
580 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
581                   __be16 dport)
582 {
583         struct ip_vs_dest *dest;
584
585         /*
586          * Find the destination for the given service
587          */
588         list_for_each_entry(dest, &svc->destinations, n_list) {
589                 if ((dest->af == svc->af)
590                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
591                     && (dest->port == dport)) {
592                         /* HIT */
593                         return dest;
594                 }
595         }
596
597         return NULL;
598 }
599
600 /*
601  * Find destination by {daddr,dport,vaddr,protocol}
602  * Cretaed to be used in ip_vs_process_message() in
603  * the backup synchronization daemon. It finds the
604  * destination to be bound to the received connection
605  * on the backup.
606  *
607  * ip_vs_lookup_real_service() looked promissing, but
608  * seems not working as expected.
609  */
610 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
611                                    const union nf_inet_addr *daddr,
612                                    __be16 dport,
613                                    const union nf_inet_addr *vaddr,
614                                    __be16 vport, __u16 protocol, __u32 fwmark)
615 {
616         struct ip_vs_dest *dest;
617         struct ip_vs_service *svc;
618
619         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
620         if (!svc)
621                 return NULL;
622         dest = ip_vs_lookup_dest(svc, daddr, dport);
623         if (dest)
624                 atomic_inc(&dest->refcnt);
625         ip_vs_service_put(svc);
626         return dest;
627 }
628
629 /*
630  *  Lookup dest by {svc,addr,port} in the destination trash.
631  *  The destination trash is used to hold the destinations that are removed
632  *  from the service table but are still referenced by some conn entries.
633  *  The reason to add the destination trash is when the dest is temporary
634  *  down (either by administrator or by monitor program), the dest can be
635  *  picked back from the trash, the remaining connections to the dest can
636  *  continue, and the counting information of the dest is also useful for
637  *  scheduling.
638  */
639 static struct ip_vs_dest *
640 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
641                      __be16 dport)
642 {
643         struct ip_vs_dest *dest, *nxt;
644         struct netns_ipvs *ipvs = net_ipvs(svc->net);
645
646         /*
647          * Find the destination in trash
648          */
649         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
650                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
651                               "dest->refcnt=%d\n",
652                               dest->vfwmark,
653                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
654                               ntohs(dest->port),
655                               atomic_read(&dest->refcnt));
656                 if (dest->af == svc->af &&
657                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
658                     dest->port == dport &&
659                     dest->vfwmark == svc->fwmark &&
660                     dest->protocol == svc->protocol &&
661                     (svc->fwmark ||
662                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
663                       dest->vport == svc->port))) {
664                         /* HIT */
665                         return dest;
666                 }
667
668                 /*
669                  * Try to purge the destination from trash if not referenced
670                  */
671                 if (atomic_read(&dest->refcnt) == 1) {
672                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
673                                       "from trash\n",
674                                       dest->vfwmark,
675                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
676                                       ntohs(dest->port));
677                         list_del(&dest->n_list);
678                         ip_vs_dst_reset(dest);
679                         __ip_vs_unbind_svc(dest);
680                         free_percpu(dest->stats.cpustats);
681                         kfree(dest);
682                 }
683         }
684
685         return NULL;
686 }
687
688
689 /*
690  *  Clean up all the destinations in the trash
691  *  Called by the ip_vs_control_cleanup()
692  *
693  *  When the ip_vs_control_clearup is activated by ipvs module exit,
694  *  the service tables must have been flushed and all the connections
695  *  are expired, and the refcnt of each destination in the trash must
696  *  be 1, so we simply release them here.
697  */
698 static void ip_vs_trash_cleanup(struct net *net)
699 {
700         struct ip_vs_dest *dest, *nxt;
701         struct netns_ipvs *ipvs = net_ipvs(net);
702
703         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
704                 list_del(&dest->n_list);
705                 ip_vs_dst_reset(dest);
706                 __ip_vs_unbind_svc(dest);
707                 free_percpu(dest->stats.cpustats);
708                 kfree(dest);
709         }
710 }
711
712
713 static void
714 ip_vs_zero_stats(struct ip_vs_stats *stats)
715 {
716         spin_lock_bh(&stats->lock);
717
718         memset(&stats->ustats, 0, sizeof(stats->ustats));
719         ip_vs_zero_estimator(stats);
720
721         spin_unlock_bh(&stats->lock);
722 }
723
724 /*
725  *      Update a destination in the given service
726  */
727 static void
728 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
729                     struct ip_vs_dest_user_kern *udest, int add)
730 {
731         struct netns_ipvs *ipvs = net_ipvs(svc->net);
732         int conn_flags;
733
734         /* set the weight and the flags */
735         atomic_set(&dest->weight, udest->weight);
736         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
737         conn_flags |= IP_VS_CONN_F_INACTIVE;
738
739         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
740         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
741                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
742         } else {
743                 /*
744                  *    Put the real service in rs_table if not present.
745                  *    For now only for NAT!
746                  */
747                 write_lock_bh(&ipvs->rs_lock);
748                 ip_vs_rs_hash(ipvs, dest);
749                 write_unlock_bh(&ipvs->rs_lock);
750         }
751         atomic_set(&dest->conn_flags, conn_flags);
752
753         /* bind the service */
754         if (!dest->svc) {
755                 __ip_vs_bind_svc(dest, svc);
756         } else {
757                 if (dest->svc != svc) {
758                         __ip_vs_unbind_svc(dest);
759                         ip_vs_zero_stats(&dest->stats);
760                         __ip_vs_bind_svc(dest, svc);
761                 }
762         }
763
764         /* set the dest status flags */
765         dest->flags |= IP_VS_DEST_F_AVAILABLE;
766
767         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
768                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
769         dest->u_threshold = udest->u_threshold;
770         dest->l_threshold = udest->l_threshold;
771
772         spin_lock_bh(&dest->dst_lock);
773         ip_vs_dst_reset(dest);
774         spin_unlock_bh(&dest->dst_lock);
775
776         if (add)
777                 ip_vs_new_estimator(svc->net, &dest->stats);
778
779         write_lock_bh(&__ip_vs_svc_lock);
780
781         /* Wait until all other svc users go away */
782         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
783
784         if (add) {
785                 list_add(&dest->n_list, &svc->destinations);
786                 svc->num_dests++;
787         }
788
789         /* call the update_service, because server weight may be changed */
790         if (svc->scheduler->update_service)
791                 svc->scheduler->update_service(svc);
792
793         write_unlock_bh(&__ip_vs_svc_lock);
794 }
795
796
797 /*
798  *      Create a destination for the given service
799  */
800 static int
801 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
802                struct ip_vs_dest **dest_p)
803 {
804         struct ip_vs_dest *dest;
805         unsigned atype;
806
807         EnterFunction(2);
808
809 #ifdef CONFIG_IP_VS_IPV6
810         if (svc->af == AF_INET6) {
811                 atype = ipv6_addr_type(&udest->addr.in6);
812                 if ((!(atype & IPV6_ADDR_UNICAST) ||
813                         atype & IPV6_ADDR_LINKLOCAL) &&
814                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
815                         return -EINVAL;
816         } else
817 #endif
818         {
819                 atype = inet_addr_type(svc->net, udest->addr.ip);
820                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
821                         return -EINVAL;
822         }
823
824         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
825         if (dest == NULL) {
826                 pr_err("%s(): no memory.\n", __func__);
827                 return -ENOMEM;
828         }
829         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
830         if (!dest->stats.cpustats) {
831                 pr_err("%s() alloc_percpu failed\n", __func__);
832                 goto err_alloc;
833         }
834
835         dest->af = svc->af;
836         dest->protocol = svc->protocol;
837         dest->vaddr = svc->addr;
838         dest->vport = svc->port;
839         dest->vfwmark = svc->fwmark;
840         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
841         dest->port = udest->port;
842
843         atomic_set(&dest->activeconns, 0);
844         atomic_set(&dest->inactconns, 0);
845         atomic_set(&dest->persistconns, 0);
846         atomic_set(&dest->refcnt, 1);
847
848         INIT_LIST_HEAD(&dest->d_list);
849         spin_lock_init(&dest->dst_lock);
850         spin_lock_init(&dest->stats.lock);
851         __ip_vs_update_dest(svc, dest, udest, 1);
852
853         *dest_p = dest;
854
855         LeaveFunction(2);
856         return 0;
857
858 err_alloc:
859         kfree(dest);
860         return -ENOMEM;
861 }
862
863
864 /*
865  *      Add a destination into an existing service
866  */
867 static int
868 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
869 {
870         struct ip_vs_dest *dest;
871         union nf_inet_addr daddr;
872         __be16 dport = udest->port;
873         int ret;
874
875         EnterFunction(2);
876
877         if (udest->weight < 0) {
878                 pr_err("%s(): server weight less than zero\n", __func__);
879                 return -ERANGE;
880         }
881
882         if (udest->l_threshold > udest->u_threshold) {
883                 pr_err("%s(): lower threshold is higher than upper threshold\n",
884                         __func__);
885                 return -ERANGE;
886         }
887
888         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
889
890         /*
891          * Check if the dest already exists in the list
892          */
893         dest = ip_vs_lookup_dest(svc, &daddr, dport);
894
895         if (dest != NULL) {
896                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
897                 return -EEXIST;
898         }
899
900         /*
901          * Check if the dest already exists in the trash and
902          * is from the same service
903          */
904         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
905
906         if (dest != NULL) {
907                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
908                               "dest->refcnt=%d, service %u/%s:%u\n",
909                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
910                               atomic_read(&dest->refcnt),
911                               dest->vfwmark,
912                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
913                               ntohs(dest->vport));
914
915                 /*
916                  * Get the destination from the trash
917                  */
918                 list_del(&dest->n_list);
919
920                 __ip_vs_update_dest(svc, dest, udest, 1);
921                 ret = 0;
922         } else {
923                 /*
924                  * Allocate and initialize the dest structure
925                  */
926                 ret = ip_vs_new_dest(svc, udest, &dest);
927         }
928         LeaveFunction(2);
929
930         return ret;
931 }
932
933
934 /*
935  *      Edit a destination in the given service
936  */
937 static int
938 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
939 {
940         struct ip_vs_dest *dest;
941         union nf_inet_addr daddr;
942         __be16 dport = udest->port;
943
944         EnterFunction(2);
945
946         if (udest->weight < 0) {
947                 pr_err("%s(): server weight less than zero\n", __func__);
948                 return -ERANGE;
949         }
950
951         if (udest->l_threshold > udest->u_threshold) {
952                 pr_err("%s(): lower threshold is higher than upper threshold\n",
953                         __func__);
954                 return -ERANGE;
955         }
956
957         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
958
959         /*
960          *  Lookup the destination list
961          */
962         dest = ip_vs_lookup_dest(svc, &daddr, dport);
963
964         if (dest == NULL) {
965                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
966                 return -ENOENT;
967         }
968
969         __ip_vs_update_dest(svc, dest, udest, 0);
970         LeaveFunction(2);
971
972         return 0;
973 }
974
975
976 /*
977  *      Delete a destination (must be already unlinked from the service)
978  */
979 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
980 {
981         struct netns_ipvs *ipvs = net_ipvs(net);
982
983         ip_vs_kill_estimator(net, &dest->stats);
984
985         /*
986          *  Remove it from the d-linked list with the real services.
987          */
988         write_lock_bh(&ipvs->rs_lock);
989         ip_vs_rs_unhash(dest);
990         write_unlock_bh(&ipvs->rs_lock);
991
992         /*
993          *  Decrease the refcnt of the dest, and free the dest
994          *  if nobody refers to it (refcnt=0). Otherwise, throw
995          *  the destination into the trash.
996          */
997         if (atomic_dec_and_test(&dest->refcnt)) {
998                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
999                               dest->vfwmark,
1000                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1001                               ntohs(dest->port));
1002                 ip_vs_dst_reset(dest);
1003                 /* simply decrease svc->refcnt here, let the caller check
1004                    and release the service if nobody refers to it.
1005                    Only user context can release destination and service,
1006                    and only one user context can update virtual service at a
1007                    time, so the operation here is OK */
1008                 atomic_dec(&dest->svc->refcnt);
1009                 free_percpu(dest->stats.cpustats);
1010                 kfree(dest);
1011         } else {
1012                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1013                               "dest->refcnt=%d\n",
1014                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1015                               ntohs(dest->port),
1016                               atomic_read(&dest->refcnt));
1017                 list_add(&dest->n_list, &ipvs->dest_trash);
1018                 atomic_inc(&dest->refcnt);
1019         }
1020 }
1021
1022
1023 /*
1024  *      Unlink a destination from the given service
1025  */
1026 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1027                                 struct ip_vs_dest *dest,
1028                                 int svcupd)
1029 {
1030         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1031
1032         /*
1033          *  Remove it from the d-linked destination list.
1034          */
1035         list_del(&dest->n_list);
1036         svc->num_dests--;
1037
1038         /*
1039          *  Call the update_service function of its scheduler
1040          */
1041         if (svcupd && svc->scheduler->update_service)
1042                         svc->scheduler->update_service(svc);
1043 }
1044
1045
1046 /*
1047  *      Delete a destination server in the given service
1048  */
1049 static int
1050 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1051 {
1052         struct ip_vs_dest *dest;
1053         __be16 dport = udest->port;
1054
1055         EnterFunction(2);
1056
1057         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1058
1059         if (dest == NULL) {
1060                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1061                 return -ENOENT;
1062         }
1063
1064         write_lock_bh(&__ip_vs_svc_lock);
1065
1066         /*
1067          *      Wait until all other svc users go away.
1068          */
1069         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1070
1071         /*
1072          *      Unlink dest from the service
1073          */
1074         __ip_vs_unlink_dest(svc, dest, 1);
1075
1076         write_unlock_bh(&__ip_vs_svc_lock);
1077
1078         /*
1079          *      Delete the destination
1080          */
1081         __ip_vs_del_dest(svc->net, dest);
1082
1083         LeaveFunction(2);
1084
1085         return 0;
1086 }
1087
1088
1089 /*
1090  *      Add a service into the service hash table
1091  */
1092 static int
1093 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1094                   struct ip_vs_service **svc_p)
1095 {
1096         int ret = 0;
1097         struct ip_vs_scheduler *sched = NULL;
1098         struct ip_vs_pe *pe = NULL;
1099         struct ip_vs_service *svc = NULL;
1100         struct netns_ipvs *ipvs = net_ipvs(net);
1101
1102         /* increase the module use count */
1103         ip_vs_use_count_inc();
1104
1105         /* Lookup the scheduler by 'u->sched_name' */
1106         sched = ip_vs_scheduler_get(u->sched_name);
1107         if (sched == NULL) {
1108                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1109                 ret = -ENOENT;
1110                 goto out_err;
1111         }
1112
1113         if (u->pe_name && *u->pe_name) {
1114                 pe = ip_vs_pe_getbyname(u->pe_name);
1115                 if (pe == NULL) {
1116                         pr_info("persistence engine module ip_vs_pe_%s "
1117                                 "not found\n", u->pe_name);
1118                         ret = -ENOENT;
1119                         goto out_err;
1120                 }
1121         }
1122
1123 #ifdef CONFIG_IP_VS_IPV6
1124         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1125                 ret = -EINVAL;
1126                 goto out_err;
1127         }
1128 #endif
1129
1130         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1131         if (svc == NULL) {
1132                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1133                 ret = -ENOMEM;
1134                 goto out_err;
1135         }
1136         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1137         if (!svc->stats.cpustats) {
1138                 pr_err("%s() alloc_percpu failed\n", __func__);
1139                 goto out_err;
1140         }
1141
1142         /* I'm the first user of the service */
1143         atomic_set(&svc->usecnt, 0);
1144         atomic_set(&svc->refcnt, 0);
1145
1146         svc->af = u->af;
1147         svc->protocol = u->protocol;
1148         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1149         svc->port = u->port;
1150         svc->fwmark = u->fwmark;
1151         svc->flags = u->flags;
1152         svc->timeout = u->timeout * HZ;
1153         svc->netmask = u->netmask;
1154         svc->net = net;
1155
1156         INIT_LIST_HEAD(&svc->destinations);
1157         rwlock_init(&svc->sched_lock);
1158         spin_lock_init(&svc->stats.lock);
1159
1160         /* Bind the scheduler */
1161         ret = ip_vs_bind_scheduler(svc, sched);
1162         if (ret)
1163                 goto out_err;
1164         sched = NULL;
1165
1166         /* Bind the ct retriever */
1167         ip_vs_bind_pe(svc, pe);
1168         pe = NULL;
1169
1170         /* Update the virtual service counters */
1171         if (svc->port == FTPPORT)
1172                 atomic_inc(&ipvs->ftpsvc_counter);
1173         else if (svc->port == 0)
1174                 atomic_inc(&ipvs->nullsvc_counter);
1175
1176         ip_vs_new_estimator(net, &svc->stats);
1177
1178         /* Count only IPv4 services for old get/setsockopt interface */
1179         if (svc->af == AF_INET)
1180                 ipvs->num_services++;
1181
1182         /* Hash the service into the service table */
1183         write_lock_bh(&__ip_vs_svc_lock);
1184         ip_vs_svc_hash(svc);
1185         write_unlock_bh(&__ip_vs_svc_lock);
1186
1187         *svc_p = svc;
1188         return 0;
1189
1190
1191  out_err:
1192         if (svc != NULL) {
1193                 ip_vs_unbind_scheduler(svc);
1194                 if (svc->inc) {
1195                         local_bh_disable();
1196                         ip_vs_app_inc_put(svc->inc);
1197                         local_bh_enable();
1198                 }
1199                 if (svc->stats.cpustats)
1200                         free_percpu(svc->stats.cpustats);
1201                 kfree(svc);
1202         }
1203         ip_vs_scheduler_put(sched);
1204         ip_vs_pe_put(pe);
1205
1206         /* decrease the module use count */
1207         ip_vs_use_count_dec();
1208
1209         return ret;
1210 }
1211
1212
1213 /*
1214  *      Edit a service and bind it with a new scheduler
1215  */
1216 static int
1217 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1218 {
1219         struct ip_vs_scheduler *sched, *old_sched;
1220         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1221         int ret = 0;
1222
1223         /*
1224          * Lookup the scheduler, by 'u->sched_name'
1225          */
1226         sched = ip_vs_scheduler_get(u->sched_name);
1227         if (sched == NULL) {
1228                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1229                 return -ENOENT;
1230         }
1231         old_sched = sched;
1232
1233         if (u->pe_name && *u->pe_name) {
1234                 pe = ip_vs_pe_getbyname(u->pe_name);
1235                 if (pe == NULL) {
1236                         pr_info("persistence engine module ip_vs_pe_%s "
1237                                 "not found\n", u->pe_name);
1238                         ret = -ENOENT;
1239                         goto out;
1240                 }
1241                 old_pe = pe;
1242         }
1243
1244 #ifdef CONFIG_IP_VS_IPV6
1245         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1246                 ret = -EINVAL;
1247                 goto out;
1248         }
1249 #endif
1250
1251         write_lock_bh(&__ip_vs_svc_lock);
1252
1253         /*
1254          * Wait until all other svc users go away.
1255          */
1256         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1257
1258         /*
1259          * Set the flags and timeout value
1260          */
1261         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1262         svc->timeout = u->timeout * HZ;
1263         svc->netmask = u->netmask;
1264
1265         old_sched = svc->scheduler;
1266         if (sched != old_sched) {
1267                 /*
1268                  * Unbind the old scheduler
1269                  */
1270                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1271                         old_sched = sched;
1272                         goto out_unlock;
1273                 }
1274
1275                 /*
1276                  * Bind the new scheduler
1277                  */
1278                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1279                         /*
1280                          * If ip_vs_bind_scheduler fails, restore the old
1281                          * scheduler.
1282                          * The main reason of failure is out of memory.
1283                          *
1284                          * The question is if the old scheduler can be
1285                          * restored all the time. TODO: if it cannot be
1286                          * restored some time, we must delete the service,
1287                          * otherwise the system may crash.
1288                          */
1289                         ip_vs_bind_scheduler(svc, old_sched);
1290                         old_sched = sched;
1291                         goto out_unlock;
1292                 }
1293         }
1294
1295         old_pe = svc->pe;
1296         if (pe != old_pe) {
1297                 ip_vs_unbind_pe(svc);
1298                 ip_vs_bind_pe(svc, pe);
1299         }
1300
1301   out_unlock:
1302         write_unlock_bh(&__ip_vs_svc_lock);
1303   out:
1304         ip_vs_scheduler_put(old_sched);
1305         ip_vs_pe_put(old_pe);
1306         return ret;
1307 }
1308
1309
1310 /*
1311  *      Delete a service from the service list
1312  *      - The service must be unlinked, unlocked and not referenced!
1313  *      - We are called under _bh lock
1314  */
1315 static void __ip_vs_del_service(struct ip_vs_service *svc)
1316 {
1317         struct ip_vs_dest *dest, *nxt;
1318         struct ip_vs_scheduler *old_sched;
1319         struct ip_vs_pe *old_pe;
1320         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1321
1322         pr_info("%s: enter\n", __func__);
1323
1324         /* Count only IPv4 services for old get/setsockopt interface */
1325         if (svc->af == AF_INET)
1326                 ipvs->num_services--;
1327
1328         ip_vs_kill_estimator(svc->net, &svc->stats);
1329
1330         /* Unbind scheduler */
1331         old_sched = svc->scheduler;
1332         ip_vs_unbind_scheduler(svc);
1333         ip_vs_scheduler_put(old_sched);
1334
1335         /* Unbind persistence engine */
1336         old_pe = svc->pe;
1337         ip_vs_unbind_pe(svc);
1338         ip_vs_pe_put(old_pe);
1339
1340         /* Unbind app inc */
1341         if (svc->inc) {
1342                 ip_vs_app_inc_put(svc->inc);
1343                 svc->inc = NULL;
1344         }
1345
1346         /*
1347          *    Unlink the whole destination list
1348          */
1349         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1350                 __ip_vs_unlink_dest(svc, dest, 0);
1351                 __ip_vs_del_dest(svc->net, dest);
1352         }
1353
1354         /*
1355          *    Update the virtual service counters
1356          */
1357         if (svc->port == FTPPORT)
1358                 atomic_dec(&ipvs->ftpsvc_counter);
1359         else if (svc->port == 0)
1360                 atomic_dec(&ipvs->nullsvc_counter);
1361
1362         /*
1363          *    Free the service if nobody refers to it
1364          */
1365         if (atomic_read(&svc->refcnt) == 0) {
1366                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1367                               svc->fwmark,
1368                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1369                               ntohs(svc->port), atomic_read(&svc->usecnt));
1370                 free_percpu(svc->stats.cpustats);
1371                 kfree(svc);
1372         }
1373
1374         /* decrease the module use count */
1375         ip_vs_use_count_dec();
1376 }
1377
1378 /*
1379  * Unlink a service from list and try to delete it if its refcnt reached 0
1380  */
1381 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1382 {
1383         /*
1384          * Unhash it from the service table
1385          */
1386         write_lock_bh(&__ip_vs_svc_lock);
1387
1388         ip_vs_svc_unhash(svc);
1389
1390         /*
1391          * Wait until all the svc users go away.
1392          */
1393         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1394
1395         __ip_vs_del_service(svc);
1396
1397         write_unlock_bh(&__ip_vs_svc_lock);
1398 }
1399
1400 /*
1401  *      Delete a service from the service list
1402  */
1403 static int ip_vs_del_service(struct ip_vs_service *svc)
1404 {
1405         if (svc == NULL)
1406                 return -EEXIST;
1407         ip_vs_unlink_service(svc);
1408
1409         return 0;
1410 }
1411
1412
1413 /*
1414  *      Flush all the virtual services
1415  */
1416 static int ip_vs_flush(struct net *net)
1417 {
1418         int idx;
1419         struct ip_vs_service *svc, *nxt;
1420
1421         /*
1422          * Flush the service table hashed by <netns,protocol,addr,port>
1423          */
1424         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1425                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1426                                          s_list) {
1427                         if (net_eq(svc->net, net))
1428                                 ip_vs_unlink_service(svc);
1429                 }
1430         }
1431
1432         /*
1433          * Flush the service table hashed by fwmark
1434          */
1435         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1436                 list_for_each_entry_safe(svc, nxt,
1437                                          &ip_vs_svc_fwm_table[idx], f_list) {
1438                         if (net_eq(svc->net, net))
1439                                 ip_vs_unlink_service(svc);
1440                 }
1441         }
1442
1443         return 0;
1444 }
1445
1446
1447 /*
1448  *      Zero counters in a service or all services
1449  */
1450 static int ip_vs_zero_service(struct ip_vs_service *svc)
1451 {
1452         struct ip_vs_dest *dest;
1453
1454         write_lock_bh(&__ip_vs_svc_lock);
1455         list_for_each_entry(dest, &svc->destinations, n_list) {
1456                 ip_vs_zero_stats(&dest->stats);
1457         }
1458         ip_vs_zero_stats(&svc->stats);
1459         write_unlock_bh(&__ip_vs_svc_lock);
1460         return 0;
1461 }
1462
1463 static int ip_vs_zero_all(struct net *net)
1464 {
1465         int idx;
1466         struct ip_vs_service *svc;
1467
1468         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1469                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1470                         if (net_eq(svc->net, net))
1471                                 ip_vs_zero_service(svc);
1472                 }
1473         }
1474
1475         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1476                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1477                         if (net_eq(svc->net, net))
1478                                 ip_vs_zero_service(svc);
1479                 }
1480         }
1481
1482         ip_vs_zero_stats(net_ipvs(net)->tot_stats);
1483         return 0;
1484 }
1485
1486
1487 static int
1488 proc_do_defense_mode(ctl_table *table, int write,
1489                      void __user *buffer, size_t *lenp, loff_t *ppos)
1490 {
1491         struct net *net = current->nsproxy->net_ns;
1492         int *valp = table->data;
1493         int val = *valp;
1494         int rc;
1495
1496         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1497         if (write && (*valp != val)) {
1498                 if ((*valp < 0) || (*valp > 3)) {
1499                         /* Restore the correct value */
1500                         *valp = val;
1501                 } else {
1502                         update_defense_level(net_ipvs(net));
1503                 }
1504         }
1505         return rc;
1506 }
1507
1508
1509 static int
1510 proc_do_sync_threshold(ctl_table *table, int write,
1511                        void __user *buffer, size_t *lenp, loff_t *ppos)
1512 {
1513         int *valp = table->data;
1514         int val[2];
1515         int rc;
1516
1517         /* backup the value first */
1518         memcpy(val, valp, sizeof(val));
1519
1520         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1521         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1522                 /* Restore the correct value */
1523                 memcpy(valp, val, sizeof(val));
1524         }
1525         return rc;
1526 }
1527
1528 static int
1529 proc_do_sync_mode(ctl_table *table, int write,
1530                      void __user *buffer, size_t *lenp, loff_t *ppos)
1531 {
1532         int *valp = table->data;
1533         int val = *valp;
1534         int rc;
1535
1536         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1537         if (write && (*valp != val)) {
1538                 if ((*valp < 0) || (*valp > 1)) {
1539                         /* Restore the correct value */
1540                         *valp = val;
1541                 } else {
1542                         struct net *net = current->nsproxy->net_ns;
1543                         ip_vs_sync_switch_mode(net, val);
1544                 }
1545         }
1546         return rc;
1547 }
1548
1549 /*
1550  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1551  *      Do not change order or insert new entries without
1552  *      align with netns init in __ip_vs_control_init()
1553  */
1554
1555 static struct ctl_table vs_vars[] = {
1556         {
1557                 .procname       = "amemthresh",
1558                 .maxlen         = sizeof(int),
1559                 .mode           = 0644,
1560                 .proc_handler   = proc_dointvec,
1561         },
1562         {
1563                 .procname       = "am_droprate",
1564                 .maxlen         = sizeof(int),
1565                 .mode           = 0644,
1566                 .proc_handler   = proc_dointvec,
1567         },
1568         {
1569                 .procname       = "drop_entry",
1570                 .maxlen         = sizeof(int),
1571                 .mode           = 0644,
1572                 .proc_handler   = proc_do_defense_mode,
1573         },
1574         {
1575                 .procname       = "drop_packet",
1576                 .maxlen         = sizeof(int),
1577                 .mode           = 0644,
1578                 .proc_handler   = proc_do_defense_mode,
1579         },
1580 #ifdef CONFIG_IP_VS_NFCT
1581         {
1582                 .procname       = "conntrack",
1583                 .maxlen         = sizeof(int),
1584                 .mode           = 0644,
1585                 .proc_handler   = &proc_dointvec,
1586         },
1587 #endif
1588         {
1589                 .procname       = "secure_tcp",
1590                 .maxlen         = sizeof(int),
1591                 .mode           = 0644,
1592                 .proc_handler   = proc_do_defense_mode,
1593         },
1594         {
1595                 .procname       = "snat_reroute",
1596                 .maxlen         = sizeof(int),
1597                 .mode           = 0644,
1598                 .proc_handler   = &proc_dointvec,
1599         },
1600         {
1601                 .procname       = "sync_version",
1602                 .maxlen         = sizeof(int),
1603                 .mode           = 0644,
1604                 .proc_handler   = &proc_do_sync_mode,
1605         },
1606         {
1607                 .procname       = "cache_bypass",
1608                 .maxlen         = sizeof(int),
1609                 .mode           = 0644,
1610                 .proc_handler   = proc_dointvec,
1611         },
1612         {
1613                 .procname       = "expire_nodest_conn",
1614                 .maxlen         = sizeof(int),
1615                 .mode           = 0644,
1616                 .proc_handler   = proc_dointvec,
1617         },
1618         {
1619                 .procname       = "expire_quiescent_template",
1620                 .maxlen         = sizeof(int),
1621                 .mode           = 0644,
1622                 .proc_handler   = proc_dointvec,
1623         },
1624         {
1625                 .procname       = "sync_threshold",
1626                 .maxlen         =
1627                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1628                 .mode           = 0644,
1629                 .proc_handler   = proc_do_sync_threshold,
1630         },
1631         {
1632                 .procname       = "nat_icmp_send",
1633                 .maxlen         = sizeof(int),
1634                 .mode           = 0644,
1635                 .proc_handler   = proc_dointvec,
1636         },
1637 #ifdef CONFIG_IP_VS_DEBUG
1638         {
1639                 .procname       = "debug_level",
1640                 .data           = &sysctl_ip_vs_debug_level,
1641                 .maxlen         = sizeof(int),
1642                 .mode           = 0644,
1643                 .proc_handler   = proc_dointvec,
1644         },
1645 #endif
1646 #if 0
1647         {
1648                 .procname       = "timeout_established",
1649                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1650                 .maxlen         = sizeof(int),
1651                 .mode           = 0644,
1652                 .proc_handler   = proc_dointvec_jiffies,
1653         },
1654         {
1655                 .procname       = "timeout_synsent",
1656                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1657                 .maxlen         = sizeof(int),
1658                 .mode           = 0644,
1659                 .proc_handler   = proc_dointvec_jiffies,
1660         },
1661         {
1662                 .procname       = "timeout_synrecv",
1663                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1664                 .maxlen         = sizeof(int),
1665                 .mode           = 0644,
1666                 .proc_handler   = proc_dointvec_jiffies,
1667         },
1668         {
1669                 .procname       = "timeout_finwait",
1670                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1671                 .maxlen         = sizeof(int),
1672                 .mode           = 0644,
1673                 .proc_handler   = proc_dointvec_jiffies,
1674         },
1675         {
1676                 .procname       = "timeout_timewait",
1677                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1678                 .maxlen         = sizeof(int),
1679                 .mode           = 0644,
1680                 .proc_handler   = proc_dointvec_jiffies,
1681         },
1682         {
1683                 .procname       = "timeout_close",
1684                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1685                 .maxlen         = sizeof(int),
1686                 .mode           = 0644,
1687                 .proc_handler   = proc_dointvec_jiffies,
1688         },
1689         {
1690                 .procname       = "timeout_closewait",
1691                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1692                 .maxlen         = sizeof(int),
1693                 .mode           = 0644,
1694                 .proc_handler   = proc_dointvec_jiffies,
1695         },
1696         {
1697                 .procname       = "timeout_lastack",
1698                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1699                 .maxlen         = sizeof(int),
1700                 .mode           = 0644,
1701                 .proc_handler   = proc_dointvec_jiffies,
1702         },
1703         {
1704                 .procname       = "timeout_listen",
1705                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1706                 .maxlen         = sizeof(int),
1707                 .mode           = 0644,
1708                 .proc_handler   = proc_dointvec_jiffies,
1709         },
1710         {
1711                 .procname       = "timeout_synack",
1712                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1713                 .maxlen         = sizeof(int),
1714                 .mode           = 0644,
1715                 .proc_handler   = proc_dointvec_jiffies,
1716         },
1717         {
1718                 .procname       = "timeout_udp",
1719                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1720                 .maxlen         = sizeof(int),
1721                 .mode           = 0644,
1722                 .proc_handler   = proc_dointvec_jiffies,
1723         },
1724         {
1725                 .procname       = "timeout_icmp",
1726                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1727                 .maxlen         = sizeof(int),
1728                 .mode           = 0644,
1729                 .proc_handler   = proc_dointvec_jiffies,
1730         },
1731 #endif
1732         { }
1733 };
1734
1735 const struct ctl_path net_vs_ctl_path[] = {
1736         { .procname = "net", },
1737         { .procname = "ipv4", },
1738         { .procname = "vs", },
1739         { }
1740 };
1741 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1742
1743 #ifdef CONFIG_PROC_FS
1744
1745 struct ip_vs_iter {
1746         struct seq_net_private p;  /* Do not move this, netns depends upon it*/
1747         struct list_head *table;
1748         int bucket;
1749 };
1750
1751 /*
1752  *      Write the contents of the VS rule table to a PROCfs file.
1753  *      (It is kept just for backward compatibility)
1754  */
1755 static inline const char *ip_vs_fwd_name(unsigned flags)
1756 {
1757         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1758         case IP_VS_CONN_F_LOCALNODE:
1759                 return "Local";
1760         case IP_VS_CONN_F_TUNNEL:
1761                 return "Tunnel";
1762         case IP_VS_CONN_F_DROUTE:
1763                 return "Route";
1764         default:
1765                 return "Masq";
1766         }
1767 }
1768
1769
1770 /* Get the Nth entry in the two lists */
1771 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1772 {
1773         struct net *net = seq_file_net(seq);
1774         struct ip_vs_iter *iter = seq->private;
1775         int idx;
1776         struct ip_vs_service *svc;
1777
1778         /* look in hash by protocol */
1779         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1780                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1781                         if (net_eq(svc->net, net) && pos-- == 0) {
1782                                 iter->table = ip_vs_svc_table;
1783                                 iter->bucket = idx;
1784                                 return svc;
1785                         }
1786                 }
1787         }
1788
1789         /* keep looking in fwmark */
1790         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1791                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1792                         if (net_eq(svc->net, net) && pos-- == 0) {
1793                                 iter->table = ip_vs_svc_fwm_table;
1794                                 iter->bucket = idx;
1795                                 return svc;
1796                         }
1797                 }
1798         }
1799
1800         return NULL;
1801 }
1802
1803 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1804 __acquires(__ip_vs_svc_lock)
1805 {
1806
1807         read_lock_bh(&__ip_vs_svc_lock);
1808         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1809 }
1810
1811
1812 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1813 {
1814         struct list_head *e;
1815         struct ip_vs_iter *iter;
1816         struct ip_vs_service *svc;
1817
1818         ++*pos;
1819         if (v == SEQ_START_TOKEN)
1820                 return ip_vs_info_array(seq,0);
1821
1822         svc = v;
1823         iter = seq->private;
1824
1825         if (iter->table == ip_vs_svc_table) {
1826                 /* next service in table hashed by protocol */
1827                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1828                         return list_entry(e, struct ip_vs_service, s_list);
1829
1830
1831                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1832                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1833                                             s_list) {
1834                                 return svc;
1835                         }
1836                 }
1837
1838                 iter->table = ip_vs_svc_fwm_table;
1839                 iter->bucket = -1;
1840                 goto scan_fwmark;
1841         }
1842
1843         /* next service in hashed by fwmark */
1844         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1845                 return list_entry(e, struct ip_vs_service, f_list);
1846
1847  scan_fwmark:
1848         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1849                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1850                                     f_list)
1851                         return svc;
1852         }
1853
1854         return NULL;
1855 }
1856
1857 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1858 __releases(__ip_vs_svc_lock)
1859 {
1860         read_unlock_bh(&__ip_vs_svc_lock);
1861 }
1862
1863
1864 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1865 {
1866         if (v == SEQ_START_TOKEN) {
1867                 seq_printf(seq,
1868                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1869                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
1870                 seq_puts(seq,
1871                          "Prot LocalAddress:Port Scheduler Flags\n");
1872                 seq_puts(seq,
1873                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1874         } else {
1875                 const struct ip_vs_service *svc = v;
1876                 const struct ip_vs_iter *iter = seq->private;
1877                 const struct ip_vs_dest *dest;
1878
1879                 if (iter->table == ip_vs_svc_table) {
1880 #ifdef CONFIG_IP_VS_IPV6
1881                         if (svc->af == AF_INET6)
1882                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
1883                                            ip_vs_proto_name(svc->protocol),
1884                                            &svc->addr.in6,
1885                                            ntohs(svc->port),
1886                                            svc->scheduler->name);
1887                         else
1888 #endif
1889                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
1890                                            ip_vs_proto_name(svc->protocol),
1891                                            ntohl(svc->addr.ip),
1892                                            ntohs(svc->port),
1893                                            svc->scheduler->name,
1894                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1895                 } else {
1896                         seq_printf(seq, "FWM  %08X %s %s",
1897                                    svc->fwmark, svc->scheduler->name,
1898                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1899                 }
1900
1901                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1902                         seq_printf(seq, "persistent %d %08X\n",
1903                                 svc->timeout,
1904                                 ntohl(svc->netmask));
1905                 else
1906                         seq_putc(seq, '\n');
1907
1908                 list_for_each_entry(dest, &svc->destinations, n_list) {
1909 #ifdef CONFIG_IP_VS_IPV6
1910                         if (dest->af == AF_INET6)
1911                                 seq_printf(seq,
1912                                            "  -> [%pI6]:%04X"
1913                                            "      %-7s %-6d %-10d %-10d\n",
1914                                            &dest->addr.in6,
1915                                            ntohs(dest->port),
1916                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1917                                            atomic_read(&dest->weight),
1918                                            atomic_read(&dest->activeconns),
1919                                            atomic_read(&dest->inactconns));
1920                         else
1921 #endif
1922                                 seq_printf(seq,
1923                                            "  -> %08X:%04X      "
1924                                            "%-7s %-6d %-10d %-10d\n",
1925                                            ntohl(dest->addr.ip),
1926                                            ntohs(dest->port),
1927                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1928                                            atomic_read(&dest->weight),
1929                                            atomic_read(&dest->activeconns),
1930                                            atomic_read(&dest->inactconns));
1931
1932                 }
1933         }
1934         return 0;
1935 }
1936
1937 static const struct seq_operations ip_vs_info_seq_ops = {
1938         .start = ip_vs_info_seq_start,
1939         .next  = ip_vs_info_seq_next,
1940         .stop  = ip_vs_info_seq_stop,
1941         .show  = ip_vs_info_seq_show,
1942 };
1943
1944 static int ip_vs_info_open(struct inode *inode, struct file *file)
1945 {
1946         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
1947                         sizeof(struct ip_vs_iter));
1948 }
1949
1950 static const struct file_operations ip_vs_info_fops = {
1951         .owner   = THIS_MODULE,
1952         .open    = ip_vs_info_open,
1953         .read    = seq_read,
1954         .llseek  = seq_lseek,
1955         .release = seq_release_private,
1956 };
1957
1958 #endif
1959
1960 #ifdef CONFIG_PROC_FS
1961 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1962 {
1963         struct net *net = seq_file_single_net(seq);
1964         struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
1965
1966 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1967         seq_puts(seq,
1968                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1969         seq_printf(seq,
1970                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1971
1972         spin_lock_bh(&tot_stats->lock);
1973         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns,
1974                    tot_stats->ustats.inpkts, tot_stats->ustats.outpkts,
1975                    (unsigned long long) tot_stats->ustats.inbytes,
1976                    (unsigned long long) tot_stats->ustats.outbytes);
1977
1978 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1979         seq_puts(seq,
1980                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1981         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1982                         tot_stats->ustats.cps,
1983                         tot_stats->ustats.inpps,
1984                         tot_stats->ustats.outpps,
1985                         tot_stats->ustats.inbps,
1986                         tot_stats->ustats.outbps);
1987         spin_unlock_bh(&tot_stats->lock);
1988
1989         return 0;
1990 }
1991
1992 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1993 {
1994         return single_open_net(inode, file, ip_vs_stats_show);
1995 }
1996
1997 static const struct file_operations ip_vs_stats_fops = {
1998         .owner = THIS_MODULE,
1999         .open = ip_vs_stats_seq_open,
2000         .read = seq_read,
2001         .llseek = seq_lseek,
2002         .release = single_release,
2003 };
2004
2005 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2006 {
2007         struct net *net = seq_file_single_net(seq);
2008         struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
2009         int i;
2010
2011 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2012         seq_puts(seq,
2013                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2014         seq_printf(seq,
2015                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2016
2017         for_each_possible_cpu(i) {
2018                 struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i);
2019                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2020                             i, u->ustats.conns, u->ustats.inpkts,
2021                             u->ustats.outpkts, (__u64)u->ustats.inbytes,
2022                             (__u64)u->ustats.outbytes);
2023         }
2024
2025         spin_lock_bh(&tot_stats->lock);
2026         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2027                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2028                    tot_stats->ustats.outpkts,
2029                    (unsigned long long) tot_stats->ustats.inbytes,
2030                    (unsigned long long) tot_stats->ustats.outbytes);
2031
2032 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2033         seq_puts(seq,
2034                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2035         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2036                         tot_stats->ustats.cps,
2037                         tot_stats->ustats.inpps,
2038                         tot_stats->ustats.outpps,
2039                         tot_stats->ustats.inbps,
2040                         tot_stats->ustats.outbps);
2041         spin_unlock_bh(&tot_stats->lock);
2042
2043         return 0;
2044 }
2045
2046 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2047 {
2048         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2049 }
2050
2051 static const struct file_operations ip_vs_stats_percpu_fops = {
2052         .owner = THIS_MODULE,
2053         .open = ip_vs_stats_percpu_seq_open,
2054         .read = seq_read,
2055         .llseek = seq_lseek,
2056         .release = single_release,
2057 };
2058 #endif
2059
2060 /*
2061  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2062  */
2063 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2064 {
2065 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2066         struct ip_vs_proto_data *pd;
2067 #endif
2068
2069         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2070                   u->tcp_timeout,
2071                   u->tcp_fin_timeout,
2072                   u->udp_timeout);
2073
2074 #ifdef CONFIG_IP_VS_PROTO_TCP
2075         if (u->tcp_timeout) {
2076                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2077                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2078                         = u->tcp_timeout * HZ;
2079         }
2080
2081         if (u->tcp_fin_timeout) {
2082                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2083                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2084                         = u->tcp_fin_timeout * HZ;
2085         }
2086 #endif
2087
2088 #ifdef CONFIG_IP_VS_PROTO_UDP
2089         if (u->udp_timeout) {
2090                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2091                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2092                         = u->udp_timeout * HZ;
2093         }
2094 #endif
2095         return 0;
2096 }
2097
2098
2099 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2100 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2101 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2102                                  sizeof(struct ip_vs_dest_user))
2103 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2104 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2105 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2106
2107 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2108         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2109         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2110         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2111         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2112         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2113         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2114         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2115         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2116         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2117         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2118         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2119 };
2120
2121 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2122                                   struct ip_vs_service_user *usvc_compat)
2123 {
2124         memset(usvc, 0, sizeof(*usvc));
2125
2126         usvc->af                = AF_INET;
2127         usvc->protocol          = usvc_compat->protocol;
2128         usvc->addr.ip           = usvc_compat->addr;
2129         usvc->port              = usvc_compat->port;
2130         usvc->fwmark            = usvc_compat->fwmark;
2131
2132         /* Deep copy of sched_name is not needed here */
2133         usvc->sched_name        = usvc_compat->sched_name;
2134
2135         usvc->flags             = usvc_compat->flags;
2136         usvc->timeout           = usvc_compat->timeout;
2137         usvc->netmask           = usvc_compat->netmask;
2138 }
2139
2140 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2141                                    struct ip_vs_dest_user *udest_compat)
2142 {
2143         memset(udest, 0, sizeof(*udest));
2144
2145         udest->addr.ip          = udest_compat->addr;
2146         udest->port             = udest_compat->port;
2147         udest->conn_flags       = udest_compat->conn_flags;
2148         udest->weight           = udest_compat->weight;
2149         udest->u_threshold      = udest_compat->u_threshold;
2150         udest->l_threshold      = udest_compat->l_threshold;
2151 }
2152
2153 static int
2154 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2155 {
2156         struct net *net = sock_net(sk);
2157         int ret;
2158         unsigned char arg[MAX_ARG_LEN];
2159         struct ip_vs_service_user *usvc_compat;
2160         struct ip_vs_service_user_kern usvc;
2161         struct ip_vs_service *svc;
2162         struct ip_vs_dest_user *udest_compat;
2163         struct ip_vs_dest_user_kern udest;
2164
2165         if (!capable(CAP_NET_ADMIN))
2166                 return -EPERM;
2167
2168         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2169                 return -EINVAL;
2170         if (len < 0 || len >  MAX_ARG_LEN)
2171                 return -EINVAL;
2172         if (len != set_arglen[SET_CMDID(cmd)]) {
2173                 pr_err("set_ctl: len %u != %u\n",
2174                        len, set_arglen[SET_CMDID(cmd)]);
2175                 return -EINVAL;
2176         }
2177
2178         if (copy_from_user(arg, user, len) != 0)
2179                 return -EFAULT;
2180
2181         /* increase the module use count */
2182         ip_vs_use_count_inc();
2183
2184         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2185                 ret = -ERESTARTSYS;
2186                 goto out_dec;
2187         }
2188
2189         if (cmd == IP_VS_SO_SET_FLUSH) {
2190                 /* Flush the virtual service */
2191                 ret = ip_vs_flush(net);
2192                 goto out_unlock;
2193         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2194                 /* Set timeout values for (tcp tcpfin udp) */
2195                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2196                 goto out_unlock;
2197         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2198                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2199                 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2200                                         dm->syncid);
2201                 goto out_unlock;
2202         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2203                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2204                 ret = stop_sync_thread(net, dm->state);
2205                 goto out_unlock;
2206         }
2207
2208         usvc_compat = (struct ip_vs_service_user *)arg;
2209         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2210
2211         /* We only use the new structs internally, so copy userspace compat
2212          * structs to extended internal versions */
2213         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2214         ip_vs_copy_udest_compat(&udest, udest_compat);
2215
2216         if (cmd == IP_VS_SO_SET_ZERO) {
2217                 /* if no service address is set, zero counters in all */
2218                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2219                         ret = ip_vs_zero_all(net);
2220                         goto out_unlock;
2221                 }
2222         }
2223
2224         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2225         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2226             usvc.protocol != IPPROTO_SCTP) {
2227                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2228                        usvc.protocol, &usvc.addr.ip,
2229                        ntohs(usvc.port), usvc.sched_name);
2230                 ret = -EFAULT;
2231                 goto out_unlock;
2232         }
2233
2234         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2235         if (usvc.fwmark == 0)
2236                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2237                                            &usvc.addr, usvc.port);
2238         else
2239                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2240
2241         if (cmd != IP_VS_SO_SET_ADD
2242             && (svc == NULL || svc->protocol != usvc.protocol)) {
2243                 ret = -ESRCH;
2244                 goto out_unlock;
2245         }
2246
2247         switch (cmd) {
2248         case IP_VS_SO_SET_ADD:
2249                 if (svc != NULL)
2250                         ret = -EEXIST;
2251                 else
2252                         ret = ip_vs_add_service(net, &usvc, &svc);
2253                 break;
2254         case IP_VS_SO_SET_EDIT:
2255                 ret = ip_vs_edit_service(svc, &usvc);
2256                 break;
2257         case IP_VS_SO_SET_DEL:
2258                 ret = ip_vs_del_service(svc);
2259                 if (!ret)
2260                         goto out_unlock;
2261                 break;
2262         case IP_VS_SO_SET_ZERO:
2263                 ret = ip_vs_zero_service(svc);
2264                 break;
2265         case IP_VS_SO_SET_ADDDEST:
2266                 ret = ip_vs_add_dest(svc, &udest);
2267                 break;
2268         case IP_VS_SO_SET_EDITDEST:
2269                 ret = ip_vs_edit_dest(svc, &udest);
2270                 break;
2271         case IP_VS_SO_SET_DELDEST:
2272                 ret = ip_vs_del_dest(svc, &udest);
2273                 break;
2274         default:
2275                 ret = -EINVAL;
2276         }
2277
2278   out_unlock:
2279         mutex_unlock(&__ip_vs_mutex);
2280   out_dec:
2281         /* decrease the module use count */
2282         ip_vs_use_count_dec();
2283
2284         return ret;
2285 }
2286
2287
2288 static void
2289 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2290 {
2291         spin_lock_bh(&src->lock);
2292         memcpy(dst, &src->ustats, sizeof(*dst));
2293         spin_unlock_bh(&src->lock);
2294 }
2295
2296 static void
2297 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2298 {
2299         dst->protocol = src->protocol;
2300         dst->addr = src->addr.ip;
2301         dst->port = src->port;
2302         dst->fwmark = src->fwmark;
2303         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2304         dst->flags = src->flags;
2305         dst->timeout = src->timeout / HZ;
2306         dst->netmask = src->netmask;
2307         dst->num_dests = src->num_dests;
2308         ip_vs_copy_stats(&dst->stats, &src->stats);
2309 }
2310
2311 static inline int
2312 __ip_vs_get_service_entries(struct net *net,
2313                             const struct ip_vs_get_services *get,
2314                             struct ip_vs_get_services __user *uptr)
2315 {
2316         int idx, count=0;
2317         struct ip_vs_service *svc;
2318         struct ip_vs_service_entry entry;
2319         int ret = 0;
2320
2321         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2322                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2323                         /* Only expose IPv4 entries to old interface */
2324                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2325                                 continue;
2326
2327                         if (count >= get->num_services)
2328                                 goto out;
2329                         memset(&entry, 0, sizeof(entry));
2330                         ip_vs_copy_service(&entry, svc);
2331                         if (copy_to_user(&uptr->entrytable[count],
2332                                          &entry, sizeof(entry))) {
2333                                 ret = -EFAULT;
2334                                 goto out;
2335                         }
2336                         count++;
2337                 }
2338         }
2339
2340         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2341                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2342                         /* Only expose IPv4 entries to old interface */
2343                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2344                                 continue;
2345
2346                         if (count >= get->num_services)
2347                                 goto out;
2348                         memset(&entry, 0, sizeof(entry));
2349                         ip_vs_copy_service(&entry, svc);
2350                         if (copy_to_user(&uptr->entrytable[count],
2351                                          &entry, sizeof(entry))) {
2352                                 ret = -EFAULT;
2353                                 goto out;
2354                         }
2355                         count++;
2356                 }
2357         }
2358   out:
2359         return ret;
2360 }
2361
2362 static inline int
2363 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2364                          struct ip_vs_get_dests __user *uptr)
2365 {
2366         struct ip_vs_service *svc;
2367         union nf_inet_addr addr = { .ip = get->addr };
2368         int ret = 0;
2369
2370         if (get->fwmark)
2371                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2372         else
2373                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2374                                            get->port);
2375
2376         if (svc) {
2377                 int count = 0;
2378                 struct ip_vs_dest *dest;
2379                 struct ip_vs_dest_entry entry;
2380
2381                 list_for_each_entry(dest, &svc->destinations, n_list) {
2382                         if (count >= get->num_dests)
2383                                 break;
2384
2385                         entry.addr = dest->addr.ip;
2386                         entry.port = dest->port;
2387                         entry.conn_flags = atomic_read(&dest->conn_flags);
2388                         entry.weight = atomic_read(&dest->weight);
2389                         entry.u_threshold = dest->u_threshold;
2390                         entry.l_threshold = dest->l_threshold;
2391                         entry.activeconns = atomic_read(&dest->activeconns);
2392                         entry.inactconns = atomic_read(&dest->inactconns);
2393                         entry.persistconns = atomic_read(&dest->persistconns);
2394                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2395                         if (copy_to_user(&uptr->entrytable[count],
2396                                          &entry, sizeof(entry))) {
2397                                 ret = -EFAULT;
2398                                 break;
2399                         }
2400                         count++;
2401                 }
2402         } else
2403                 ret = -ESRCH;
2404         return ret;
2405 }
2406
2407 static inline void
2408 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2409 {
2410 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2411         struct ip_vs_proto_data *pd;
2412 #endif
2413
2414 #ifdef CONFIG_IP_VS_PROTO_TCP
2415         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2416         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2417         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2418 #endif
2419 #ifdef CONFIG_IP_VS_PROTO_UDP
2420         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2421         u->udp_timeout =
2422                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2423 #endif
2424 }
2425
2426
2427 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2428 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2429 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2430 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2431 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2432 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2433 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2434
2435 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2436         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2437         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2438         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2439         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2440         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2441         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2442         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2443 };
2444
2445 static int
2446 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2447 {
2448         unsigned char arg[128];
2449         int ret = 0;
2450         unsigned int copylen;
2451         struct net *net = sock_net(sk);
2452         struct netns_ipvs *ipvs = net_ipvs(net);
2453
2454         BUG_ON(!net);
2455         if (!capable(CAP_NET_ADMIN))
2456                 return -EPERM;
2457
2458         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2459                 return -EINVAL;
2460
2461         if (*len < get_arglen[GET_CMDID(cmd)]) {
2462                 pr_err("get_ctl: len %u < %u\n",
2463                        *len, get_arglen[GET_CMDID(cmd)]);
2464                 return -EINVAL;
2465         }
2466
2467         copylen = get_arglen[GET_CMDID(cmd)];
2468         if (copylen > 128)
2469                 return -EINVAL;
2470
2471         if (copy_from_user(arg, user, copylen) != 0)
2472                 return -EFAULT;
2473
2474         if (mutex_lock_interruptible(&__ip_vs_mutex))
2475                 return -ERESTARTSYS;
2476
2477         switch (cmd) {
2478         case IP_VS_SO_GET_VERSION:
2479         {
2480                 char buf[64];
2481
2482                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2483                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2484                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2485                         ret = -EFAULT;
2486                         goto out;
2487                 }
2488                 *len = strlen(buf)+1;
2489         }
2490         break;
2491
2492         case IP_VS_SO_GET_INFO:
2493         {
2494                 struct ip_vs_getinfo info;
2495                 info.version = IP_VS_VERSION_CODE;
2496                 info.size = ip_vs_conn_tab_size;
2497                 info.num_services = ipvs->num_services;
2498                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2499                         ret = -EFAULT;
2500         }
2501         break;
2502
2503         case IP_VS_SO_GET_SERVICES:
2504         {
2505                 struct ip_vs_get_services *get;
2506                 int size;
2507
2508                 get = (struct ip_vs_get_services *)arg;
2509                 size = sizeof(*get) +
2510                         sizeof(struct ip_vs_service_entry) * get->num_services;
2511                 if (*len != size) {
2512                         pr_err("length: %u != %u\n", *len, size);
2513                         ret = -EINVAL;
2514                         goto out;
2515                 }
2516                 ret = __ip_vs_get_service_entries(net, get, user);
2517         }
2518         break;
2519
2520         case IP_VS_SO_GET_SERVICE:
2521         {
2522                 struct ip_vs_service_entry *entry;
2523                 struct ip_vs_service *svc;
2524                 union nf_inet_addr addr;
2525
2526                 entry = (struct ip_vs_service_entry *)arg;
2527                 addr.ip = entry->addr;
2528                 if (entry->fwmark)
2529                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2530                 else
2531                         svc = __ip_vs_service_find(net, AF_INET,
2532                                                    entry->protocol, &addr,
2533                                                    entry->port);
2534                 if (svc) {
2535                         ip_vs_copy_service(entry, svc);
2536                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2537                                 ret = -EFAULT;
2538                 } else
2539                         ret = -ESRCH;
2540         }
2541         break;
2542
2543         case IP_VS_SO_GET_DESTS:
2544         {
2545                 struct ip_vs_get_dests *get;
2546                 int size;
2547
2548                 get = (struct ip_vs_get_dests *)arg;
2549                 size = sizeof(*get) +
2550                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2551                 if (*len != size) {
2552                         pr_err("length: %u != %u\n", *len, size);
2553                         ret = -EINVAL;
2554                         goto out;
2555                 }
2556                 ret = __ip_vs_get_dest_entries(net, get, user);
2557         }
2558         break;
2559
2560         case IP_VS_SO_GET_TIMEOUT:
2561         {
2562                 struct ip_vs_timeout_user t;
2563
2564                 __ip_vs_get_timeouts(net, &t);
2565                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2566                         ret = -EFAULT;
2567         }
2568         break;
2569
2570         case IP_VS_SO_GET_DAEMON:
2571         {
2572                 struct ip_vs_daemon_user d[2];
2573
2574                 memset(&d, 0, sizeof(d));
2575                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2576                         d[0].state = IP_VS_STATE_MASTER;
2577                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2578                                 sizeof(d[0].mcast_ifn));
2579                         d[0].syncid = ipvs->master_syncid;
2580                 }
2581                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2582                         d[1].state = IP_VS_STATE_BACKUP;
2583                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2584                                 sizeof(d[1].mcast_ifn));
2585                         d[1].syncid = ipvs->backup_syncid;
2586                 }
2587                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2588                         ret = -EFAULT;
2589         }
2590         break;
2591
2592         default:
2593                 ret = -EINVAL;
2594         }
2595
2596   out:
2597         mutex_unlock(&__ip_vs_mutex);
2598         return ret;
2599 }
2600
2601
2602 static struct nf_sockopt_ops ip_vs_sockopts = {
2603         .pf             = PF_INET,
2604         .set_optmin     = IP_VS_BASE_CTL,
2605         .set_optmax     = IP_VS_SO_SET_MAX+1,
2606         .set            = do_ip_vs_set_ctl,
2607         .get_optmin     = IP_VS_BASE_CTL,
2608         .get_optmax     = IP_VS_SO_GET_MAX+1,
2609         .get            = do_ip_vs_get_ctl,
2610         .owner          = THIS_MODULE,
2611 };
2612
2613 /*
2614  * Generic Netlink interface
2615  */
2616
2617 /* IPVS genetlink family */
2618 static struct genl_family ip_vs_genl_family = {
2619         .id             = GENL_ID_GENERATE,
2620         .hdrsize        = 0,
2621         .name           = IPVS_GENL_NAME,
2622         .version        = IPVS_GENL_VERSION,
2623         .maxattr        = IPVS_CMD_MAX,
2624         .netnsok        = true,         /* Make ipvsadm to work on netns */
2625 };
2626
2627 /* Policy used for first-level command attributes */
2628 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2629         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2630         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2631         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2632         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2633         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2634         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2635 };
2636
2637 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2638 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2639         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2640         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2641                                             .len = IP_VS_IFNAME_MAXLEN },
2642         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2643 };
2644
2645 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2646 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2647         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2648         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2649         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2650                                             .len = sizeof(union nf_inet_addr) },
2651         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2652         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2653         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2654                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2655         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2656                                             .len = IP_VS_PENAME_MAXLEN },
2657         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2658                                             .len = sizeof(struct ip_vs_flags) },
2659         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2660         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2661         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2662 };
2663
2664 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2665 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2666         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2667                                             .len = sizeof(union nf_inet_addr) },
2668         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2669         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2670         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2671         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2672         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2673         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2674         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2675         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2676         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2677 };
2678
2679 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2680                                  struct ip_vs_stats *stats)
2681 {
2682         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2683         if (!nl_stats)
2684                 return -EMSGSIZE;
2685
2686         spin_lock_bh(&stats->lock);
2687
2688         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
2689         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
2690         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
2691         NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
2692         NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
2693         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
2694         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
2695         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
2696         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
2697         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
2698
2699         spin_unlock_bh(&stats->lock);
2700
2701         nla_nest_end(skb, nl_stats);
2702
2703         return 0;
2704
2705 nla_put_failure:
2706         spin_unlock_bh(&stats->lock);
2707         nla_nest_cancel(skb, nl_stats);
2708         return -EMSGSIZE;
2709 }
2710
2711 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2712                                    struct ip_vs_service *svc)
2713 {
2714         struct nlattr *nl_service;
2715         struct ip_vs_flags flags = { .flags = svc->flags,
2716                                      .mask = ~0 };
2717
2718         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2719         if (!nl_service)
2720                 return -EMSGSIZE;
2721
2722         NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2723
2724         if (svc->fwmark) {
2725                 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2726         } else {
2727                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2728                 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2729                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2730         }
2731
2732         NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2733         if (svc->pe)
2734                 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
2735         NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2736         NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2737         NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2738
2739         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2740                 goto nla_put_failure;
2741
2742         nla_nest_end(skb, nl_service);
2743
2744         return 0;
2745
2746 nla_put_failure:
2747         nla_nest_cancel(skb, nl_service);
2748         return -EMSGSIZE;
2749 }
2750
2751 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2752                                    struct ip_vs_service *svc,
2753                                    struct netlink_callback *cb)
2754 {
2755         void *hdr;
2756
2757         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2758                           &ip_vs_genl_family, NLM_F_MULTI,
2759                           IPVS_CMD_NEW_SERVICE);
2760         if (!hdr)
2761                 return -EMSGSIZE;
2762
2763         if (ip_vs_genl_fill_service(skb, svc) < 0)
2764                 goto nla_put_failure;
2765
2766         return genlmsg_end(skb, hdr);
2767
2768 nla_put_failure:
2769         genlmsg_cancel(skb, hdr);
2770         return -EMSGSIZE;
2771 }
2772
2773 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2774                                     struct netlink_callback *cb)
2775 {
2776         int idx = 0, i;
2777         int start = cb->args[0];
2778         struct ip_vs_service *svc;
2779         struct net *net = skb_sknet(skb);
2780
2781         mutex_lock(&__ip_vs_mutex);
2782         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2783                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2784                         if (++idx <= start || !net_eq(svc->net, net))
2785                                 continue;
2786                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2787                                 idx--;
2788                                 goto nla_put_failure;
2789                         }
2790                 }
2791         }
2792
2793         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2794                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2795                         if (++idx <= start || !net_eq(svc->net, net))
2796                                 continue;
2797                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2798                                 idx--;
2799                                 goto nla_put_failure;
2800                         }
2801                 }
2802         }
2803
2804 nla_put_failure:
2805         mutex_unlock(&__ip_vs_mutex);
2806         cb->args[0] = idx;
2807
2808         return skb->len;
2809 }
2810
2811 static int ip_vs_genl_parse_service(struct net *net,
2812                                     struct ip_vs_service_user_kern *usvc,
2813                                     struct nlattr *nla, int full_entry,
2814                                     struct ip_vs_service **ret_svc)
2815 {
2816         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2817         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2818         struct ip_vs_service *svc;
2819
2820         /* Parse mandatory identifying service fields first */
2821         if (nla == NULL ||
2822             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2823                 return -EINVAL;
2824
2825         nla_af          = attrs[IPVS_SVC_ATTR_AF];
2826         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
2827         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
2828         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
2829         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
2830
2831         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2832                 return -EINVAL;
2833
2834         memset(usvc, 0, sizeof(*usvc));
2835
2836         usvc->af = nla_get_u16(nla_af);
2837 #ifdef CONFIG_IP_VS_IPV6
2838         if (usvc->af != AF_INET && usvc->af != AF_INET6)
2839 #else
2840         if (usvc->af != AF_INET)
2841 #endif
2842                 return -EAFNOSUPPORT;
2843
2844         if (nla_fwmark) {
2845                 usvc->protocol = IPPROTO_TCP;
2846                 usvc->fwmark = nla_get_u32(nla_fwmark);
2847         } else {
2848                 usvc->protocol = nla_get_u16(nla_protocol);
2849                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2850                 usvc->port = nla_get_u16(nla_port);
2851                 usvc->fwmark = 0;
2852         }
2853
2854         if (usvc->fwmark)
2855                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
2856         else
2857                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
2858                                            &usvc->addr, usvc->port);
2859         *ret_svc = svc;
2860
2861         /* If a full entry was requested, check for the additional fields */
2862         if (full_entry) {
2863                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
2864                               *nla_netmask;
2865                 struct ip_vs_flags flags;
2866
2867                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2868                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
2869                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2870                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2871                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2872
2873                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2874                         return -EINVAL;
2875
2876                 nla_memcpy(&flags, nla_flags, sizeof(flags));
2877
2878                 /* prefill flags from service if it already exists */
2879                 if (svc)
2880                         usvc->flags = svc->flags;
2881
2882                 /* set new flags from userland */
2883                 usvc->flags = (usvc->flags & ~flags.mask) |
2884                               (flags.flags & flags.mask);
2885                 usvc->sched_name = nla_data(nla_sched);
2886                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
2887                 usvc->timeout = nla_get_u32(nla_timeout);
2888                 usvc->netmask = nla_get_u32(nla_netmask);
2889         }
2890
2891         return 0;
2892 }
2893
2894 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
2895                                                      struct nlattr *nla)
2896 {
2897         struct ip_vs_service_user_kern usvc;
2898         struct ip_vs_service *svc;
2899         int ret;
2900
2901         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
2902         return ret ? ERR_PTR(ret) : svc;
2903 }
2904
2905 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
2906 {
2907         struct nlattr *nl_dest;
2908
2909         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
2910         if (!nl_dest)
2911                 return -EMSGSIZE;
2912
2913         NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
2914         NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
2915
2916         NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
2917                     atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
2918         NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
2919         NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
2920         NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
2921         NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
2922                     atomic_read(&dest->activeconns));
2923         NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
2924                     atomic_read(&dest->inactconns));
2925         NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
2926                     atomic_read(&dest->persistconns));
2927
2928         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
2929                 goto nla_put_failure;
2930
2931         nla_nest_end(skb, nl_dest);
2932
2933         return 0;
2934
2935 nla_put_failure:
2936         nla_nest_cancel(skb, nl_dest);
2937         return -EMSGSIZE;
2938 }
2939
2940 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2941                                 struct netlink_callback *cb)
2942 {
2943         void *hdr;
2944
2945         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2946                           &ip_vs_genl_family, NLM_F_MULTI,
2947                           IPVS_CMD_NEW_DEST);
2948         if (!hdr)
2949                 return -EMSGSIZE;
2950
2951         if (ip_vs_genl_fill_dest(skb, dest) < 0)
2952                 goto nla_put_failure;
2953
2954         return genlmsg_end(skb, hdr);
2955
2956 nla_put_failure:
2957         genlmsg_cancel(skb, hdr);
2958         return -EMSGSIZE;
2959 }
2960
2961 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2962                                  struct netlink_callback *cb)
2963 {
2964         int idx = 0;
2965         int start = cb->args[0];
2966         struct ip_vs_service *svc;
2967         struct ip_vs_dest *dest;
2968         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2969         struct net *net = skb_sknet(skb);
2970
2971         mutex_lock(&__ip_vs_mutex);
2972
2973         /* Try to find the service for which to dump destinations */
2974         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
2975                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
2976                 goto out_err;
2977
2978
2979         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
2980         if (IS_ERR(svc) || svc == NULL)
2981                 goto out_err;
2982
2983         /* Dump the destinations */
2984         list_for_each_entry(dest, &svc->destinations, n_list) {
2985                 if (++idx <= start)
2986                         continue;
2987                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
2988                         idx--;
2989                         goto nla_put_failure;
2990                 }
2991         }
2992
2993 nla_put_failure:
2994         cb->args[0] = idx;
2995
2996 out_err:
2997         mutex_unlock(&__ip_vs_mutex);
2998
2999         return skb->len;
3000 }
3001
3002 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3003                                  struct nlattr *nla, int full_entry)
3004 {
3005         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3006         struct nlattr *nla_addr, *nla_port;
3007
3008         /* Parse mandatory identifying destination fields first */
3009         if (nla == NULL ||
3010             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3011                 return -EINVAL;
3012
3013         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3014         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3015
3016         if (!(nla_addr && nla_port))
3017                 return -EINVAL;
3018
3019         memset(udest, 0, sizeof(*udest));
3020
3021         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3022         udest->port = nla_get_u16(nla_port);
3023
3024         /* If a full entry was requested, check for the additional fields */
3025         if (full_entry) {
3026                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3027                               *nla_l_thresh;
3028
3029                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3030                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3031                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3032                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3033
3034                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3035                         return -EINVAL;
3036
3037                 udest->conn_flags = nla_get_u32(nla_fwd)
3038                                     & IP_VS_CONN_F_FWD_MASK;
3039                 udest->weight = nla_get_u32(nla_weight);
3040                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3041                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3042         }
3043
3044         return 0;
3045 }
3046
3047 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3048                                   const char *mcast_ifn, __be32 syncid)
3049 {
3050         struct nlattr *nl_daemon;
3051
3052         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3053         if (!nl_daemon)
3054                 return -EMSGSIZE;
3055
3056         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
3057         NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
3058         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
3059
3060         nla_nest_end(skb, nl_daemon);
3061
3062         return 0;
3063
3064 nla_put_failure:
3065         nla_nest_cancel(skb, nl_daemon);
3066         return -EMSGSIZE;
3067 }
3068
3069 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3070                                   const char *mcast_ifn, __be32 syncid,
3071                                   struct netlink_callback *cb)
3072 {
3073         void *hdr;
3074         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
3075                           &ip_vs_genl_family, NLM_F_MULTI,
3076                           IPVS_CMD_NEW_DAEMON);
3077         if (!hdr)
3078                 return -EMSGSIZE;
3079
3080         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3081                 goto nla_put_failure;
3082
3083         return genlmsg_end(skb, hdr);
3084
3085 nla_put_failure:
3086         genlmsg_cancel(skb, hdr);
3087         return -EMSGSIZE;
3088 }
3089
3090 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3091                                    struct netlink_callback *cb)
3092 {
3093         struct net *net = skb_net(skb);
3094         struct netns_ipvs *ipvs = net_ipvs(net);
3095
3096         mutex_lock(&__ip_vs_mutex);
3097         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3098                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3099                                            ipvs->master_mcast_ifn,
3100                                            ipvs->master_syncid, cb) < 0)
3101                         goto nla_put_failure;
3102
3103                 cb->args[0] = 1;
3104         }
3105
3106         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3107                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3108                                            ipvs->backup_mcast_ifn,
3109                                            ipvs->backup_syncid, cb) < 0)
3110                         goto nla_put_failure;
3111
3112                 cb->args[1] = 1;
3113         }
3114
3115 nla_put_failure:
3116         mutex_unlock(&__ip_vs_mutex);
3117
3118         return skb->len;
3119 }
3120
3121 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3122 {
3123         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3124               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3125               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3126                 return -EINVAL;
3127
3128         return start_sync_thread(net,
3129                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3130                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3131                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3132 }
3133
3134 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3135 {
3136         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3137                 return -EINVAL;
3138
3139         return stop_sync_thread(net,
3140                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3141 }
3142
3143 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3144 {
3145         struct ip_vs_timeout_user t;
3146
3147         __ip_vs_get_timeouts(net, &t);
3148
3149         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3150                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3151
3152         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3153                 t.tcp_fin_timeout =
3154                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3155
3156         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3157                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3158
3159         return ip_vs_set_timeout(net, &t);
3160 }
3161
3162 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3163 {
3164         struct ip_vs_service *svc = NULL;
3165         struct ip_vs_service_user_kern usvc;
3166         struct ip_vs_dest_user_kern udest;
3167         int ret = 0, cmd;
3168         int need_full_svc = 0, need_full_dest = 0;
3169         struct net *net;
3170         struct netns_ipvs *ipvs;
3171
3172         net = skb_sknet(skb);
3173         ipvs = net_ipvs(net);
3174         cmd = info->genlhdr->cmd;
3175
3176         mutex_lock(&__ip_vs_mutex);
3177
3178         if (cmd == IPVS_CMD_FLUSH) {
3179                 ret = ip_vs_flush(net);
3180                 goto out;
3181         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3182                 ret = ip_vs_genl_set_config(net, info->attrs);
3183                 goto out;
3184         } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3185                    cmd == IPVS_CMD_DEL_DAEMON) {
3186
3187                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3188
3189                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3190                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3191                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3192                                      ip_vs_daemon_policy)) {
3193                         ret = -EINVAL;
3194                         goto out;
3195                 }
3196
3197                 if (cmd == IPVS_CMD_NEW_DAEMON)
3198                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3199                 else
3200                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3201                 goto out;
3202         } else if (cmd == IPVS_CMD_ZERO &&
3203                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3204                 ret = ip_vs_zero_all(net);
3205                 goto out;
3206         }
3207
3208         /* All following commands require a service argument, so check if we
3209          * received a valid one. We need a full service specification when
3210          * adding / editing a service. Only identifying members otherwise. */
3211         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3212                 need_full_svc = 1;
3213
3214         ret = ip_vs_genl_parse_service(net, &usvc,
3215                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3216                                        need_full_svc, &svc);
3217         if (ret)
3218                 goto out;
3219
3220         /* Unless we're adding a new service, the service must already exist */
3221         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3222                 ret = -ESRCH;
3223                 goto out;
3224         }
3225
3226         /* Destination commands require a valid destination argument. For
3227          * adding / editing a destination, we need a full destination
3228          * specification. */
3229         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3230             cmd == IPVS_CMD_DEL_DEST) {
3231                 if (cmd != IPVS_CMD_DEL_DEST)
3232                         need_full_dest = 1;
3233
3234                 ret = ip_vs_genl_parse_dest(&udest,
3235                                             info->attrs[IPVS_CMD_ATTR_DEST],
3236                                             need_full_dest);
3237                 if (ret)
3238                         goto out;
3239         }
3240
3241         switch (cmd) {
3242         case IPVS_CMD_NEW_SERVICE:
3243                 if (svc == NULL)
3244                         ret = ip_vs_add_service(net, &usvc, &svc);
3245                 else
3246                         ret = -EEXIST;
3247                 break;
3248         case IPVS_CMD_SET_SERVICE:
3249                 ret = ip_vs_edit_service(svc, &usvc);
3250                 break;
3251         case IPVS_CMD_DEL_SERVICE:
3252                 ret = ip_vs_del_service(svc);
3253                 /* do not use svc, it can be freed */
3254                 break;
3255         case IPVS_CMD_NEW_DEST:
3256                 ret = ip_vs_add_dest(svc, &udest);
3257                 break;
3258         case IPVS_CMD_SET_DEST:
3259                 ret = ip_vs_edit_dest(svc, &udest);
3260                 break;
3261         case IPVS_CMD_DEL_DEST:
3262                 ret = ip_vs_del_dest(svc, &udest);
3263                 break;
3264         case IPVS_CMD_ZERO:
3265                 ret = ip_vs_zero_service(svc);
3266                 break;
3267         default:
3268                 ret = -EINVAL;
3269         }
3270
3271 out:
3272         mutex_unlock(&__ip_vs_mutex);
3273
3274         return ret;
3275 }
3276
3277 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3278 {
3279         struct sk_buff *msg;
3280         void *reply;
3281         int ret, cmd, reply_cmd;
3282         struct net *net;
3283         struct netns_ipvs *ipvs;
3284
3285         net = skb_sknet(skb);
3286         ipvs = net_ipvs(net);
3287         cmd = info->genlhdr->cmd;
3288
3289         if (cmd == IPVS_CMD_GET_SERVICE)
3290                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3291         else if (cmd == IPVS_CMD_GET_INFO)
3292                 reply_cmd = IPVS_CMD_SET_INFO;
3293         else if (cmd == IPVS_CMD_GET_CONFIG)
3294                 reply_cmd = IPVS_CMD_SET_CONFIG;
3295         else {
3296                 pr_err("unknown Generic Netlink command\n");
3297                 return -EINVAL;
3298         }
3299
3300         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3301         if (!msg)
3302                 return -ENOMEM;
3303
3304         mutex_lock(&__ip_vs_mutex);
3305
3306         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3307         if (reply == NULL)
3308                 goto nla_put_failure;
3309
3310         switch (cmd) {
3311         case IPVS_CMD_GET_SERVICE:
3312         {
3313                 struct ip_vs_service *svc;
3314
3315                 svc = ip_vs_genl_find_service(net,
3316                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3317                 if (IS_ERR(svc)) {
3318                         ret = PTR_ERR(svc);
3319                         goto out_err;
3320                 } else if (svc) {
3321                         ret = ip_vs_genl_fill_service(msg, svc);
3322                         if (ret)
3323                                 goto nla_put_failure;
3324                 } else {
3325                         ret = -ESRCH;
3326                         goto out_err;
3327                 }
3328
3329                 break;
3330         }
3331
3332         case IPVS_CMD_GET_CONFIG:
3333         {
3334                 struct ip_vs_timeout_user t;
3335
3336                 __ip_vs_get_timeouts(net, &t);
3337 #ifdef CONFIG_IP_VS_PROTO_TCP
3338                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3339                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3340                             t.tcp_fin_timeout);
3341 #endif
3342 #ifdef CONFIG_IP_VS_PROTO_UDP
3343                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
3344 #endif
3345
3346                 break;
3347         }
3348
3349         case IPVS_CMD_GET_INFO:
3350                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
3351                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3352                             ip_vs_conn_tab_size);
3353                 break;
3354         }
3355
3356         genlmsg_end(msg, reply);
3357         ret = genlmsg_reply(msg, info);
3358         goto out;
3359
3360 nla_put_failure:
3361         pr_err("not enough space in Netlink message\n");
3362         ret = -EMSGSIZE;
3363
3364 out_err:
3365         nlmsg_free(msg);
3366 out:
3367         mutex_unlock(&__ip_vs_mutex);
3368
3369         return ret;
3370 }
3371
3372
3373 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3374         {
3375                 .cmd    = IPVS_CMD_NEW_SERVICE,
3376                 .flags  = GENL_ADMIN_PERM,
3377                 .policy = ip_vs_cmd_policy,
3378                 .doit   = ip_vs_genl_set_cmd,
3379         },
3380         {
3381                 .cmd    = IPVS_CMD_SET_SERVICE,
3382                 .flags  = GENL_ADMIN_PERM,
3383                 .policy = ip_vs_cmd_policy,
3384                 .doit   = ip_vs_genl_set_cmd,
3385         },
3386         {
3387                 .cmd    = IPVS_CMD_DEL_SERVICE,
3388                 .flags  = GENL_ADMIN_PERM,
3389                 .policy = ip_vs_cmd_policy,
3390                 .doit   = ip_vs_genl_set_cmd,
3391         },
3392         {
3393                 .cmd    = IPVS_CMD_GET_SERVICE,
3394                 .flags  = GENL_ADMIN_PERM,
3395                 .doit   = ip_vs_genl_get_cmd,
3396                 .dumpit = ip_vs_genl_dump_services,
3397                 .policy = ip_vs_cmd_policy,
3398         },
3399         {
3400                 .cmd    = IPVS_CMD_NEW_DEST,
3401                 .flags  = GENL_ADMIN_PERM,
3402                 .policy = ip_vs_cmd_policy,
3403                 .doit   = ip_vs_genl_set_cmd,
3404         },
3405         {
3406                 .cmd    = IPVS_CMD_SET_DEST,
3407                 .flags  = GENL_ADMIN_PERM,
3408                 .policy = ip_vs_cmd_policy,
3409                 .doit   = ip_vs_genl_set_cmd,
3410         },
3411         {
3412                 .cmd    = IPVS_CMD_DEL_DEST,
3413                 .flags  = GENL_ADMIN_PERM,
3414                 .policy = ip_vs_cmd_policy,
3415                 .doit   = ip_vs_genl_set_cmd,
3416         },
3417         {
3418                 .cmd    = IPVS_CMD_GET_DEST,
3419                 .flags  = GENL_ADMIN_PERM,
3420                 .policy = ip_vs_cmd_policy,
3421                 .dumpit = ip_vs_genl_dump_dests,
3422         },
3423         {
3424                 .cmd    = IPVS_CMD_NEW_DAEMON,
3425                 .flags  = GENL_ADMIN_PERM,
3426                 .policy = ip_vs_cmd_policy,
3427                 .doit   = ip_vs_genl_set_cmd,
3428         },
3429         {
3430                 .cmd    = IPVS_CMD_DEL_DAEMON,
3431                 .flags  = GENL_ADMIN_PERM,
3432                 .policy = ip_vs_cmd_policy,
3433                 .doit   = ip_vs_genl_set_cmd,
3434         },
3435         {
3436                 .cmd    = IPVS_CMD_GET_DAEMON,
3437                 .flags  = GENL_ADMIN_PERM,
3438                 .dumpit = ip_vs_genl_dump_daemons,
3439         },
3440         {
3441                 .cmd    = IPVS_CMD_SET_CONFIG,
3442                 .flags  = GENL_ADMIN_PERM,
3443                 .policy = ip_vs_cmd_policy,
3444                 .doit   = ip_vs_genl_set_cmd,
3445         },
3446         {
3447                 .cmd    = IPVS_CMD_GET_CONFIG,
3448                 .flags  = GENL_ADMIN_PERM,
3449                 .doit   = ip_vs_genl_get_cmd,
3450         },
3451         {
3452                 .cmd    = IPVS_CMD_GET_INFO,
3453                 .flags  = GENL_ADMIN_PERM,
3454                 .doit   = ip_vs_genl_get_cmd,
3455         },
3456         {
3457                 .cmd    = IPVS_CMD_ZERO,
3458                 .flags  = GENL_ADMIN_PERM,
3459                 .policy = ip_vs_cmd_policy,
3460                 .doit   = ip_vs_genl_set_cmd,
3461         },
3462         {
3463                 .cmd    = IPVS_CMD_FLUSH,
3464                 .flags  = GENL_ADMIN_PERM,
3465                 .doit   = ip_vs_genl_set_cmd,
3466         },
3467 };
3468
3469 static int __init ip_vs_genl_register(void)
3470 {
3471         return genl_register_family_with_ops(&ip_vs_genl_family,
3472                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3473 }
3474
3475 static void ip_vs_genl_unregister(void)
3476 {
3477         genl_unregister_family(&ip_vs_genl_family);
3478 }
3479
3480 /* End of Generic Netlink interface definitions */
3481
3482 /*
3483  * per netns init/exit func.
3484  */
3485 int __net_init __ip_vs_control_init(struct net *net)
3486 {
3487         int idx;
3488         struct netns_ipvs *ipvs = net_ipvs(net);
3489         struct ctl_table *tbl;
3490
3491         atomic_set(&ipvs->dropentry, 0);
3492         spin_lock_init(&ipvs->dropentry_lock);
3493         spin_lock_init(&ipvs->droppacket_lock);
3494         spin_lock_init(&ipvs->securetcp_lock);
3495         ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
3496
3497         /* Initialize rs_table */
3498         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3499                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3500
3501         INIT_LIST_HEAD(&ipvs->dest_trash);
3502         atomic_set(&ipvs->ftpsvc_counter, 0);
3503         atomic_set(&ipvs->nullsvc_counter, 0);
3504
3505         /* procfs stats */
3506         ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
3507         if (ipvs->tot_stats == NULL) {
3508                 pr_err("%s(): no memory.\n", __func__);
3509                 return -ENOMEM;
3510         }
3511         ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3512         if (!ipvs->cpustats) {
3513                 pr_err("%s() alloc_percpu failed\n", __func__);
3514                 goto err_alloc;
3515         }
3516         spin_lock_init(&ipvs->tot_stats->lock);
3517
3518         proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
3519         proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
3520         proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
3521                              &ip_vs_stats_percpu_fops);
3522
3523         if (!net_eq(net, &init_net)) {
3524                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3525                 if (tbl == NULL)
3526                         goto err_dup;
3527         } else
3528                 tbl = vs_vars;
3529         /* Initialize sysctl defaults */
3530         idx = 0;
3531         ipvs->sysctl_amemthresh = 1024;
3532         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3533         ipvs->sysctl_am_droprate = 10;
3534         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3535         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3536         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3537 #ifdef CONFIG_IP_VS_NFCT
3538         tbl[idx++].data = &ipvs->sysctl_conntrack;
3539 #endif
3540         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3541         ipvs->sysctl_snat_reroute = 1;
3542         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3543         ipvs->sysctl_sync_ver = 1;
3544         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3545         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3546         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3547         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3548         ipvs->sysctl_sync_threshold[0] = 3;
3549         ipvs->sysctl_sync_threshold[1] = 50;
3550         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3551         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3552         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3553
3554
3555 #ifdef CONFIG_SYSCTL
3556         ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
3557                                                      tbl);
3558         if (ipvs->sysctl_hdr == NULL) {
3559                 if (!net_eq(net, &init_net))
3560                         kfree(tbl);
3561                 goto err_dup;
3562         }
3563 #endif
3564         ip_vs_new_estimator(net, ipvs->tot_stats);
3565         ipvs->sysctl_tbl = tbl;
3566         /* Schedule defense work */
3567         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3568         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3569         return 0;
3570
3571 err_dup:
3572         free_percpu(ipvs->cpustats);
3573 err_alloc:
3574         kfree(ipvs->tot_stats);
3575         return -ENOMEM;
3576 }
3577
3578 static void __net_exit __ip_vs_control_cleanup(struct net *net)
3579 {
3580         struct netns_ipvs *ipvs = net_ipvs(net);
3581
3582         ip_vs_trash_cleanup(net);
3583         ip_vs_kill_estimator(net, ipvs->tot_stats);
3584         cancel_delayed_work_sync(&ipvs->defense_work);
3585         cancel_work_sync(&ipvs->defense_work.work);
3586 #ifdef CONFIG_SYSCTL
3587         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3588 #endif
3589         proc_net_remove(net, "ip_vs_stats_percpu");
3590         proc_net_remove(net, "ip_vs_stats");
3591         proc_net_remove(net, "ip_vs");
3592         free_percpu(ipvs->cpustats);
3593         kfree(ipvs->tot_stats);
3594 }
3595
3596 static struct pernet_operations ipvs_control_ops = {
3597         .init = __ip_vs_control_init,
3598         .exit = __ip_vs_control_cleanup,
3599 };
3600
3601 int __init ip_vs_control_init(void)
3602 {
3603         int idx;
3604         int ret;
3605
3606         EnterFunction(2);
3607
3608         /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3609         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
3610                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3611                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3612         }
3613
3614         ret = register_pernet_subsys(&ipvs_control_ops);
3615         if (ret) {
3616                 pr_err("cannot register namespace.\n");
3617                 goto err;
3618         }
3619
3620         smp_wmb();      /* Do we really need it now ? */
3621
3622         ret = nf_register_sockopt(&ip_vs_sockopts);
3623         if (ret) {
3624                 pr_err("cannot register sockopt.\n");
3625                 goto err_net;
3626         }
3627
3628         ret = ip_vs_genl_register();
3629         if (ret) {
3630                 pr_err("cannot register Generic Netlink interface.\n");
3631                 nf_unregister_sockopt(&ip_vs_sockopts);
3632                 goto err_net;
3633         }
3634
3635         LeaveFunction(2);
3636         return 0;
3637
3638 err_net:
3639         unregister_pernet_subsys(&ipvs_control_ops);
3640 err:
3641         return ret;
3642 }
3643
3644
3645 void ip_vs_control_cleanup(void)
3646 {
3647         EnterFunction(2);
3648         unregister_pernet_subsys(&ipvs_control_ops);
3649         ip_vs_genl_unregister();
3650         nf_unregister_sockopt(&ip_vs_sockopts);
3651         LeaveFunction(2);
3652 }