ipvs: optimize rates reading
[pandora-kernel.git] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
55 /* Mutex for IPVS sockopts. And, [gs]etsockopt may sleep. */
56 static DEFINE_MUTEX(__ip_vs_mutex);
57
58 /* Reader/writer lock for the virtual service tables and their lookups. */
59 static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
/* Current debug level (a sysctl variable); statically zero-initialized. */
static int sysctl_ip_vs_debug_level;

/* Return the current value of sysctl_ip_vs_debug_level. */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
70 #endif
71
72 #ifdef CONFIG_IP_VS_IPV6
73 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
74 static int __ip_vs_addr_is_local_v6(struct net *net,
75                                     const struct in6_addr *addr)
76 {
77         struct rt6_info *rt;
78         struct flowi fl = {
79                 .oif = 0,
80                 .fl6_dst = *addr,
81                 .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
82         };
83
84         rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
85         if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
86                         return 1;
87
88         return 0;
89 }
90 #endif
91 /*
92  *      update_defense_level is called from keventd and from sysctl,
93  *      so it needs to protect itself from softirqs
94  */
95 static void update_defense_level(struct netns_ipvs *ipvs)
96 {
97         struct sysinfo i;
98         static int old_secure_tcp = 0;
99         int availmem;
100         int nomem;
101         int to_change = -1;
102
103         /* we only count free and buffered memory (in pages) */
104         si_meminfo(&i);
105         availmem = i.freeram + i.bufferram;
106         /* however in linux 2.5 the i.bufferram is total page cache size,
107            we need adjust it */
108         /* si_swapinfo(&i); */
109         /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111         nomem = (availmem < ipvs->sysctl_amemthresh);
112
113         local_bh_disable();
114
115         /* drop_entry */
116         spin_lock(&ipvs->dropentry_lock);
117         switch (ipvs->sysctl_drop_entry) {
118         case 0:
119                 atomic_set(&ipvs->dropentry, 0);
120                 break;
121         case 1:
122                 if (nomem) {
123                         atomic_set(&ipvs->dropentry, 1);
124                         ipvs->sysctl_drop_entry = 2;
125                 } else {
126                         atomic_set(&ipvs->dropentry, 0);
127                 }
128                 break;
129         case 2:
130                 if (nomem) {
131                         atomic_set(&ipvs->dropentry, 1);
132                 } else {
133                         atomic_set(&ipvs->dropentry, 0);
134                         ipvs->sysctl_drop_entry = 1;
135                 };
136                 break;
137         case 3:
138                 atomic_set(&ipvs->dropentry, 1);
139                 break;
140         }
141         spin_unlock(&ipvs->dropentry_lock);
142
143         /* drop_packet */
144         spin_lock(&ipvs->droppacket_lock);
145         switch (ipvs->sysctl_drop_packet) {
146         case 0:
147                 ipvs->drop_rate = 0;
148                 break;
149         case 1:
150                 if (nomem) {
151                         ipvs->drop_rate = ipvs->drop_counter
152                                 = ipvs->sysctl_amemthresh /
153                                 (ipvs->sysctl_amemthresh-availmem);
154                         ipvs->sysctl_drop_packet = 2;
155                 } else {
156                         ipvs->drop_rate = 0;
157                 }
158                 break;
159         case 2:
160                 if (nomem) {
161                         ipvs->drop_rate = ipvs->drop_counter
162                                 = ipvs->sysctl_amemthresh /
163                                 (ipvs->sysctl_amemthresh-availmem);
164                 } else {
165                         ipvs->drop_rate = 0;
166                         ipvs->sysctl_drop_packet = 1;
167                 }
168                 break;
169         case 3:
170                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
171                 break;
172         }
173         spin_unlock(&ipvs->droppacket_lock);
174
175         /* secure_tcp */
176         spin_lock(&ipvs->securetcp_lock);
177         switch (ipvs->sysctl_secure_tcp) {
178         case 0:
179                 if (old_secure_tcp >= 2)
180                         to_change = 0;
181                 break;
182         case 1:
183                 if (nomem) {
184                         if (old_secure_tcp < 2)
185                                 to_change = 1;
186                         ipvs->sysctl_secure_tcp = 2;
187                 } else {
188                         if (old_secure_tcp >= 2)
189                                 to_change = 0;
190                 }
191                 break;
192         case 2:
193                 if (nomem) {
194                         if (old_secure_tcp < 2)
195                                 to_change = 1;
196                 } else {
197                         if (old_secure_tcp >= 2)
198                                 to_change = 0;
199                         ipvs->sysctl_secure_tcp = 1;
200                 }
201                 break;
202         case 3:
203                 if (old_secure_tcp < 2)
204                         to_change = 1;
205                 break;
206         }
207         old_secure_tcp = ipvs->sysctl_secure_tcp;
208         if (to_change >= 0)
209                 ip_vs_protocol_timeout_change(ipvs,
210                                               ipvs->sysctl_secure_tcp > 1);
211         spin_unlock(&ipvs->securetcp_lock);
212
213         local_bh_enable();
214 }
215
216
/*
 *      Timer for checking the defense
 *
 *      The period is parenthesized so that it expands as a single value
 *      when used inside a larger expression.
 */
#define DEFENSE_TIMER_PERIOD    (1*HZ)
221
222 static void defense_work_handler(struct work_struct *work)
223 {
224         struct netns_ipvs *ipvs =
225                 container_of(work, struct netns_ipvs, defense_work.work);
226
227         update_defense_level(ipvs);
228         if (atomic_read(&ipvs->dropentry))
229                 ip_vs_random_dropentry(ipvs->net);
230         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
231 }
232
/* Try to take a reference on this module; returns try_module_get()'s result. */
233 int
234 ip_vs_use_count_inc(void)
235 {
236         return try_module_get(THIS_MODULE);
237 }
238
/* Drop a reference on this module via module_put(). */
239 void
240 ip_vs_use_count_dec(void)
241 {
242         module_put(THIS_MODULE);
243 }
244
245
246 /*
247  *      Hash table: for virtual service lookups
 *
 *      Both tables below have IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
 *      buckets; hash values are reduced to a bucket index by masking with
 *      IP_VS_SVC_TAB_MASK.
248  */
249 #define IP_VS_SVC_TAB_BITS 8
250 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
251 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
252
253 /* the service table hashed by <netns, protocol, addr, port> */
254 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
255 /* the service table hashed by <netns, fwmark> */
256 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
257
258
259 /*
260  *      Returns hash value for virtual service
261  */
262 static inline unsigned
263 ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
264                   const union nf_inet_addr *addr, __be16 port)
265 {
266         register unsigned porth = ntohs(port);
267         __be32 addr_fold = addr->ip;
268
269 #ifdef CONFIG_IP_VS_IPV6
270         if (af == AF_INET6)
271                 addr_fold = addr->ip6[0]^addr->ip6[1]^
272                             addr->ip6[2]^addr->ip6[3];
273 #endif
274         addr_fold ^= ((size_t)net>>8);
275
276         return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
277                 & IP_VS_SVC_TAB_MASK;
278 }
279
280 /*
281  *      Returns hash value of fwmark for virtual service lookup
282  */
283 static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
284 {
285         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
286 }
287
288 /*
289  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
290  *      or in the ip_vs_svc_fwm_table by fwmark.
291  *      Should be called with locked tables.
292  */
293 static int ip_vs_svc_hash(struct ip_vs_service *svc)
294 {
295         unsigned hash;
296
297         if (svc->flags & IP_VS_SVC_F_HASHED) {
298                 pr_err("%s(): request for already hashed, called from %pF\n",
299                        __func__, __builtin_return_address(0));
300                 return 0;
301         }
302
303         if (svc->fwmark == 0) {
304                 /*
305                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
306                  */
307                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
308                                          &svc->addr, svc->port);
309                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
310         } else {
311                 /*
312                  *  Hash it by fwmark in svc_fwm_table
313                  */
314                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
315                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
316         }
317
318         svc->flags |= IP_VS_SVC_F_HASHED;
319         /* increase its refcnt because it is referenced by the svc table */
320         atomic_inc(&svc->refcnt);
321         return 1;
322 }
323
324
325 /*
326  *      Unhashes a service from svc_table / svc_fwm_table.
327  *      Should be called with locked tables.
328  */
329 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
330 {
331         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
332                 pr_err("%s(): request for unhash flagged, called from %pF\n",
333                        __func__, __builtin_return_address(0));
334                 return 0;
335         }
336
337         if (svc->fwmark == 0) {
338                 /* Remove it from the svc_table table */
339                 list_del(&svc->s_list);
340         } else {
341                 /* Remove it from the svc_fwm_table table */
342                 list_del(&svc->f_list);
343         }
344
345         svc->flags &= ~IP_VS_SVC_F_HASHED;
346         atomic_dec(&svc->refcnt);
347         return 1;
348 }
349
350
351 /*
352  *      Get service by {netns, proto,addr,port} in the service table.
353  */
354 static inline struct ip_vs_service *
355 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
356                      const union nf_inet_addr *vaddr, __be16 vport)
357 {
358         unsigned hash;
359         struct ip_vs_service *svc;
360
361         /* Check for "full" addressed entries */
362         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
363
364         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
365                 if ((svc->af == af)
366                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
367                     && (svc->port == vport)
368                     && (svc->protocol == protocol)
369                     && net_eq(svc->net, net)) {
370                         /* HIT */
371                         return svc;
372                 }
373         }
374
375         return NULL;
376 }
377
378
379 /*
380  *      Get service by {fwmark} in the service table.
381  */
382 static inline struct ip_vs_service *
383 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
384 {
385         unsigned hash;
386         struct ip_vs_service *svc;
387
388         /* Check for fwmark addressed entries */
389         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
390
391         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
392                 if (svc->fwmark == fwmark && svc->af == af
393                     && net_eq(svc->net, net)) {
394                         /* HIT */
395                         return svc;
396                 }
397         }
398
399         return NULL;
400 }
401
402 struct ip_vs_service *
403 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
404                   const union nf_inet_addr *vaddr, __be16 vport)
405 {
406         struct ip_vs_service *svc;
407         struct netns_ipvs *ipvs = net_ipvs(net);
408
409         read_lock(&__ip_vs_svc_lock);
410
411         /*
412          *      Check the table hashed by fwmark first
413          */
414         if (fwmark) {
415                 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
416                 if (svc)
417                         goto out;
418         }
419
420         /*
421          *      Check the table hashed by <protocol,addr,port>
422          *      for "full" addressed entries
423          */
424         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
425
426         if (svc == NULL
427             && protocol == IPPROTO_TCP
428             && atomic_read(&ipvs->ftpsvc_counter)
429             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
430                 /*
431                  * Check if ftp service entry exists, the packet
432                  * might belong to FTP data connections.
433                  */
434                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
435         }
436
437         if (svc == NULL
438             && atomic_read(&ipvs->nullsvc_counter)) {
439                 /*
440                  * Check if the catch-all port (port zero) exists
441                  */
442                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
443         }
444
445   out:
446         if (svc)
447                 atomic_inc(&svc->usecnt);
448         read_unlock(&__ip_vs_svc_lock);
449
450         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
451                       fwmark, ip_vs_proto_name(protocol),
452                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
453                       svc ? "hit" : "not hit");
454
455         return svc;
456 }
457
458
/*
 *      Bind @dest to @svc: take a reference on the service (svc->refcnt)
 *      and record it in dest->svc.
 */
459 static inline void
460 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
461 {
462         atomic_inc(&svc->refcnt);
463         dest->svc = svc;
464 }
465
466 static void
467 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
468 {
469         struct ip_vs_service *svc = dest->svc;
470
471         dest->svc = NULL;
472         if (atomic_dec_and_test(&svc->refcnt)) {
473                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
474                               svc->fwmark,
475                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
476                               ntohs(svc->port), atomic_read(&svc->usecnt));
477                 free_percpu(svc->stats.cpustats);
478                 kfree(svc);
479         }
480 }
481
482
483 /*
484  *      Returns hash value for real service
485  */
486 static inline unsigned ip_vs_rs_hashkey(int af,
487                                             const union nf_inet_addr *addr,
488                                             __be16 port)
489 {
490         register unsigned porth = ntohs(port);
491         __be32 addr_fold = addr->ip;
492
493 #ifdef CONFIG_IP_VS_IPV6
494         if (af == AF_INET6)
495                 addr_fold = addr->ip6[0]^addr->ip6[1]^
496                             addr->ip6[2]^addr->ip6[3];
497 #endif
498
499         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
500                 & IP_VS_RTAB_MASK;
501 }
502
503 /*
504  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
505  *      should be called with locked tables.
506  */
507 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
508 {
509         unsigned hash;
510
511         if (!list_empty(&dest->d_list)) {
512                 return 0;
513         }
514
515         /*
516          *      Hash by proto,addr,port,
517          *      which are the parameters of the real service.
518          */
519         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
520
521         list_add(&dest->d_list, &ipvs->rs_table[hash]);
522
523         return 1;
524 }
525
526 /*
527  *      UNhashes ip_vs_dest from rs_table.
528  *      should be called with locked tables.
529  */
530 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
531 {
532         /*
533          * Remove it from the rs_table table.
534          */
535         if (!list_empty(&dest->d_list)) {
536                 list_del(&dest->d_list);
537                 INIT_LIST_HEAD(&dest->d_list);
538         }
539
540         return 1;
541 }
542
543 /*
544  *      Lookup real service by <proto,addr,port> in the real service table.
545  */
546 struct ip_vs_dest *
547 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
548                           const union nf_inet_addr *daddr,
549                           __be16 dport)
550 {
551         struct netns_ipvs *ipvs = net_ipvs(net);
552         unsigned hash;
553         struct ip_vs_dest *dest;
554
555         /*
556          *      Check for "full" addressed entries
557          *      Return the first found entry
558          */
559         hash = ip_vs_rs_hashkey(af, daddr, dport);
560
561         read_lock(&ipvs->rs_lock);
562         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
563                 if ((dest->af == af)
564                     && ip_vs_addr_equal(af, &dest->addr, daddr)
565                     && (dest->port == dport)
566                     && ((dest->protocol == protocol) ||
567                         dest->vfwmark)) {
568                         /* HIT */
569                         read_unlock(&ipvs->rs_lock);
570                         return dest;
571                 }
572         }
573         read_unlock(&ipvs->rs_lock);
574
575         return NULL;
576 }
577
578 /*
579  *      Lookup destination by {addr,port} in the given service
580  */
581 static struct ip_vs_dest *
582 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
583                   __be16 dport)
584 {
585         struct ip_vs_dest *dest;
586
587         /*
588          * Find the destination for the given service
589          */
590         list_for_each_entry(dest, &svc->destinations, n_list) {
591                 if ((dest->af == svc->af)
592                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
593                     && (dest->port == dport)) {
594                         /* HIT */
595                         return dest;
596                 }
597         }
598
599         return NULL;
600 }
601
602 /*
603  * Find destination by {daddr,dport,vaddr,protocol}
604  * Cretaed to be used in ip_vs_process_message() in
605  * the backup synchronization daemon. It finds the
606  * destination to be bound to the received connection
607  * on the backup.
608  *
609  * ip_vs_lookup_real_service() looked promissing, but
610  * seems not working as expected.
611  */
612 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
613                                    const union nf_inet_addr *daddr,
614                                    __be16 dport,
615                                    const union nf_inet_addr *vaddr,
616                                    __be16 vport, __u16 protocol, __u32 fwmark)
617 {
618         struct ip_vs_dest *dest;
619         struct ip_vs_service *svc;
620
621         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
622         if (!svc)
623                 return NULL;
624         dest = ip_vs_lookup_dest(svc, daddr, dport);
625         if (dest)
626                 atomic_inc(&dest->refcnt);
627         ip_vs_service_put(svc);
628         return dest;
629 }
630
631 /*
632  *  Lookup dest by {svc,addr,port} in the destination trash.
633  *  The destination trash is used to hold the destinations that are removed
634  *  from the service table but are still referenced by some conn entries.
635  *  The reason to add the destination trash is when the dest is temporary
636  *  down (either by administrator or by monitor program), the dest can be
637  *  picked back from the trash, the remaining connections to the dest can
638  *  continue, and the counting information of the dest is also useful for
639  *  scheduling.
640  */
641 static struct ip_vs_dest *
642 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
643                      __be16 dport)
644 {
645         struct ip_vs_dest *dest, *nxt;
646         struct netns_ipvs *ipvs = net_ipvs(svc->net);
647
648         /*
649          * Find the destination in trash
650          */
651         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
652                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
653                               "dest->refcnt=%d\n",
654                               dest->vfwmark,
655                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
656                               ntohs(dest->port),
657                               atomic_read(&dest->refcnt));
658                 if (dest->af == svc->af &&
659                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
660                     dest->port == dport &&
661                     dest->vfwmark == svc->fwmark &&
662                     dest->protocol == svc->protocol &&
663                     (svc->fwmark ||
664                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
665                       dest->vport == svc->port))) {
666                         /* HIT */
667                         return dest;
668                 }
669
670                 /*
671                  * Try to purge the destination from trash if not referenced
672                  */
673                 if (atomic_read(&dest->refcnt) == 1) {
674                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
675                                       "from trash\n",
676                                       dest->vfwmark,
677                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
678                                       ntohs(dest->port));
679                         list_del(&dest->n_list);
680                         ip_vs_dst_reset(dest);
681                         __ip_vs_unbind_svc(dest);
682                         free_percpu(dest->stats.cpustats);
683                         kfree(dest);
684                 }
685         }
686
687         return NULL;
688 }
689
690
691 /*
692  *  Clean up all the destinations in the trash
693  *  Called by the ip_vs_control_cleanup()
694  *
695  *  When the ip_vs_control_clearup is activated by ipvs module exit,
696  *  the service tables must have been flushed and all the connections
697  *  are expired, and the refcnt of each destination in the trash must
698  *  be 1, so we simply release them here.
699  */
700 static void ip_vs_trash_cleanup(struct net *net)
701 {
702         struct ip_vs_dest *dest, *nxt;
703         struct netns_ipvs *ipvs = net_ipvs(net);
704
705         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
706                 list_del(&dest->n_list);
707                 ip_vs_dst_reset(dest);
708                 __ip_vs_unbind_svc(dest);
709                 free_percpu(dest->stats.cpustats);
710                 kfree(dest);
711         }
712 }
713
714 static void
715 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
716 {
717 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
718
719         spin_lock_bh(&src->lock);
720
721         IP_VS_SHOW_STATS_COUNTER(conns);
722         IP_VS_SHOW_STATS_COUNTER(inpkts);
723         IP_VS_SHOW_STATS_COUNTER(outpkts);
724         IP_VS_SHOW_STATS_COUNTER(inbytes);
725         IP_VS_SHOW_STATS_COUNTER(outbytes);
726
727         ip_vs_read_estimator(dst, src);
728
729         spin_unlock_bh(&src->lock);
730 }
731
732 static void
733 ip_vs_zero_stats(struct ip_vs_stats *stats)
734 {
735         spin_lock_bh(&stats->lock);
736
737         /* get current counters as zero point, rates are zeroed */
738
739 #define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
740
741         IP_VS_ZERO_STATS_COUNTER(conns);
742         IP_VS_ZERO_STATS_COUNTER(inpkts);
743         IP_VS_ZERO_STATS_COUNTER(outpkts);
744         IP_VS_ZERO_STATS_COUNTER(inbytes);
745         IP_VS_ZERO_STATS_COUNTER(outbytes);
746
747         ip_vs_zero_estimator(stats);
748
749         spin_unlock_bh(&stats->lock);
750 }
751
752 /*
753  *      Update a destination in the given service
754  */
755 static void
756 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
757                     struct ip_vs_dest_user_kern *udest, int add)
758 {
759         struct netns_ipvs *ipvs = net_ipvs(svc->net);
760         int conn_flags;
761
762         /* set the weight and the flags */
763         atomic_set(&dest->weight, udest->weight);
764         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
765         conn_flags |= IP_VS_CONN_F_INACTIVE;
766
767         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
768         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
769                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
770         } else {
771                 /*
772                  *    Put the real service in rs_table if not present.
773                  *    For now only for NAT!
774                  */
775                 write_lock_bh(&ipvs->rs_lock);
776                 ip_vs_rs_hash(ipvs, dest);
777                 write_unlock_bh(&ipvs->rs_lock);
778         }
779         atomic_set(&dest->conn_flags, conn_flags);
780
781         /* bind the service */
782         if (!dest->svc) {
783                 __ip_vs_bind_svc(dest, svc);
784         } else {
785                 if (dest->svc != svc) {
786                         __ip_vs_unbind_svc(dest);
787                         ip_vs_zero_stats(&dest->stats);
788                         __ip_vs_bind_svc(dest, svc);
789                 }
790         }
791
792         /* set the dest status flags */
793         dest->flags |= IP_VS_DEST_F_AVAILABLE;
794
795         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
796                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
797         dest->u_threshold = udest->u_threshold;
798         dest->l_threshold = udest->l_threshold;
799
800         spin_lock(&dest->dst_lock);
801         ip_vs_dst_reset(dest);
802         spin_unlock(&dest->dst_lock);
803
804         if (add)
805                 ip_vs_new_estimator(svc->net, &dest->stats);
806
807         write_lock_bh(&__ip_vs_svc_lock);
808
809         /* Wait until all other svc users go away */
810         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
811
812         if (add) {
813                 list_add(&dest->n_list, &svc->destinations);
814                 svc->num_dests++;
815         }
816
817         /* call the update_service, because server weight may be changed */
818         if (svc->scheduler->update_service)
819                 svc->scheduler->update_service(svc);
820
821         write_unlock_bh(&__ip_vs_svc_lock);
822 }
823
824
825 /*
826  *      Create a destination for the given service
827  */
828 static int
829 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
830                struct ip_vs_dest **dest_p)
831 {
832         struct ip_vs_dest *dest;
833         unsigned atype;
834
835         EnterFunction(2);
836
837 #ifdef CONFIG_IP_VS_IPV6
838         if (svc->af == AF_INET6) {
839                 atype = ipv6_addr_type(&udest->addr.in6);
840                 if ((!(atype & IPV6_ADDR_UNICAST) ||
841                         atype & IPV6_ADDR_LINKLOCAL) &&
842                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
843                         return -EINVAL;
844         } else
845 #endif
846         {
847                 atype = inet_addr_type(svc->net, udest->addr.ip);
848                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
849                         return -EINVAL;
850         }
851
852         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
853         if (dest == NULL) {
854                 pr_err("%s(): no memory.\n", __func__);
855                 return -ENOMEM;
856         }
857         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
858         if (!dest->stats.cpustats) {
859                 pr_err("%s() alloc_percpu failed\n", __func__);
860                 goto err_alloc;
861         }
862
863         dest->af = svc->af;
864         dest->protocol = svc->protocol;
865         dest->vaddr = svc->addr;
866         dest->vport = svc->port;
867         dest->vfwmark = svc->fwmark;
868         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
869         dest->port = udest->port;
870
871         atomic_set(&dest->activeconns, 0);
872         atomic_set(&dest->inactconns, 0);
873         atomic_set(&dest->persistconns, 0);
874         atomic_set(&dest->refcnt, 1);
875
876         INIT_LIST_HEAD(&dest->d_list);
877         spin_lock_init(&dest->dst_lock);
878         spin_lock_init(&dest->stats.lock);
879         __ip_vs_update_dest(svc, dest, udest, 1);
880
881         *dest_p = dest;
882
883         LeaveFunction(2);
884         return 0;
885
886 err_alloc:
887         kfree(dest);
888         return -ENOMEM;
889 }
890
891
892 /*
893  *      Add a destination into an existing service
894  */
895 static int
896 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
897 {
898         struct ip_vs_dest *dest;
899         union nf_inet_addr daddr;
900         __be16 dport = udest->port;
901         int ret;
902
903         EnterFunction(2);
904
905         if (udest->weight < 0) {
906                 pr_err("%s(): server weight less than zero\n", __func__);
907                 return -ERANGE;
908         }
909
910         if (udest->l_threshold > udest->u_threshold) {
911                 pr_err("%s(): lower threshold is higher than upper threshold\n",
912                         __func__);
913                 return -ERANGE;
914         }
915
916         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
917
918         /*
919          * Check if the dest already exists in the list
920          */
921         dest = ip_vs_lookup_dest(svc, &daddr, dport);
922
923         if (dest != NULL) {
924                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
925                 return -EEXIST;
926         }
927
928         /*
929          * Check if the dest already exists in the trash and
930          * is from the same service
931          */
932         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
933
934         if (dest != NULL) {
935                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
936                               "dest->refcnt=%d, service %u/%s:%u\n",
937                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
938                               atomic_read(&dest->refcnt),
939                               dest->vfwmark,
940                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
941                               ntohs(dest->vport));
942
943                 /*
944                  * Get the destination from the trash
945                  */
946                 list_del(&dest->n_list);
947
948                 __ip_vs_update_dest(svc, dest, udest, 1);
949                 ret = 0;
950         } else {
951                 /*
952                  * Allocate and initialize the dest structure
953                  */
954                 ret = ip_vs_new_dest(svc, udest, &dest);
955         }
956         LeaveFunction(2);
957
958         return ret;
959 }
960
961
962 /*
963  *      Edit a destination in the given service
964  */
965 static int
966 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
967 {
968         struct ip_vs_dest *dest;
969         union nf_inet_addr daddr;
970         __be16 dport = udest->port;
971
972         EnterFunction(2);
973
974         if (udest->weight < 0) {
975                 pr_err("%s(): server weight less than zero\n", __func__);
976                 return -ERANGE;
977         }
978
979         if (udest->l_threshold > udest->u_threshold) {
980                 pr_err("%s(): lower threshold is higher than upper threshold\n",
981                         __func__);
982                 return -ERANGE;
983         }
984
985         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
986
987         /*
988          *  Lookup the destination list
989          */
990         dest = ip_vs_lookup_dest(svc, &daddr, dport);
991
992         if (dest == NULL) {
993                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
994                 return -ENOENT;
995         }
996
997         __ip_vs_update_dest(svc, dest, udest, 0);
998         LeaveFunction(2);
999
1000         return 0;
1001 }
1002
1003
1004 /*
1005  *      Delete a destination (must be already unlinked from the service)
1006  */
1007 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1008 {
1009         struct netns_ipvs *ipvs = net_ipvs(net);
1010
1011         ip_vs_kill_estimator(net, &dest->stats);
1012
1013         /*
1014          *  Remove it from the d-linked list with the real services.
1015          */
1016         write_lock_bh(&ipvs->rs_lock);
1017         ip_vs_rs_unhash(dest);
1018         write_unlock_bh(&ipvs->rs_lock);
1019
1020         /*
1021          *  Decrease the refcnt of the dest, and free the dest
1022          *  if nobody refers to it (refcnt=0). Otherwise, throw
1023          *  the destination into the trash.
1024          */
1025         if (atomic_dec_and_test(&dest->refcnt)) {
1026                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1027                               dest->vfwmark,
1028                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1029                               ntohs(dest->port));
1030                 ip_vs_dst_reset(dest);
1031                 /* simply decrease svc->refcnt here, let the caller check
1032                    and release the service if nobody refers to it.
1033                    Only user context can release destination and service,
1034                    and only one user context can update virtual service at a
1035                    time, so the operation here is OK */
1036                 atomic_dec(&dest->svc->refcnt);
1037                 free_percpu(dest->stats.cpustats);
1038                 kfree(dest);
1039         } else {
1040                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1041                               "dest->refcnt=%d\n",
1042                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1043                               ntohs(dest->port),
1044                               atomic_read(&dest->refcnt));
1045                 list_add(&dest->n_list, &ipvs->dest_trash);
1046                 atomic_inc(&dest->refcnt);
1047         }
1048 }
1049
1050
1051 /*
1052  *      Unlink a destination from the given service
1053  */
1054 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1055                                 struct ip_vs_dest *dest,
1056                                 int svcupd)
1057 {
1058         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1059
1060         /*
1061          *  Remove it from the d-linked destination list.
1062          */
1063         list_del(&dest->n_list);
1064         svc->num_dests--;
1065
1066         /*
1067          *  Call the update_service function of its scheduler
1068          */
1069         if (svcupd && svc->scheduler->update_service)
1070                         svc->scheduler->update_service(svc);
1071 }
1072
1073
1074 /*
1075  *      Delete a destination server in the given service
1076  */
1077 static int
1078 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1079 {
1080         struct ip_vs_dest *dest;
1081         __be16 dport = udest->port;
1082
1083         EnterFunction(2);
1084
1085         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1086
1087         if (dest == NULL) {
1088                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1089                 return -ENOENT;
1090         }
1091
1092         write_lock_bh(&__ip_vs_svc_lock);
1093
1094         /*
1095          *      Wait until all other svc users go away.
1096          */
1097         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1098
1099         /*
1100          *      Unlink dest from the service
1101          */
1102         __ip_vs_unlink_dest(svc, dest, 1);
1103
1104         write_unlock_bh(&__ip_vs_svc_lock);
1105
1106         /*
1107          *      Delete the destination
1108          */
1109         __ip_vs_del_dest(svc->net, dest);
1110
1111         LeaveFunction(2);
1112
1113         return 0;
1114 }
1115
1116
1117 /*
1118  *      Add a service into the service hash table
1119  */
1120 static int
1121 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1122                   struct ip_vs_service **svc_p)
1123 {
1124         int ret = 0;
1125         struct ip_vs_scheduler *sched = NULL;
1126         struct ip_vs_pe *pe = NULL;
1127         struct ip_vs_service *svc = NULL;
1128         struct netns_ipvs *ipvs = net_ipvs(net);
1129
1130         /* increase the module use count */
1131         ip_vs_use_count_inc();
1132
1133         /* Lookup the scheduler by 'u->sched_name' */
1134         sched = ip_vs_scheduler_get(u->sched_name);
1135         if (sched == NULL) {
1136                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1137                 ret = -ENOENT;
1138                 goto out_err;
1139         }
1140
1141         if (u->pe_name && *u->pe_name) {
1142                 pe = ip_vs_pe_getbyname(u->pe_name);
1143                 if (pe == NULL) {
1144                         pr_info("persistence engine module ip_vs_pe_%s "
1145                                 "not found\n", u->pe_name);
1146                         ret = -ENOENT;
1147                         goto out_err;
1148                 }
1149         }
1150
1151 #ifdef CONFIG_IP_VS_IPV6
1152         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1153                 ret = -EINVAL;
1154                 goto out_err;
1155         }
1156 #endif
1157
1158         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1159         if (svc == NULL) {
1160                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1161                 ret = -ENOMEM;
1162                 goto out_err;
1163         }
1164         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1165         if (!svc->stats.cpustats) {
1166                 pr_err("%s() alloc_percpu failed\n", __func__);
1167                 goto out_err;
1168         }
1169
1170         /* I'm the first user of the service */
1171         atomic_set(&svc->usecnt, 0);
1172         atomic_set(&svc->refcnt, 0);
1173
1174         svc->af = u->af;
1175         svc->protocol = u->protocol;
1176         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1177         svc->port = u->port;
1178         svc->fwmark = u->fwmark;
1179         svc->flags = u->flags;
1180         svc->timeout = u->timeout * HZ;
1181         svc->netmask = u->netmask;
1182         svc->net = net;
1183
1184         INIT_LIST_HEAD(&svc->destinations);
1185         rwlock_init(&svc->sched_lock);
1186         spin_lock_init(&svc->stats.lock);
1187
1188         /* Bind the scheduler */
1189         ret = ip_vs_bind_scheduler(svc, sched);
1190         if (ret)
1191                 goto out_err;
1192         sched = NULL;
1193
1194         /* Bind the ct retriever */
1195         ip_vs_bind_pe(svc, pe);
1196         pe = NULL;
1197
1198         /* Update the virtual service counters */
1199         if (svc->port == FTPPORT)
1200                 atomic_inc(&ipvs->ftpsvc_counter);
1201         else if (svc->port == 0)
1202                 atomic_inc(&ipvs->nullsvc_counter);
1203
1204         ip_vs_new_estimator(net, &svc->stats);
1205
1206         /* Count only IPv4 services for old get/setsockopt interface */
1207         if (svc->af == AF_INET)
1208                 ipvs->num_services++;
1209
1210         /* Hash the service into the service table */
1211         write_lock_bh(&__ip_vs_svc_lock);
1212         ip_vs_svc_hash(svc);
1213         write_unlock_bh(&__ip_vs_svc_lock);
1214
1215         *svc_p = svc;
1216         return 0;
1217
1218
1219  out_err:
1220         if (svc != NULL) {
1221                 ip_vs_unbind_scheduler(svc);
1222                 if (svc->inc) {
1223                         local_bh_disable();
1224                         ip_vs_app_inc_put(svc->inc);
1225                         local_bh_enable();
1226                 }
1227                 if (svc->stats.cpustats)
1228                         free_percpu(svc->stats.cpustats);
1229                 kfree(svc);
1230         }
1231         ip_vs_scheduler_put(sched);
1232         ip_vs_pe_put(pe);
1233
1234         /* decrease the module use count */
1235         ip_vs_use_count_dec();
1236
1237         return ret;
1238 }
1239
1240
1241 /*
1242  *      Edit a service and bind it with a new scheduler
1243  */
1244 static int
1245 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1246 {
1247         struct ip_vs_scheduler *sched, *old_sched;
1248         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1249         int ret = 0;
1250
1251         /*
1252          * Lookup the scheduler, by 'u->sched_name'
1253          */
1254         sched = ip_vs_scheduler_get(u->sched_name);
1255         if (sched == NULL) {
1256                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1257                 return -ENOENT;
1258         }
1259         old_sched = sched;
1260
1261         if (u->pe_name && *u->pe_name) {
1262                 pe = ip_vs_pe_getbyname(u->pe_name);
1263                 if (pe == NULL) {
1264                         pr_info("persistence engine module ip_vs_pe_%s "
1265                                 "not found\n", u->pe_name);
1266                         ret = -ENOENT;
1267                         goto out;
1268                 }
1269                 old_pe = pe;
1270         }
1271
1272 #ifdef CONFIG_IP_VS_IPV6
1273         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1274                 ret = -EINVAL;
1275                 goto out;
1276         }
1277 #endif
1278
1279         write_lock_bh(&__ip_vs_svc_lock);
1280
1281         /*
1282          * Wait until all other svc users go away.
1283          */
1284         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1285
1286         /*
1287          * Set the flags and timeout value
1288          */
1289         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1290         svc->timeout = u->timeout * HZ;
1291         svc->netmask = u->netmask;
1292
1293         old_sched = svc->scheduler;
1294         if (sched != old_sched) {
1295                 /*
1296                  * Unbind the old scheduler
1297                  */
1298                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1299                         old_sched = sched;
1300                         goto out_unlock;
1301                 }
1302
1303                 /*
1304                  * Bind the new scheduler
1305                  */
1306                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1307                         /*
1308                          * If ip_vs_bind_scheduler fails, restore the old
1309                          * scheduler.
1310                          * The main reason of failure is out of memory.
1311                          *
1312                          * The question is if the old scheduler can be
1313                          * restored all the time. TODO: if it cannot be
1314                          * restored some time, we must delete the service,
1315                          * otherwise the system may crash.
1316                          */
1317                         ip_vs_bind_scheduler(svc, old_sched);
1318                         old_sched = sched;
1319                         goto out_unlock;
1320                 }
1321         }
1322
1323         old_pe = svc->pe;
1324         if (pe != old_pe) {
1325                 ip_vs_unbind_pe(svc);
1326                 ip_vs_bind_pe(svc, pe);
1327         }
1328
1329   out_unlock:
1330         write_unlock_bh(&__ip_vs_svc_lock);
1331   out:
1332         ip_vs_scheduler_put(old_sched);
1333         ip_vs_pe_put(old_pe);
1334         return ret;
1335 }
1336
1337
1338 /*
1339  *      Delete a service from the service list
1340  *      - The service must be unlinked, unlocked and not referenced!
1341  *      - We are called under _bh lock
1342  */
1343 static void __ip_vs_del_service(struct ip_vs_service *svc)
1344 {
1345         struct ip_vs_dest *dest, *nxt;
1346         struct ip_vs_scheduler *old_sched;
1347         struct ip_vs_pe *old_pe;
1348         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1349
1350         pr_info("%s: enter\n", __func__);
1351
1352         /* Count only IPv4 services for old get/setsockopt interface */
1353         if (svc->af == AF_INET)
1354                 ipvs->num_services--;
1355
1356         ip_vs_kill_estimator(svc->net, &svc->stats);
1357
1358         /* Unbind scheduler */
1359         old_sched = svc->scheduler;
1360         ip_vs_unbind_scheduler(svc);
1361         ip_vs_scheduler_put(old_sched);
1362
1363         /* Unbind persistence engine */
1364         old_pe = svc->pe;
1365         ip_vs_unbind_pe(svc);
1366         ip_vs_pe_put(old_pe);
1367
1368         /* Unbind app inc */
1369         if (svc->inc) {
1370                 ip_vs_app_inc_put(svc->inc);
1371                 svc->inc = NULL;
1372         }
1373
1374         /*
1375          *    Unlink the whole destination list
1376          */
1377         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1378                 __ip_vs_unlink_dest(svc, dest, 0);
1379                 __ip_vs_del_dest(svc->net, dest);
1380         }
1381
1382         /*
1383          *    Update the virtual service counters
1384          */
1385         if (svc->port == FTPPORT)
1386                 atomic_dec(&ipvs->ftpsvc_counter);
1387         else if (svc->port == 0)
1388                 atomic_dec(&ipvs->nullsvc_counter);
1389
1390         /*
1391          *    Free the service if nobody refers to it
1392          */
1393         if (atomic_read(&svc->refcnt) == 0) {
1394                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1395                               svc->fwmark,
1396                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1397                               ntohs(svc->port), atomic_read(&svc->usecnt));
1398                 free_percpu(svc->stats.cpustats);
1399                 kfree(svc);
1400         }
1401
1402         /* decrease the module use count */
1403         ip_vs_use_count_dec();
1404 }
1405
1406 /*
1407  * Unlink a service from list and try to delete it if its refcnt reached 0
1408  */
1409 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1410 {
1411         /*
1412          * Unhash it from the service table
1413          */
1414         write_lock_bh(&__ip_vs_svc_lock);
1415
1416         ip_vs_svc_unhash(svc);
1417
1418         /*
1419          * Wait until all the svc users go away.
1420          */
1421         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1422
1423         __ip_vs_del_service(svc);
1424
1425         write_unlock_bh(&__ip_vs_svc_lock);
1426 }
1427
1428 /*
1429  *      Delete a service from the service list
1430  */
1431 static int ip_vs_del_service(struct ip_vs_service *svc)
1432 {
1433         if (svc == NULL)
1434                 return -EEXIST;
1435         ip_vs_unlink_service(svc);
1436
1437         return 0;
1438 }
1439
1440
1441 /*
1442  *      Flush all the virtual services
1443  */
1444 static int ip_vs_flush(struct net *net)
1445 {
1446         int idx;
1447         struct ip_vs_service *svc, *nxt;
1448
1449         /*
1450          * Flush the service table hashed by <netns,protocol,addr,port>
1451          */
1452         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1453                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1454                                          s_list) {
1455                         if (net_eq(svc->net, net))
1456                                 ip_vs_unlink_service(svc);
1457                 }
1458         }
1459
1460         /*
1461          * Flush the service table hashed by fwmark
1462          */
1463         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1464                 list_for_each_entry_safe(svc, nxt,
1465                                          &ip_vs_svc_fwm_table[idx], f_list) {
1466                         if (net_eq(svc->net, net))
1467                                 ip_vs_unlink_service(svc);
1468                 }
1469         }
1470
1471         return 0;
1472 }
1473
1474
1475 /*
1476  *      Zero counters in a service or all services
1477  */
1478 static int ip_vs_zero_service(struct ip_vs_service *svc)
1479 {
1480         struct ip_vs_dest *dest;
1481
1482         write_lock_bh(&__ip_vs_svc_lock);
1483         list_for_each_entry(dest, &svc->destinations, n_list) {
1484                 ip_vs_zero_stats(&dest->stats);
1485         }
1486         ip_vs_zero_stats(&svc->stats);
1487         write_unlock_bh(&__ip_vs_svc_lock);
1488         return 0;
1489 }
1490
1491 static int ip_vs_zero_all(struct net *net)
1492 {
1493         int idx;
1494         struct ip_vs_service *svc;
1495
1496         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1497                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1498                         if (net_eq(svc->net, net))
1499                                 ip_vs_zero_service(svc);
1500                 }
1501         }
1502
1503         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1504                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1505                         if (net_eq(svc->net, net))
1506                                 ip_vs_zero_service(svc);
1507                 }
1508         }
1509
1510         ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1511         return 0;
1512 }
1513
1514
1515 static int
1516 proc_do_defense_mode(ctl_table *table, int write,
1517                      void __user *buffer, size_t *lenp, loff_t *ppos)
1518 {
1519         struct net *net = current->nsproxy->net_ns;
1520         int *valp = table->data;
1521         int val = *valp;
1522         int rc;
1523
1524         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1525         if (write && (*valp != val)) {
1526                 if ((*valp < 0) || (*valp > 3)) {
1527                         /* Restore the correct value */
1528                         *valp = val;
1529                 } else {
1530                         update_defense_level(net_ipvs(net));
1531                 }
1532         }
1533         return rc;
1534 }
1535
1536
1537 static int
1538 proc_do_sync_threshold(ctl_table *table, int write,
1539                        void __user *buffer, size_t *lenp, loff_t *ppos)
1540 {
1541         int *valp = table->data;
1542         int val[2];
1543         int rc;
1544
1545         /* backup the value first */
1546         memcpy(val, valp, sizeof(val));
1547
1548         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1549         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1550                 /* Restore the correct value */
1551                 memcpy(valp, val, sizeof(val));
1552         }
1553         return rc;
1554 }
1555
1556 static int
1557 proc_do_sync_mode(ctl_table *table, int write,
1558                      void __user *buffer, size_t *lenp, loff_t *ppos)
1559 {
1560         int *valp = table->data;
1561         int val = *valp;
1562         int rc;
1563
1564         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1565         if (write && (*valp != val)) {
1566                 if ((*valp < 0) || (*valp > 1)) {
1567                         /* Restore the correct value */
1568                         *valp = val;
1569                 } else {
1570                         struct net *net = current->nsproxy->net_ns;
1571                         ip_vs_sync_switch_mode(net, val);
1572                 }
1573         }
1574         return rc;
1575 }
1576
1577 /*
1578  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1579  *      Do not change order or insert new entries without
1580  *      align with netns init in __ip_vs_control_init()
      *
      *      Only "debug_level" carries a .data pointer in this table; the
      *      other live entries leave it unset here (presumably it is
      *      filled in per netns by __ip_vs_control_init() - confirm there).
      *      The timeout_* entries at the end sit inside "#if 0" and are
      *      therefore not compiled.
1581  */
1582
1583 static struct ctl_table vs_vars[] = {
1584         {
1585                 .procname       = "amemthresh",
1586                 .maxlen         = sizeof(int),
1587                 .mode           = 0644,
1588                 .proc_handler   = proc_dointvec,
1589         },
1590         {
1591                 .procname       = "am_droprate",
1592                 .maxlen         = sizeof(int),
1593                 .mode           = 0644,
1594                 .proc_handler   = proc_dointvec,
1595         },
1596         {
1597                 .procname       = "drop_entry",
1598                 .maxlen         = sizeof(int),
1599                 .mode           = 0644,
1600                 .proc_handler   = proc_do_defense_mode,
1601         },
1602         {
1603                 .procname       = "drop_packet",
1604                 .maxlen         = sizeof(int),
1605                 .mode           = 0644,
1606                 .proc_handler   = proc_do_defense_mode,
1607         },
1608 #ifdef CONFIG_IP_VS_NFCT
1609         {
1610                 .procname       = "conntrack",
1611                 .maxlen         = sizeof(int),
1612                 .mode           = 0644,
1613                 .proc_handler   = &proc_dointvec,
1614         },
1615 #endif
1616         {
1617                 .procname       = "secure_tcp",
1618                 .maxlen         = sizeof(int),
1619                 .mode           = 0644,
1620                 .proc_handler   = proc_do_defense_mode,
1621         },
1622         {
1623                 .procname       = "snat_reroute",
1624                 .maxlen         = sizeof(int),
1625                 .mode           = 0644,
1626                 .proc_handler   = &proc_dointvec,
1627         },
1628         {
1629                 .procname       = "sync_version",
1630                 .maxlen         = sizeof(int),
1631                 .mode           = 0644,
1632                 .proc_handler   = &proc_do_sync_mode,
1633         },
1634         {
1635                 .procname       = "cache_bypass",
1636                 .maxlen         = sizeof(int),
1637                 .mode           = 0644,
1638                 .proc_handler   = proc_dointvec,
1639         },
1640         {
1641                 .procname       = "expire_nodest_conn",
1642                 .maxlen         = sizeof(int),
1643                 .mode           = 0644,
1644                 .proc_handler   = proc_dointvec,
1645         },
1646         {
1647                 .procname       = "expire_quiescent_template",
1648                 .maxlen         = sizeof(int),
1649                 .mode           = 0644,
1650                 .proc_handler   = proc_dointvec,
1651         },
1652         {
1653                 .procname       = "sync_threshold",
1654                 .maxlen         =
1655                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1656                 .mode           = 0644,
1657                 .proc_handler   = proc_do_sync_threshold,
1658         },
1659         {
1660                 .procname       = "nat_icmp_send",
1661                 .maxlen         = sizeof(int),
1662                 .mode           = 0644,
1663                 .proc_handler   = proc_dointvec,
1664         },
1665 #ifdef CONFIG_IP_VS_DEBUG
1666         {
1667                 .procname       = "debug_level",
1668                 .data           = &sysctl_ip_vs_debug_level,
1669                 .maxlen         = sizeof(int),
1670                 .mode           = 0644,
1671                 .proc_handler   = proc_dointvec,
1672         },
1673 #endif
1674 #if 0
1675         {
1676                 .procname       = "timeout_established",
1677                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1678                 .maxlen         = sizeof(int),
1679                 .mode           = 0644,
1680                 .proc_handler   = proc_dointvec_jiffies,
1681         },
1682         {
1683                 .procname       = "timeout_synsent",
1684                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1685                 .maxlen         = sizeof(int),
1686                 .mode           = 0644,
1687                 .proc_handler   = proc_dointvec_jiffies,
1688         },
1689         {
1690                 .procname       = "timeout_synrecv",
1691                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1692                 .maxlen         = sizeof(int),
1693                 .mode           = 0644,
1694                 .proc_handler   = proc_dointvec_jiffies,
1695         },
1696         {
1697                 .procname       = "timeout_finwait",
1698                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1699                 .maxlen         = sizeof(int),
1700                 .mode           = 0644,
1701                 .proc_handler   = proc_dointvec_jiffies,
1702         },
1703         {
1704                 .procname       = "timeout_timewait",
1705                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1706                 .maxlen         = sizeof(int),
1707                 .mode           = 0644,
1708                 .proc_handler   = proc_dointvec_jiffies,
1709         },
1710         {
1711                 .procname       = "timeout_close",
1712                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1713                 .maxlen         = sizeof(int),
1714                 .mode           = 0644,
1715                 .proc_handler   = proc_dointvec_jiffies,
1716         },
1717         {
1718                 .procname       = "timeout_closewait",
1719                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1720                 .maxlen         = sizeof(int),
1721                 .mode           = 0644,
1722                 .proc_handler   = proc_dointvec_jiffies,
1723         },
1724         {
1725                 .procname       = "timeout_lastack",
1726                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1727                 .maxlen         = sizeof(int),
1728                 .mode           = 0644,
1729                 .proc_handler   = proc_dointvec_jiffies,
1730         },
1731         {
1732                 .procname       = "timeout_listen",
1733                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1734                 .maxlen         = sizeof(int),
1735                 .mode           = 0644,
1736                 .proc_handler   = proc_dointvec_jiffies,
1737         },
1738         {
1739                 .procname       = "timeout_synack",
1740                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1741                 .maxlen         = sizeof(int),
1742                 .mode           = 0644,
1743                 .proc_handler   = proc_dointvec_jiffies,
1744         },
1745         {
1746                 .procname       = "timeout_udp",
1747                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1748                 .maxlen         = sizeof(int),
1749                 .mode           = 0644,
1750                 .proc_handler   = proc_dointvec_jiffies,
1751         },
1752         {
1753                 .procname       = "timeout_icmp",
1754                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1755                 .maxlen         = sizeof(int),
1756                 .mode           = 0644,
1757                 .proc_handler   = proc_dointvec_jiffies,
1758         },
1759 #endif
1760         { }
1761 };
1762
/* Sysctl path components net/ipv4/vs, terminated by an empty entry; exported to GPL modules. */
1763 const struct ctl_path net_vs_ctl_path[] = {
1764         { .procname = "net", },
1765         { .procname = "ipv4", },
1766         { .procname = "vs", },
1767         { }
1768 };
1769 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1770
1771 #ifdef CONFIG_PROC_FS
1772
/*
 * Iteration state kept in seq->private while walking the service tables:
 * remembers which table and which bucket the current service came from.
 */
1773 struct ip_vs_iter {
1774         struct seq_net_private p;  /* Do not move this, netns depends upon it*/
1775         struct list_head *table;   /* ip_vs_svc_table or ip_vs_svc_fwm_table */
1776         int bucket;                /* index of the current bucket in that table */
1777 };
1778
1779 /*
1780  *      Write the contents of the VS rule table to a PROCfs file.
1781  *      (It is kept just for backward compatibility)
1782  */
1783 static inline const char *ip_vs_fwd_name(unsigned flags)
1784 {
1785         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1786         case IP_VS_CONN_F_LOCALNODE:
1787                 return "Local";
1788         case IP_VS_CONN_F_TUNNEL:
1789                 return "Tunnel";
1790         case IP_VS_CONN_F_DROUTE:
1791                 return "Route";
1792         default:
1793                 return "Masq";
1794         }
1795 }
1796
1797
1798 /* Get the Nth entry in the two lists */
1799 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1800 {
1801         struct net *net = seq_file_net(seq);
1802         struct ip_vs_iter *iter = seq->private;
1803         int idx;
1804         struct ip_vs_service *svc;
1805
1806         /* look in hash by protocol */
1807         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1808                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1809                         if (net_eq(svc->net, net) && pos-- == 0) {
1810                                 iter->table = ip_vs_svc_table;
1811                                 iter->bucket = idx;
1812                                 return svc;
1813                         }
1814                 }
1815         }
1816
1817         /* keep looking in fwmark */
1818         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1819                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1820                         if (net_eq(svc->net, net) && pos-- == 0) {
1821                                 iter->table = ip_vs_svc_fwm_table;
1822                                 iter->bucket = idx;
1823                                 return svc;
1824                         }
1825                 }
1826         }
1827
1828         return NULL;
1829 }
1830
1831 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1832 __acquires(__ip_vs_svc_lock)
1833 {
1834
1835         read_lock_bh(&__ip_vs_svc_lock);
1836         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1837 }
1838
1839
1840 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1841 {
1842         struct list_head *e;
1843         struct ip_vs_iter *iter;
1844         struct ip_vs_service *svc;
1845
1846         ++*pos;
1847         if (v == SEQ_START_TOKEN)
1848                 return ip_vs_info_array(seq,0);
1849
1850         svc = v;
1851         iter = seq->private;
1852
1853         if (iter->table == ip_vs_svc_table) {
1854                 /* next service in table hashed by protocol */
1855                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1856                         return list_entry(e, struct ip_vs_service, s_list);
1857
1858
1859                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1860                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1861                                             s_list) {
1862                                 return svc;
1863                         }
1864                 }
1865
1866                 iter->table = ip_vs_svc_fwm_table;
1867                 iter->bucket = -1;
1868                 goto scan_fwmark;
1869         }
1870
1871         /* next service in hashed by fwmark */
1872         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1873                 return list_entry(e, struct ip_vs_service, f_list);
1874
1875  scan_fwmark:
1876         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1877                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1878                                     f_list)
1879                         return svc;
1880         }
1881
1882         return NULL;
1883 }
1884
1885 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1886 __releases(__ip_vs_svc_lock)
1887 {
1888         read_unlock_bh(&__ip_vs_svc_lock);
1889 }
1890
1891
1892 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1893 {
1894         if (v == SEQ_START_TOKEN) {
1895                 seq_printf(seq,
1896                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1897                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
1898                 seq_puts(seq,
1899                          "Prot LocalAddress:Port Scheduler Flags\n");
1900                 seq_puts(seq,
1901                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1902         } else {
1903                 const struct ip_vs_service *svc = v;
1904                 const struct ip_vs_iter *iter = seq->private;
1905                 const struct ip_vs_dest *dest;
1906
1907                 if (iter->table == ip_vs_svc_table) {
1908 #ifdef CONFIG_IP_VS_IPV6
1909                         if (svc->af == AF_INET6)
1910                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
1911                                            ip_vs_proto_name(svc->protocol),
1912                                            &svc->addr.in6,
1913                                            ntohs(svc->port),
1914                                            svc->scheduler->name);
1915                         else
1916 #endif
1917                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
1918                                            ip_vs_proto_name(svc->protocol),
1919                                            ntohl(svc->addr.ip),
1920                                            ntohs(svc->port),
1921                                            svc->scheduler->name,
1922                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1923                 } else {
1924                         seq_printf(seq, "FWM  %08X %s %s",
1925                                    svc->fwmark, svc->scheduler->name,
1926                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1927                 }
1928
1929                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1930                         seq_printf(seq, "persistent %d %08X\n",
1931                                 svc->timeout,
1932                                 ntohl(svc->netmask));
1933                 else
1934                         seq_putc(seq, '\n');
1935
1936                 list_for_each_entry(dest, &svc->destinations, n_list) {
1937 #ifdef CONFIG_IP_VS_IPV6
1938                         if (dest->af == AF_INET6)
1939                                 seq_printf(seq,
1940                                            "  -> [%pI6]:%04X"
1941                                            "      %-7s %-6d %-10d %-10d\n",
1942                                            &dest->addr.in6,
1943                                            ntohs(dest->port),
1944                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1945                                            atomic_read(&dest->weight),
1946                                            atomic_read(&dest->activeconns),
1947                                            atomic_read(&dest->inactconns));
1948                         else
1949 #endif
1950                                 seq_printf(seq,
1951                                            "  -> %08X:%04X      "
1952                                            "%-7s %-6d %-10d %-10d\n",
1953                                            ntohl(dest->addr.ip),
1954                                            ntohs(dest->port),
1955                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1956                                            atomic_read(&dest->weight),
1957                                            atomic_read(&dest->activeconns),
1958                                            atomic_read(&dest->inactconns));
1959
1960                 }
1961         }
1962         return 0;
1963 }
1964
1965 static const struct seq_operations ip_vs_info_seq_ops = {
1966         .start = ip_vs_info_seq_start,
1967         .next  = ip_vs_info_seq_next,
1968         .stop  = ip_vs_info_seq_stop,
1969         .show  = ip_vs_info_seq_show,
1970 };
1971
1972 static int ip_vs_info_open(struct inode *inode, struct file *file)
1973 {
1974         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
1975                         sizeof(struct ip_vs_iter));
1976 }
1977
1978 static const struct file_operations ip_vs_info_fops = {
1979         .owner   = THIS_MODULE,
1980         .open    = ip_vs_info_open,
1981         .read    = seq_read,
1982         .llseek  = seq_lseek,
1983         .release = seq_release_private,
1984 };
1985
1986 #endif
1987
1988 #ifdef CONFIG_PROC_FS
1989 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1990 {
1991         struct net *net = seq_file_single_net(seq);
1992         struct ip_vs_stats_user show;
1993
1994 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1995         seq_puts(seq,
1996                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1997         seq_printf(seq,
1998                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1999
2000         ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2001         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
2002                    show.inpkts, show.outpkts,
2003                    (unsigned long long) show.inbytes,
2004                    (unsigned long long) show.outbytes);
2005
2006 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2007         seq_puts(seq,
2008                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2009         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
2010                         show.cps, show.inpps, show.outpps,
2011                         show.inbps, show.outbps);
2012
2013         return 0;
2014 }
2015
2016 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2017 {
2018         return single_open_net(inode, file, ip_vs_stats_show);
2019 }
2020
2021 static const struct file_operations ip_vs_stats_fops = {
2022         .owner = THIS_MODULE,
2023         .open = ip_vs_stats_seq_open,
2024         .read = seq_read,
2025         .llseek = seq_lseek,
2026         .release = single_release,
2027 };
2028
2029 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2030 {
2031         struct net *net = seq_file_single_net(seq);
2032         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2033         struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2034         struct ip_vs_stats_user rates;
2035         int i;
2036
2037 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2038         seq_puts(seq,
2039                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2040         seq_printf(seq,
2041                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2042
2043         for_each_possible_cpu(i) {
2044                 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2045                 unsigned int start;
2046                 __u64 inbytes, outbytes;
2047
2048                 do {
2049                         start = u64_stats_fetch_begin_bh(&u->syncp);
2050                         inbytes = u->ustats.inbytes;
2051                         outbytes = u->ustats.outbytes;
2052                 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2053
2054                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2055                            i, u->ustats.conns, u->ustats.inpkts,
2056                            u->ustats.outpkts, (__u64)inbytes,
2057                            (__u64)outbytes);
2058         }
2059
2060         spin_lock_bh(&tot_stats->lock);
2061
2062         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2063                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2064                    tot_stats->ustats.outpkts,
2065                    (unsigned long long) tot_stats->ustats.inbytes,
2066                    (unsigned long long) tot_stats->ustats.outbytes);
2067
2068         ip_vs_read_estimator(&rates, tot_stats);
2069
2070         spin_unlock_bh(&tot_stats->lock);
2071
2072 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2073         seq_puts(seq,
2074                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2075         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2076                         rates.cps,
2077                         rates.inpps,
2078                         rates.outpps,
2079                         rates.inbps,
2080                         rates.outbps);
2081
2082         return 0;
2083 }
2084
2085 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2086 {
2087         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2088 }
2089
2090 static const struct file_operations ip_vs_stats_percpu_fops = {
2091         .owner = THIS_MODULE,
2092         .open = ip_vs_stats_percpu_seq_open,
2093         .read = seq_read,
2094         .llseek = seq_lseek,
2095         .release = single_release,
2096 };
2097 #endif
2098
2099 /*
2100  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2101  */
2102 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2103 {
2104 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2105         struct ip_vs_proto_data *pd;
2106 #endif
2107
2108         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2109                   u->tcp_timeout,
2110                   u->tcp_fin_timeout,
2111                   u->udp_timeout);
2112
2113 #ifdef CONFIG_IP_VS_PROTO_TCP
2114         if (u->tcp_timeout) {
2115                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2116                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2117                         = u->tcp_timeout * HZ;
2118         }
2119
2120         if (u->tcp_fin_timeout) {
2121                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2122                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2123                         = u->tcp_fin_timeout * HZ;
2124         }
2125 #endif
2126
2127 #ifdef CONFIG_IP_VS_PROTO_UDP
2128         if (u->udp_timeout) {
2129                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2130                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2131                         = u->udp_timeout * HZ;
2132         }
2133 #endif
2134         return 0;
2135 }
2136
2137
2138 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2139 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2140 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2141                                  sizeof(struct ip_vs_dest_user))
2142 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2143 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2144 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2145
2146 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2147         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2148         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2149         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2150         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2151         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2152         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2153         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2154         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2155         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2156         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2157         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2158 };
2159
2160 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2161                                   struct ip_vs_service_user *usvc_compat)
2162 {
2163         memset(usvc, 0, sizeof(*usvc));
2164
2165         usvc->af                = AF_INET;
2166         usvc->protocol          = usvc_compat->protocol;
2167         usvc->addr.ip           = usvc_compat->addr;
2168         usvc->port              = usvc_compat->port;
2169         usvc->fwmark            = usvc_compat->fwmark;
2170
2171         /* Deep copy of sched_name is not needed here */
2172         usvc->sched_name        = usvc_compat->sched_name;
2173
2174         usvc->flags             = usvc_compat->flags;
2175         usvc->timeout           = usvc_compat->timeout;
2176         usvc->netmask           = usvc_compat->netmask;
2177 }
2178
2179 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2180                                    struct ip_vs_dest_user *udest_compat)
2181 {
2182         memset(udest, 0, sizeof(*udest));
2183
2184         udest->addr.ip          = udest_compat->addr;
2185         udest->port             = udest_compat->port;
2186         udest->conn_flags       = udest_compat->conn_flags;
2187         udest->weight           = udest_compat->weight;
2188         udest->u_threshold      = udest_compat->u_threshold;
2189         udest->l_threshold      = udest_compat->l_threshold;
2190 }
2191
2192 static int
2193 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2194 {
2195         struct net *net = sock_net(sk);
2196         int ret;
2197         unsigned char arg[MAX_ARG_LEN];
2198         struct ip_vs_service_user *usvc_compat;
2199         struct ip_vs_service_user_kern usvc;
2200         struct ip_vs_service *svc;
2201         struct ip_vs_dest_user *udest_compat;
2202         struct ip_vs_dest_user_kern udest;
2203
2204         if (!capable(CAP_NET_ADMIN))
2205                 return -EPERM;
2206
2207         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2208                 return -EINVAL;
2209         if (len < 0 || len >  MAX_ARG_LEN)
2210                 return -EINVAL;
2211         if (len != set_arglen[SET_CMDID(cmd)]) {
2212                 pr_err("set_ctl: len %u != %u\n",
2213                        len, set_arglen[SET_CMDID(cmd)]);
2214                 return -EINVAL;
2215         }
2216
2217         if (copy_from_user(arg, user, len) != 0)
2218                 return -EFAULT;
2219
2220         /* increase the module use count */
2221         ip_vs_use_count_inc();
2222
2223         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2224                 ret = -ERESTARTSYS;
2225                 goto out_dec;
2226         }
2227
2228         if (cmd == IP_VS_SO_SET_FLUSH) {
2229                 /* Flush the virtual service */
2230                 ret = ip_vs_flush(net);
2231                 goto out_unlock;
2232         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2233                 /* Set timeout values for (tcp tcpfin udp) */
2234                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2235                 goto out_unlock;
2236         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2237                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2238                 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2239                                         dm->syncid);
2240                 goto out_unlock;
2241         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2242                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2243                 ret = stop_sync_thread(net, dm->state);
2244                 goto out_unlock;
2245         }
2246
2247         usvc_compat = (struct ip_vs_service_user *)arg;
2248         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2249
2250         /* We only use the new structs internally, so copy userspace compat
2251          * structs to extended internal versions */
2252         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2253         ip_vs_copy_udest_compat(&udest, udest_compat);
2254
2255         if (cmd == IP_VS_SO_SET_ZERO) {
2256                 /* if no service address is set, zero counters in all */
2257                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2258                         ret = ip_vs_zero_all(net);
2259                         goto out_unlock;
2260                 }
2261         }
2262
2263         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2264         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2265             usvc.protocol != IPPROTO_SCTP) {
2266                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2267                        usvc.protocol, &usvc.addr.ip,
2268                        ntohs(usvc.port), usvc.sched_name);
2269                 ret = -EFAULT;
2270                 goto out_unlock;
2271         }
2272
2273         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2274         if (usvc.fwmark == 0)
2275                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2276                                            &usvc.addr, usvc.port);
2277         else
2278                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2279
2280         if (cmd != IP_VS_SO_SET_ADD
2281             && (svc == NULL || svc->protocol != usvc.protocol)) {
2282                 ret = -ESRCH;
2283                 goto out_unlock;
2284         }
2285
2286         switch (cmd) {
2287         case IP_VS_SO_SET_ADD:
2288                 if (svc != NULL)
2289                         ret = -EEXIST;
2290                 else
2291                         ret = ip_vs_add_service(net, &usvc, &svc);
2292                 break;
2293         case IP_VS_SO_SET_EDIT:
2294                 ret = ip_vs_edit_service(svc, &usvc);
2295                 break;
2296         case IP_VS_SO_SET_DEL:
2297                 ret = ip_vs_del_service(svc);
2298                 if (!ret)
2299                         goto out_unlock;
2300                 break;
2301         case IP_VS_SO_SET_ZERO:
2302                 ret = ip_vs_zero_service(svc);
2303                 break;
2304         case IP_VS_SO_SET_ADDDEST:
2305                 ret = ip_vs_add_dest(svc, &udest);
2306                 break;
2307         case IP_VS_SO_SET_EDITDEST:
2308                 ret = ip_vs_edit_dest(svc, &udest);
2309                 break;
2310         case IP_VS_SO_SET_DELDEST:
2311                 ret = ip_vs_del_dest(svc, &udest);
2312                 break;
2313         default:
2314                 ret = -EINVAL;
2315         }
2316
2317   out_unlock:
2318         mutex_unlock(&__ip_vs_mutex);
2319   out_dec:
2320         /* decrease the module use count */
2321         ip_vs_use_count_dec();
2322
2323         return ret;
2324 }
2325
2326
2327 static void
2328 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2329 {
2330         dst->protocol = src->protocol;
2331         dst->addr = src->addr.ip;
2332         dst->port = src->port;
2333         dst->fwmark = src->fwmark;
2334         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2335         dst->flags = src->flags;
2336         dst->timeout = src->timeout / HZ;
2337         dst->netmask = src->netmask;
2338         dst->num_dests = src->num_dests;
2339         ip_vs_copy_stats(&dst->stats, &src->stats);
2340 }
2341
2342 static inline int
2343 __ip_vs_get_service_entries(struct net *net,
2344                             const struct ip_vs_get_services *get,
2345                             struct ip_vs_get_services __user *uptr)
2346 {
2347         int idx, count=0;
2348         struct ip_vs_service *svc;
2349         struct ip_vs_service_entry entry;
2350         int ret = 0;
2351
2352         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2353                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2354                         /* Only expose IPv4 entries to old interface */
2355                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2356                                 continue;
2357
2358                         if (count >= get->num_services)
2359                                 goto out;
2360                         memset(&entry, 0, sizeof(entry));
2361                         ip_vs_copy_service(&entry, svc);
2362                         if (copy_to_user(&uptr->entrytable[count],
2363                                          &entry, sizeof(entry))) {
2364                                 ret = -EFAULT;
2365                                 goto out;
2366                         }
2367                         count++;
2368                 }
2369         }
2370
2371         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2372                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2373                         /* Only expose IPv4 entries to old interface */
2374                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2375                                 continue;
2376
2377                         if (count >= get->num_services)
2378                                 goto out;
2379                         memset(&entry, 0, sizeof(entry));
2380                         ip_vs_copy_service(&entry, svc);
2381                         if (copy_to_user(&uptr->entrytable[count],
2382                                          &entry, sizeof(entry))) {
2383                                 ret = -EFAULT;
2384                                 goto out;
2385                         }
2386                         count++;
2387                 }
2388         }
2389   out:
2390         return ret;
2391 }
2392
2393 static inline int
2394 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2395                          struct ip_vs_get_dests __user *uptr)
2396 {
2397         struct ip_vs_service *svc;
2398         union nf_inet_addr addr = { .ip = get->addr };
2399         int ret = 0;
2400
2401         if (get->fwmark)
2402                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2403         else
2404                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2405                                            get->port);
2406
2407         if (svc) {
2408                 int count = 0;
2409                 struct ip_vs_dest *dest;
2410                 struct ip_vs_dest_entry entry;
2411
2412                 list_for_each_entry(dest, &svc->destinations, n_list) {
2413                         if (count >= get->num_dests)
2414                                 break;
2415
2416                         entry.addr = dest->addr.ip;
2417                         entry.port = dest->port;
2418                         entry.conn_flags = atomic_read(&dest->conn_flags);
2419                         entry.weight = atomic_read(&dest->weight);
2420                         entry.u_threshold = dest->u_threshold;
2421                         entry.l_threshold = dest->l_threshold;
2422                         entry.activeconns = atomic_read(&dest->activeconns);
2423                         entry.inactconns = atomic_read(&dest->inactconns);
2424                         entry.persistconns = atomic_read(&dest->persistconns);
2425                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2426                         if (copy_to_user(&uptr->entrytable[count],
2427                                          &entry, sizeof(entry))) {
2428                                 ret = -EFAULT;
2429                                 break;
2430                         }
2431                         count++;
2432                 }
2433         } else
2434                 ret = -ESRCH;
2435         return ret;
2436 }
2437
2438 static inline void
2439 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2440 {
2441 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2442         struct ip_vs_proto_data *pd;
2443 #endif
2444
2445 #ifdef CONFIG_IP_VS_PROTO_TCP
2446         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2447         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2448         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2449 #endif
2450 #ifdef CONFIG_IP_VS_PROTO_UDP
2451         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2452         u->udp_timeout =
2453                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2454 #endif
2455 }
2456
2457
2458 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2459 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2460 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2461 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2462 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2463 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2464 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2465
2466 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2467         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2468         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2469         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2470         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2471         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2472         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2473         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2474 };
2475
2476 static int
2477 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2478 {
2479         unsigned char arg[128];
2480         int ret = 0;
2481         unsigned int copylen;
2482         struct net *net = sock_net(sk);
2483         struct netns_ipvs *ipvs = net_ipvs(net);
2484
2485         BUG_ON(!net);
2486         if (!capable(CAP_NET_ADMIN))
2487                 return -EPERM;
2488
2489         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2490                 return -EINVAL;
2491
2492         if (*len < get_arglen[GET_CMDID(cmd)]) {
2493                 pr_err("get_ctl: len %u < %u\n",
2494                        *len, get_arglen[GET_CMDID(cmd)]);
2495                 return -EINVAL;
2496         }
2497
2498         copylen = get_arglen[GET_CMDID(cmd)];
2499         if (copylen > 128)
2500                 return -EINVAL;
2501
2502         if (copy_from_user(arg, user, copylen) != 0)
2503                 return -EFAULT;
2504
2505         if (mutex_lock_interruptible(&__ip_vs_mutex))
2506                 return -ERESTARTSYS;
2507
2508         switch (cmd) {
2509         case IP_VS_SO_GET_VERSION:
2510         {
2511                 char buf[64];
2512
2513                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2514                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2515                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2516                         ret = -EFAULT;
2517                         goto out;
2518                 }
2519                 *len = strlen(buf)+1;
2520         }
2521         break;
2522
2523         case IP_VS_SO_GET_INFO:
2524         {
2525                 struct ip_vs_getinfo info;
2526                 info.version = IP_VS_VERSION_CODE;
2527                 info.size = ip_vs_conn_tab_size;
2528                 info.num_services = ipvs->num_services;
2529                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2530                         ret = -EFAULT;
2531         }
2532         break;
2533
2534         case IP_VS_SO_GET_SERVICES:
2535         {
2536                 struct ip_vs_get_services *get;
2537                 int size;
2538
2539                 get = (struct ip_vs_get_services *)arg;
2540                 size = sizeof(*get) +
2541                         sizeof(struct ip_vs_service_entry) * get->num_services;
2542                 if (*len != size) {
2543                         pr_err("length: %u != %u\n", *len, size);
2544                         ret = -EINVAL;
2545                         goto out;
2546                 }
2547                 ret = __ip_vs_get_service_entries(net, get, user);
2548         }
2549         break;
2550
2551         case IP_VS_SO_GET_SERVICE:
2552         {
2553                 struct ip_vs_service_entry *entry;
2554                 struct ip_vs_service *svc;
2555                 union nf_inet_addr addr;
2556
2557                 entry = (struct ip_vs_service_entry *)arg;
2558                 addr.ip = entry->addr;
2559                 if (entry->fwmark)
2560                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2561                 else
2562                         svc = __ip_vs_service_find(net, AF_INET,
2563                                                    entry->protocol, &addr,
2564                                                    entry->port);
2565                 if (svc) {
2566                         ip_vs_copy_service(entry, svc);
2567                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2568                                 ret = -EFAULT;
2569                 } else
2570                         ret = -ESRCH;
2571         }
2572         break;
2573
2574         case IP_VS_SO_GET_DESTS:
2575         {
2576                 struct ip_vs_get_dests *get;
2577                 int size;
2578
2579                 get = (struct ip_vs_get_dests *)arg;
2580                 size = sizeof(*get) +
2581                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2582                 if (*len != size) {
2583                         pr_err("length: %u != %u\n", *len, size);
2584                         ret = -EINVAL;
2585                         goto out;
2586                 }
2587                 ret = __ip_vs_get_dest_entries(net, get, user);
2588         }
2589         break;
2590
2591         case IP_VS_SO_GET_TIMEOUT:
2592         {
2593                 struct ip_vs_timeout_user t;
2594
2595                 __ip_vs_get_timeouts(net, &t);
2596                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2597                         ret = -EFAULT;
2598         }
2599         break;
2600
2601         case IP_VS_SO_GET_DAEMON:
2602         {
2603                 struct ip_vs_daemon_user d[2];
2604
2605                 memset(&d, 0, sizeof(d));
2606                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2607                         d[0].state = IP_VS_STATE_MASTER;
2608                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2609                                 sizeof(d[0].mcast_ifn));
2610                         d[0].syncid = ipvs->master_syncid;
2611                 }
2612                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2613                         d[1].state = IP_VS_STATE_BACKUP;
2614                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2615                                 sizeof(d[1].mcast_ifn));
2616                         d[1].syncid = ipvs->backup_syncid;
2617                 }
2618                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2619                         ret = -EFAULT;
2620         }
2621         break;
2622
2623         default:
2624                 ret = -EINVAL;
2625         }
2626
2627   out:
2628         mutex_unlock(&__ip_vs_mutex);
2629         return ret;
2630 }
2631
2632
2633 static struct nf_sockopt_ops ip_vs_sockopts = {
2634         .pf             = PF_INET,
2635         .set_optmin     = IP_VS_BASE_CTL,
2636         .set_optmax     = IP_VS_SO_SET_MAX+1,
2637         .set            = do_ip_vs_set_ctl,
2638         .get_optmin     = IP_VS_BASE_CTL,
2639         .get_optmax     = IP_VS_SO_GET_MAX+1,
2640         .get            = do_ip_vs_get_ctl,
2641         .owner          = THIS_MODULE,
2642 };
2643
2644 /*
2645  * Generic Netlink interface
2646  */
2647
2648 /* IPVS genetlink family */
2649 static struct genl_family ip_vs_genl_family = {
2650         .id             = GENL_ID_GENERATE,
2651         .hdrsize        = 0,
2652         .name           = IPVS_GENL_NAME,
2653         .version        = IPVS_GENL_VERSION,
2654         .maxattr        = IPVS_CMD_MAX,
2655         .netnsok        = true,         /* Make ipvsadm to work on netns */
2656 };
2657
2658 /* Policy used for first-level command attributes */
2659 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2660         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2661         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2662         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2663         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2664         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2665         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2666 };
2667
2668 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2669 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2670         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2671         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2672                                             .len = IP_VS_IFNAME_MAXLEN },
2673         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2674 };
2675
2676 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2677 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2678         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2679         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2680         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2681                                             .len = sizeof(union nf_inet_addr) },
2682         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2683         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2684         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2685                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2686         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2687                                             .len = IP_VS_PENAME_MAXLEN },
2688         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2689                                             .len = sizeof(struct ip_vs_flags) },
2690         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2691         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2692         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2693 };
2694
2695 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2696 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2697         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2698                                             .len = sizeof(union nf_inet_addr) },
2699         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2700         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2701         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2702         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2703         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2704         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2705         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2706         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2707         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2708 };
2709
2710 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2711                                  struct ip_vs_stats *stats)
2712 {
2713         struct ip_vs_stats_user ustats;
2714         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2715         if (!nl_stats)
2716                 return -EMSGSIZE;
2717
2718         ip_vs_copy_stats(&ustats, stats);
2719
2720         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns);
2721         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts);
2722         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts);
2723         NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes);
2724         NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes);
2725         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, ustats.cps);
2726         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps);
2727         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps);
2728         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps);
2729         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps);
2730
2731         nla_nest_end(skb, nl_stats);
2732
2733         return 0;
2734
2735 nla_put_failure:
2736         nla_nest_cancel(skb, nl_stats);
2737         return -EMSGSIZE;
2738 }
2739
2740 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2741                                    struct ip_vs_service *svc)
2742 {
2743         struct nlattr *nl_service;
2744         struct ip_vs_flags flags = { .flags = svc->flags,
2745                                      .mask = ~0 };
2746
2747         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2748         if (!nl_service)
2749                 return -EMSGSIZE;
2750
2751         NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2752
2753         if (svc->fwmark) {
2754                 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2755         } else {
2756                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2757                 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2758                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2759         }
2760
2761         NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2762         if (svc->pe)
2763                 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
2764         NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2765         NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2766         NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2767
2768         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2769                 goto nla_put_failure;
2770
2771         nla_nest_end(skb, nl_service);
2772
2773         return 0;
2774
2775 nla_put_failure:
2776         nla_nest_cancel(skb, nl_service);
2777         return -EMSGSIZE;
2778 }
2779
2780 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2781                                    struct ip_vs_service *svc,
2782                                    struct netlink_callback *cb)
2783 {
2784         void *hdr;
2785
2786         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2787                           &ip_vs_genl_family, NLM_F_MULTI,
2788                           IPVS_CMD_NEW_SERVICE);
2789         if (!hdr)
2790                 return -EMSGSIZE;
2791
2792         if (ip_vs_genl_fill_service(skb, svc) < 0)
2793                 goto nla_put_failure;
2794
2795         return genlmsg_end(skb, hdr);
2796
2797 nla_put_failure:
2798         genlmsg_cancel(skb, hdr);
2799         return -EMSGSIZE;
2800 }
2801
2802 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2803                                     struct netlink_callback *cb)
2804 {
2805         int idx = 0, i;
2806         int start = cb->args[0];
2807         struct ip_vs_service *svc;
2808         struct net *net = skb_sknet(skb);
2809
2810         mutex_lock(&__ip_vs_mutex);
2811         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2812                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2813                         if (++idx <= start || !net_eq(svc->net, net))
2814                                 continue;
2815                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2816                                 idx--;
2817                                 goto nla_put_failure;
2818                         }
2819                 }
2820         }
2821
2822         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2823                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2824                         if (++idx <= start || !net_eq(svc->net, net))
2825                                 continue;
2826                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2827                                 idx--;
2828                                 goto nla_put_failure;
2829                         }
2830                 }
2831         }
2832
2833 nla_put_failure:
2834         mutex_unlock(&__ip_vs_mutex);
2835         cb->args[0] = idx;
2836
2837         return skb->len;
2838 }
2839
2840 static int ip_vs_genl_parse_service(struct net *net,
2841                                     struct ip_vs_service_user_kern *usvc,
2842                                     struct nlattr *nla, int full_entry,
2843                                     struct ip_vs_service **ret_svc)
2844 {
2845         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2846         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2847         struct ip_vs_service *svc;
2848
2849         /* Parse mandatory identifying service fields first */
2850         if (nla == NULL ||
2851             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2852                 return -EINVAL;
2853
2854         nla_af          = attrs[IPVS_SVC_ATTR_AF];
2855         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
2856         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
2857         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
2858         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
2859
2860         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2861                 return -EINVAL;
2862
2863         memset(usvc, 0, sizeof(*usvc));
2864
2865         usvc->af = nla_get_u16(nla_af);
2866 #ifdef CONFIG_IP_VS_IPV6
2867         if (usvc->af != AF_INET && usvc->af != AF_INET6)
2868 #else
2869         if (usvc->af != AF_INET)
2870 #endif
2871                 return -EAFNOSUPPORT;
2872
2873         if (nla_fwmark) {
2874                 usvc->protocol = IPPROTO_TCP;
2875                 usvc->fwmark = nla_get_u32(nla_fwmark);
2876         } else {
2877                 usvc->protocol = nla_get_u16(nla_protocol);
2878                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2879                 usvc->port = nla_get_u16(nla_port);
2880                 usvc->fwmark = 0;
2881         }
2882
2883         if (usvc->fwmark)
2884                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
2885         else
2886                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
2887                                            &usvc->addr, usvc->port);
2888         *ret_svc = svc;
2889
2890         /* If a full entry was requested, check for the additional fields */
2891         if (full_entry) {
2892                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
2893                               *nla_netmask;
2894                 struct ip_vs_flags flags;
2895
2896                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2897                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
2898                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2899                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2900                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2901
2902                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2903                         return -EINVAL;
2904
2905                 nla_memcpy(&flags, nla_flags, sizeof(flags));
2906
2907                 /* prefill flags from service if it already exists */
2908                 if (svc)
2909                         usvc->flags = svc->flags;
2910
2911                 /* set new flags from userland */
2912                 usvc->flags = (usvc->flags & ~flags.mask) |
2913                               (flags.flags & flags.mask);
2914                 usvc->sched_name = nla_data(nla_sched);
2915                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
2916                 usvc->timeout = nla_get_u32(nla_timeout);
2917                 usvc->netmask = nla_get_u32(nla_netmask);
2918         }
2919
2920         return 0;
2921 }
2922
2923 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
2924                                                      struct nlattr *nla)
2925 {
2926         struct ip_vs_service_user_kern usvc;
2927         struct ip_vs_service *svc;
2928         int ret;
2929
2930         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
2931         return ret ? ERR_PTR(ret) : svc;
2932 }
2933
2934 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
2935 {
2936         struct nlattr *nl_dest;
2937
2938         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
2939         if (!nl_dest)
2940                 return -EMSGSIZE;
2941
2942         NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
2943         NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
2944
2945         NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
2946                     atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
2947         NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
2948         NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
2949         NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
2950         NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
2951                     atomic_read(&dest->activeconns));
2952         NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
2953                     atomic_read(&dest->inactconns));
2954         NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
2955                     atomic_read(&dest->persistconns));
2956
2957         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
2958                 goto nla_put_failure;
2959
2960         nla_nest_end(skb, nl_dest);
2961
2962         return 0;
2963
2964 nla_put_failure:
2965         nla_nest_cancel(skb, nl_dest);
2966         return -EMSGSIZE;
2967 }
2968
2969 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2970                                 struct netlink_callback *cb)
2971 {
2972         void *hdr;
2973
2974         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2975                           &ip_vs_genl_family, NLM_F_MULTI,
2976                           IPVS_CMD_NEW_DEST);
2977         if (!hdr)
2978                 return -EMSGSIZE;
2979
2980         if (ip_vs_genl_fill_dest(skb, dest) < 0)
2981                 goto nla_put_failure;
2982
2983         return genlmsg_end(skb, hdr);
2984
2985 nla_put_failure:
2986         genlmsg_cancel(skb, hdr);
2987         return -EMSGSIZE;
2988 }
2989
2990 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2991                                  struct netlink_callback *cb)
2992 {
2993         int idx = 0;
2994         int start = cb->args[0];
2995         struct ip_vs_service *svc;
2996         struct ip_vs_dest *dest;
2997         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2998         struct net *net = skb_sknet(skb);
2999
3000         mutex_lock(&__ip_vs_mutex);
3001
3002         /* Try to find the service for which to dump destinations */
3003         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3004                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3005                 goto out_err;
3006
3007
3008         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3009         if (IS_ERR(svc) || svc == NULL)
3010                 goto out_err;
3011
3012         /* Dump the destinations */
3013         list_for_each_entry(dest, &svc->destinations, n_list) {
3014                 if (++idx <= start)
3015                         continue;
3016                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3017                         idx--;
3018                         goto nla_put_failure;
3019                 }
3020         }
3021
3022 nla_put_failure:
3023         cb->args[0] = idx;
3024
3025 out_err:
3026         mutex_unlock(&__ip_vs_mutex);
3027
3028         return skb->len;
3029 }
3030
3031 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3032                                  struct nlattr *nla, int full_entry)
3033 {
3034         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3035         struct nlattr *nla_addr, *nla_port;
3036
3037         /* Parse mandatory identifying destination fields first */
3038         if (nla == NULL ||
3039             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3040                 return -EINVAL;
3041
3042         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3043         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3044
3045         if (!(nla_addr && nla_port))
3046                 return -EINVAL;
3047
3048         memset(udest, 0, sizeof(*udest));
3049
3050         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3051         udest->port = nla_get_u16(nla_port);
3052
3053         /* If a full entry was requested, check for the additional fields */
3054         if (full_entry) {
3055                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3056                               *nla_l_thresh;
3057
3058                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3059                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3060                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3061                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3062
3063                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3064                         return -EINVAL;
3065
3066                 udest->conn_flags = nla_get_u32(nla_fwd)
3067                                     & IP_VS_CONN_F_FWD_MASK;
3068                 udest->weight = nla_get_u32(nla_weight);
3069                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3070                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3071         }
3072
3073         return 0;
3074 }
3075
3076 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3077                                   const char *mcast_ifn, __be32 syncid)
3078 {
3079         struct nlattr *nl_daemon;
3080
3081         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3082         if (!nl_daemon)
3083                 return -EMSGSIZE;
3084
3085         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
3086         NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
3087         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
3088
3089         nla_nest_end(skb, nl_daemon);
3090
3091         return 0;
3092
3093 nla_put_failure:
3094         nla_nest_cancel(skb, nl_daemon);
3095         return -EMSGSIZE;
3096 }
3097
3098 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3099                                   const char *mcast_ifn, __be32 syncid,
3100                                   struct netlink_callback *cb)
3101 {
3102         void *hdr;
3103         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
3104                           &ip_vs_genl_family, NLM_F_MULTI,
3105                           IPVS_CMD_NEW_DAEMON);
3106         if (!hdr)
3107                 return -EMSGSIZE;
3108
3109         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3110                 goto nla_put_failure;
3111
3112         return genlmsg_end(skb, hdr);
3113
3114 nla_put_failure:
3115         genlmsg_cancel(skb, hdr);
3116         return -EMSGSIZE;
3117 }
3118
3119 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3120                                    struct netlink_callback *cb)
3121 {
3122         struct net *net = skb_net(skb);
3123         struct netns_ipvs *ipvs = net_ipvs(net);
3124
3125         mutex_lock(&__ip_vs_mutex);
3126         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3127                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3128                                            ipvs->master_mcast_ifn,
3129                                            ipvs->master_syncid, cb) < 0)
3130                         goto nla_put_failure;
3131
3132                 cb->args[0] = 1;
3133         }
3134
3135         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3136                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3137                                            ipvs->backup_mcast_ifn,
3138                                            ipvs->backup_syncid, cb) < 0)
3139                         goto nla_put_failure;
3140
3141                 cb->args[1] = 1;
3142         }
3143
3144 nla_put_failure:
3145         mutex_unlock(&__ip_vs_mutex);
3146
3147         return skb->len;
3148 }
3149
3150 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3151 {
3152         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3153               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3154               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3155                 return -EINVAL;
3156
3157         return start_sync_thread(net,
3158                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3159                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3160                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3161 }
3162
3163 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3164 {
3165         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3166                 return -EINVAL;
3167
3168         return stop_sync_thread(net,
3169                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3170 }
3171
3172 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3173 {
3174         struct ip_vs_timeout_user t;
3175
3176         __ip_vs_get_timeouts(net, &t);
3177
3178         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3179                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3180
3181         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3182                 t.tcp_fin_timeout =
3183                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3184
3185         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3186                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3187
3188         return ip_vs_set_timeout(net, &t);
3189 }
3190
3191 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3192 {
3193         struct ip_vs_service *svc = NULL;
3194         struct ip_vs_service_user_kern usvc;
3195         struct ip_vs_dest_user_kern udest;
3196         int ret = 0, cmd;
3197         int need_full_svc = 0, need_full_dest = 0;
3198         struct net *net;
3199         struct netns_ipvs *ipvs;
3200
3201         net = skb_sknet(skb);
3202         ipvs = net_ipvs(net);
3203         cmd = info->genlhdr->cmd;
3204
3205         mutex_lock(&__ip_vs_mutex);
3206
3207         if (cmd == IPVS_CMD_FLUSH) {
3208                 ret = ip_vs_flush(net);
3209                 goto out;
3210         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3211                 ret = ip_vs_genl_set_config(net, info->attrs);
3212                 goto out;
3213         } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3214                    cmd == IPVS_CMD_DEL_DAEMON) {
3215
3216                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3217
3218                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3219                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3220                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3221                                      ip_vs_daemon_policy)) {
3222                         ret = -EINVAL;
3223                         goto out;
3224                 }
3225
3226                 if (cmd == IPVS_CMD_NEW_DAEMON)
3227                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3228                 else
3229                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3230                 goto out;
3231         } else if (cmd == IPVS_CMD_ZERO &&
3232                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3233                 ret = ip_vs_zero_all(net);
3234                 goto out;
3235         }
3236
3237         /* All following commands require a service argument, so check if we
3238          * received a valid one. We need a full service specification when
3239          * adding / editing a service. Only identifying members otherwise. */
3240         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3241                 need_full_svc = 1;
3242
3243         ret = ip_vs_genl_parse_service(net, &usvc,
3244                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3245                                        need_full_svc, &svc);
3246         if (ret)
3247                 goto out;
3248
3249         /* Unless we're adding a new service, the service must already exist */
3250         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3251                 ret = -ESRCH;
3252                 goto out;
3253         }
3254
3255         /* Destination commands require a valid destination argument. For
3256          * adding / editing a destination, we need a full destination
3257          * specification. */
3258         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3259             cmd == IPVS_CMD_DEL_DEST) {
3260                 if (cmd != IPVS_CMD_DEL_DEST)
3261                         need_full_dest = 1;
3262
3263                 ret = ip_vs_genl_parse_dest(&udest,
3264                                             info->attrs[IPVS_CMD_ATTR_DEST],
3265                                             need_full_dest);
3266                 if (ret)
3267                         goto out;
3268         }
3269
3270         switch (cmd) {
3271         case IPVS_CMD_NEW_SERVICE:
3272                 if (svc == NULL)
3273                         ret = ip_vs_add_service(net, &usvc, &svc);
3274                 else
3275                         ret = -EEXIST;
3276                 break;
3277         case IPVS_CMD_SET_SERVICE:
3278                 ret = ip_vs_edit_service(svc, &usvc);
3279                 break;
3280         case IPVS_CMD_DEL_SERVICE:
3281                 ret = ip_vs_del_service(svc);
3282                 /* do not use svc, it can be freed */
3283                 break;
3284         case IPVS_CMD_NEW_DEST:
3285                 ret = ip_vs_add_dest(svc, &udest);
3286                 break;
3287         case IPVS_CMD_SET_DEST:
3288                 ret = ip_vs_edit_dest(svc, &udest);
3289                 break;
3290         case IPVS_CMD_DEL_DEST:
3291                 ret = ip_vs_del_dest(svc, &udest);
3292                 break;
3293         case IPVS_CMD_ZERO:
3294                 ret = ip_vs_zero_service(svc);
3295                 break;
3296         default:
3297                 ret = -EINVAL;
3298         }
3299
3300 out:
3301         mutex_unlock(&__ip_vs_mutex);
3302
3303         return ret;
3304 }
3305
3306 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3307 {
3308         struct sk_buff *msg;
3309         void *reply;
3310         int ret, cmd, reply_cmd;
3311         struct net *net;
3312         struct netns_ipvs *ipvs;
3313
3314         net = skb_sknet(skb);
3315         ipvs = net_ipvs(net);
3316         cmd = info->genlhdr->cmd;
3317
3318         if (cmd == IPVS_CMD_GET_SERVICE)
3319                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3320         else if (cmd == IPVS_CMD_GET_INFO)
3321                 reply_cmd = IPVS_CMD_SET_INFO;
3322         else if (cmd == IPVS_CMD_GET_CONFIG)
3323                 reply_cmd = IPVS_CMD_SET_CONFIG;
3324         else {
3325                 pr_err("unknown Generic Netlink command\n");
3326                 return -EINVAL;
3327         }
3328
3329         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3330         if (!msg)
3331                 return -ENOMEM;
3332
3333         mutex_lock(&__ip_vs_mutex);
3334
3335         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3336         if (reply == NULL)
3337                 goto nla_put_failure;
3338
3339         switch (cmd) {
3340         case IPVS_CMD_GET_SERVICE:
3341         {
3342                 struct ip_vs_service *svc;
3343
3344                 svc = ip_vs_genl_find_service(net,
3345                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3346                 if (IS_ERR(svc)) {
3347                         ret = PTR_ERR(svc);
3348                         goto out_err;
3349                 } else if (svc) {
3350                         ret = ip_vs_genl_fill_service(msg, svc);
3351                         if (ret)
3352                                 goto nla_put_failure;
3353                 } else {
3354                         ret = -ESRCH;
3355                         goto out_err;
3356                 }
3357
3358                 break;
3359         }
3360
3361         case IPVS_CMD_GET_CONFIG:
3362         {
3363                 struct ip_vs_timeout_user t;
3364
3365                 __ip_vs_get_timeouts(net, &t);
3366 #ifdef CONFIG_IP_VS_PROTO_TCP
3367                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3368                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3369                             t.tcp_fin_timeout);
3370 #endif
3371 #ifdef CONFIG_IP_VS_PROTO_UDP
3372                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
3373 #endif
3374
3375                 break;
3376         }
3377
3378         case IPVS_CMD_GET_INFO:
3379                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
3380                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3381                             ip_vs_conn_tab_size);
3382                 break;
3383         }
3384
3385         genlmsg_end(msg, reply);
3386         ret = genlmsg_reply(msg, info);
3387         goto out;
3388
3389 nla_put_failure:
3390         pr_err("not enough space in Netlink message\n");
3391         ret = -EMSGSIZE;
3392
3393 out_err:
3394         nlmsg_free(msg);
3395 out:
3396         mutex_unlock(&__ip_vs_mutex);
3397
3398         return ret;
3399 }
3400
3401
3402 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3403         {
3404                 .cmd    = IPVS_CMD_NEW_SERVICE,
3405                 .flags  = GENL_ADMIN_PERM,
3406                 .policy = ip_vs_cmd_policy,
3407                 .doit   = ip_vs_genl_set_cmd,
3408         },
3409         {
3410                 .cmd    = IPVS_CMD_SET_SERVICE,
3411                 .flags  = GENL_ADMIN_PERM,
3412                 .policy = ip_vs_cmd_policy,
3413                 .doit   = ip_vs_genl_set_cmd,
3414         },
3415         {
3416                 .cmd    = IPVS_CMD_DEL_SERVICE,
3417                 .flags  = GENL_ADMIN_PERM,
3418                 .policy = ip_vs_cmd_policy,
3419                 .doit   = ip_vs_genl_set_cmd,
3420         },
3421         {
3422                 .cmd    = IPVS_CMD_GET_SERVICE,
3423                 .flags  = GENL_ADMIN_PERM,
3424                 .doit   = ip_vs_genl_get_cmd,
3425                 .dumpit = ip_vs_genl_dump_services,
3426                 .policy = ip_vs_cmd_policy,
3427         },
3428         {
3429                 .cmd    = IPVS_CMD_NEW_DEST,
3430                 .flags  = GENL_ADMIN_PERM,
3431                 .policy = ip_vs_cmd_policy,
3432                 .doit   = ip_vs_genl_set_cmd,
3433         },
3434         {
3435                 .cmd    = IPVS_CMD_SET_DEST,
3436                 .flags  = GENL_ADMIN_PERM,
3437                 .policy = ip_vs_cmd_policy,
3438                 .doit   = ip_vs_genl_set_cmd,
3439         },
3440         {
3441                 .cmd    = IPVS_CMD_DEL_DEST,
3442                 .flags  = GENL_ADMIN_PERM,
3443                 .policy = ip_vs_cmd_policy,
3444                 .doit   = ip_vs_genl_set_cmd,
3445         },
3446         {
3447                 .cmd    = IPVS_CMD_GET_DEST,
3448                 .flags  = GENL_ADMIN_PERM,
3449                 .policy = ip_vs_cmd_policy,
3450                 .dumpit = ip_vs_genl_dump_dests,
3451         },
3452         {
3453                 .cmd    = IPVS_CMD_NEW_DAEMON,
3454                 .flags  = GENL_ADMIN_PERM,
3455                 .policy = ip_vs_cmd_policy,
3456                 .doit   = ip_vs_genl_set_cmd,
3457         },
3458         {
3459                 .cmd    = IPVS_CMD_DEL_DAEMON,
3460                 .flags  = GENL_ADMIN_PERM,
3461                 .policy = ip_vs_cmd_policy,
3462                 .doit   = ip_vs_genl_set_cmd,
3463         },
3464         {
3465                 .cmd    = IPVS_CMD_GET_DAEMON,
3466                 .flags  = GENL_ADMIN_PERM,
3467                 .dumpit = ip_vs_genl_dump_daemons,
3468         },
3469         {
3470                 .cmd    = IPVS_CMD_SET_CONFIG,
3471                 .flags  = GENL_ADMIN_PERM,
3472                 .policy = ip_vs_cmd_policy,
3473                 .doit   = ip_vs_genl_set_cmd,
3474         },
3475         {
3476                 .cmd    = IPVS_CMD_GET_CONFIG,
3477                 .flags  = GENL_ADMIN_PERM,
3478                 .doit   = ip_vs_genl_get_cmd,
3479         },
3480         {
3481                 .cmd    = IPVS_CMD_GET_INFO,
3482                 .flags  = GENL_ADMIN_PERM,
3483                 .doit   = ip_vs_genl_get_cmd,
3484         },
3485         {
3486                 .cmd    = IPVS_CMD_ZERO,
3487                 .flags  = GENL_ADMIN_PERM,
3488                 .policy = ip_vs_cmd_policy,
3489                 .doit   = ip_vs_genl_set_cmd,
3490         },
3491         {
3492                 .cmd    = IPVS_CMD_FLUSH,
3493                 .flags  = GENL_ADMIN_PERM,
3494                 .doit   = ip_vs_genl_set_cmd,
3495         },
3496 };
3497
3498 static int __init ip_vs_genl_register(void)
3499 {
3500         return genl_register_family_with_ops(&ip_vs_genl_family,
3501                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3502 }
3503
3504 static void ip_vs_genl_unregister(void)
3505 {
3506         genl_unregister_family(&ip_vs_genl_family);
3507 }
3508
3509 /* End of Generic Netlink interface definitions */
3510
/*
 * Per-netns init/exit functions.
 */
3514 int __net_init __ip_vs_control_init(struct net *net)
3515 {
3516         int idx;
3517         struct netns_ipvs *ipvs = net_ipvs(net);
3518         struct ctl_table *tbl;
3519
3520         atomic_set(&ipvs->dropentry, 0);
3521         spin_lock_init(&ipvs->dropentry_lock);
3522         spin_lock_init(&ipvs->droppacket_lock);
3523         spin_lock_init(&ipvs->securetcp_lock);
3524         ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
3525
3526         /* Initialize rs_table */
3527         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3528                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3529
3530         INIT_LIST_HEAD(&ipvs->dest_trash);
3531         atomic_set(&ipvs->ftpsvc_counter, 0);
3532         atomic_set(&ipvs->nullsvc_counter, 0);
3533
3534         /* procfs stats */
3535         ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3536         if (!ipvs->tot_stats.cpustats) {
3537                 pr_err("%s() alloc_percpu failed\n", __func__);
3538                 goto err_alloc;
3539         }
3540         spin_lock_init(&ipvs->tot_stats.lock);
3541
3542         proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
3543         proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
3544         proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
3545                              &ip_vs_stats_percpu_fops);
3546
3547         if (!net_eq(net, &init_net)) {
3548                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3549                 if (tbl == NULL)
3550                         goto err_dup;
3551         } else
3552                 tbl = vs_vars;
3553         /* Initialize sysctl defaults */
3554         idx = 0;
3555         ipvs->sysctl_amemthresh = 1024;
3556         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3557         ipvs->sysctl_am_droprate = 10;
3558         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3559         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3560         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3561 #ifdef CONFIG_IP_VS_NFCT
3562         tbl[idx++].data = &ipvs->sysctl_conntrack;
3563 #endif
3564         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3565         ipvs->sysctl_snat_reroute = 1;
3566         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3567         ipvs->sysctl_sync_ver = 1;
3568         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3569         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3570         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3571         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3572         ipvs->sysctl_sync_threshold[0] = 3;
3573         ipvs->sysctl_sync_threshold[1] = 50;
3574         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3575         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3576         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3577
3578
3579 #ifdef CONFIG_SYSCTL
3580         ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
3581                                                      tbl);
3582         if (ipvs->sysctl_hdr == NULL) {
3583                 if (!net_eq(net, &init_net))
3584                         kfree(tbl);
3585                 goto err_dup;
3586         }
3587 #endif
3588         ip_vs_new_estimator(net, &ipvs->tot_stats);
3589         ipvs->sysctl_tbl = tbl;
3590         /* Schedule defense work */
3591         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3592         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3593         return 0;
3594
3595 err_dup:
3596         free_percpu(ipvs->tot_stats.cpustats);
3597 err_alloc:
3598         return -ENOMEM;
3599 }
3600
3601 static void __net_exit __ip_vs_control_cleanup(struct net *net)
3602 {
3603         struct netns_ipvs *ipvs = net_ipvs(net);
3604
3605         ip_vs_trash_cleanup(net);
3606         ip_vs_kill_estimator(net, &ipvs->tot_stats);
3607         cancel_delayed_work_sync(&ipvs->defense_work);
3608         cancel_work_sync(&ipvs->defense_work.work);
3609 #ifdef CONFIG_SYSCTL
3610         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3611 #endif
3612         proc_net_remove(net, "ip_vs_stats_percpu");
3613         proc_net_remove(net, "ip_vs_stats");
3614         proc_net_remove(net, "ip_vs");
3615         free_percpu(ipvs->tot_stats.cpustats);
3616 }
3617
3618 static struct pernet_operations ipvs_control_ops = {
3619         .init = __ip_vs_control_init,
3620         .exit = __ip_vs_control_cleanup,
3621 };
3622
3623 int __init ip_vs_control_init(void)
3624 {
3625         int idx;
3626         int ret;
3627
3628         EnterFunction(2);
3629
3630         /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3631         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
3632                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3633                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3634         }
3635
3636         ret = register_pernet_subsys(&ipvs_control_ops);
3637         if (ret) {
3638                 pr_err("cannot register namespace.\n");
3639                 goto err;
3640         }
3641
3642         smp_wmb();      /* Do we really need it now ? */
3643
3644         ret = nf_register_sockopt(&ip_vs_sockopts);
3645         if (ret) {
3646                 pr_err("cannot register sockopt.\n");
3647                 goto err_net;
3648         }
3649
3650         ret = ip_vs_genl_register();
3651         if (ret) {
3652                 pr_err("cannot register Generic Netlink interface.\n");
3653                 nf_unregister_sockopt(&ip_vs_sockopts);
3654                 goto err_net;
3655         }
3656
3657         LeaveFunction(2);
3658         return 0;
3659
3660 err_net:
3661         unregister_pernet_subsys(&ipvs_control_ops);
3662 err:
3663         return ret;
3664 }
3665
3666
3667 void ip_vs_control_cleanup(void)
3668 {
3669         EnterFunction(2);
3670         unregister_pernet_subsys(&ipvs_control_ops);
3671         ip_vs_genl_unregister();
3672         nf_unregister_sockopt(&ip_vs_sockopts);
3673         LeaveFunction(2);
3674 }