a2a67ad7e094bb146e0eaad560e20b3ff177470c
[pandora-kernel.git] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
/* Serialises the IPVS sockopt handlers; a mutex because [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);

/* Reader/writer lock protecting the virtual service tables. */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
/* Current IPVS debug verbosity; zero-initialised like any static object. */
static int sysctl_ip_vs_debug_level;

/* Accessor handing out the current debug level to other IPVS code. */
int ip_vs_get_debug_level(void)
{
        return sysctl_ip_vs_debug_level;
}
70 #endif
71
72 #ifdef CONFIG_IP_VS_IPV6
73 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
74 static int __ip_vs_addr_is_local_v6(struct net *net,
75                                     const struct in6_addr *addr)
76 {
77         struct rt6_info *rt;
78         struct flowi fl = {
79                 .oif = 0,
80                 .fl6_dst = *addr,
81                 .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
82         };
83
84         rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
85         if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
86                         return 1;
87
88         return 0;
89 }
90 #endif
91 /*
92  *      update_defense_level is called from keventd and from sysctl,
93  *      so it needs to protect itself from softirqs
94  */
95 static void update_defense_level(struct netns_ipvs *ipvs)
96 {
97         struct sysinfo i;
98         static int old_secure_tcp = 0;
99         int availmem;
100         int nomem;
101         int to_change = -1;
102
103         /* we only count free and buffered memory (in pages) */
104         si_meminfo(&i);
105         availmem = i.freeram + i.bufferram;
106         /* however in linux 2.5 the i.bufferram is total page cache size,
107            we need adjust it */
108         /* si_swapinfo(&i); */
109         /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111         nomem = (availmem < ipvs->sysctl_amemthresh);
112
113         local_bh_disable();
114
115         /* drop_entry */
116         spin_lock(&ipvs->dropentry_lock);
117         switch (ipvs->sysctl_drop_entry) {
118         case 0:
119                 atomic_set(&ipvs->dropentry, 0);
120                 break;
121         case 1:
122                 if (nomem) {
123                         atomic_set(&ipvs->dropentry, 1);
124                         ipvs->sysctl_drop_entry = 2;
125                 } else {
126                         atomic_set(&ipvs->dropentry, 0);
127                 }
128                 break;
129         case 2:
130                 if (nomem) {
131                         atomic_set(&ipvs->dropentry, 1);
132                 } else {
133                         atomic_set(&ipvs->dropentry, 0);
134                         ipvs->sysctl_drop_entry = 1;
135                 };
136                 break;
137         case 3:
138                 atomic_set(&ipvs->dropentry, 1);
139                 break;
140         }
141         spin_unlock(&ipvs->dropentry_lock);
142
143         /* drop_packet */
144         spin_lock(&ipvs->droppacket_lock);
145         switch (ipvs->sysctl_drop_packet) {
146         case 0:
147                 ipvs->drop_rate = 0;
148                 break;
149         case 1:
150                 if (nomem) {
151                         ipvs->drop_rate = ipvs->drop_counter
152                                 = ipvs->sysctl_amemthresh /
153                                 (ipvs->sysctl_amemthresh-availmem);
154                         ipvs->sysctl_drop_packet = 2;
155                 } else {
156                         ipvs->drop_rate = 0;
157                 }
158                 break;
159         case 2:
160                 if (nomem) {
161                         ipvs->drop_rate = ipvs->drop_counter
162                                 = ipvs->sysctl_amemthresh /
163                                 (ipvs->sysctl_amemthresh-availmem);
164                 } else {
165                         ipvs->drop_rate = 0;
166                         ipvs->sysctl_drop_packet = 1;
167                 }
168                 break;
169         case 3:
170                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
171                 break;
172         }
173         spin_unlock(&ipvs->droppacket_lock);
174
175         /* secure_tcp */
176         spin_lock(&ipvs->securetcp_lock);
177         switch (ipvs->sysctl_secure_tcp) {
178         case 0:
179                 if (old_secure_tcp >= 2)
180                         to_change = 0;
181                 break;
182         case 1:
183                 if (nomem) {
184                         if (old_secure_tcp < 2)
185                                 to_change = 1;
186                         ipvs->sysctl_secure_tcp = 2;
187                 } else {
188                         if (old_secure_tcp >= 2)
189                                 to_change = 0;
190                 }
191                 break;
192         case 2:
193                 if (nomem) {
194                         if (old_secure_tcp < 2)
195                                 to_change = 1;
196                 } else {
197                         if (old_secure_tcp >= 2)
198                                 to_change = 0;
199                         ipvs->sysctl_secure_tcp = 1;
200                 }
201                 break;
202         case 3:
203                 if (old_secure_tcp < 2)
204                         to_change = 1;
205                 break;
206         }
207         old_secure_tcp = ipvs->sysctl_secure_tcp;
208         if (to_change >= 0)
209                 ip_vs_protocol_timeout_change(ipvs,
210                                               ipvs->sysctl_secure_tcp > 1);
211         spin_unlock(&ipvs->securetcp_lock);
212
213         local_bh_enable();
214 }
215
216
217 /*
218  *      Timer for checking the defense
219  */
220 #define DEFENSE_TIMER_PERIOD    1*HZ
221
222 static void defense_work_handler(struct work_struct *work)
223 {
224         struct netns_ipvs *ipvs =
225                 container_of(work, struct netns_ipvs, defense_work.work);
226
227         update_defense_level(ipvs);
228         if (atomic_read(&ipvs->dropentry))
229                 ip_vs_random_dropentry(ipvs->net);
230         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
231 }
232
233 int
234 ip_vs_use_count_inc(void)
235 {
236         return try_module_get(THIS_MODULE);
237 }
238
239 void
240 ip_vs_use_count_dec(void)
241 {
242         module_put(THIS_MODULE);
243 }
244
245
246 /*
247  *      Hash table: for virtual service lookups
248  */
249 #define IP_VS_SVC_TAB_BITS 8
250 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
251 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
252
253 /* the service table hashed by <protocol, addr, port> */
254 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
255 /* the service table hashed by fwmark */
256 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
257
258
259 /*
260  *      Returns hash value for virtual service
261  */
262 static inline unsigned
263 ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
264                   const union nf_inet_addr *addr, __be16 port)
265 {
266         register unsigned porth = ntohs(port);
267         __be32 addr_fold = addr->ip;
268
269 #ifdef CONFIG_IP_VS_IPV6
270         if (af == AF_INET6)
271                 addr_fold = addr->ip6[0]^addr->ip6[1]^
272                             addr->ip6[2]^addr->ip6[3];
273 #endif
274         addr_fold ^= ((size_t)net>>8);
275
276         return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
277                 & IP_VS_SVC_TAB_MASK;
278 }
279
280 /*
281  *      Returns hash value of fwmark for virtual service lookup
282  */
283 static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
284 {
285         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
286 }
287
288 /*
289  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
290  *      or in the ip_vs_svc_fwm_table by fwmark.
291  *      Should be called with locked tables.
292  */
293 static int ip_vs_svc_hash(struct ip_vs_service *svc)
294 {
295         unsigned hash;
296
297         if (svc->flags & IP_VS_SVC_F_HASHED) {
298                 pr_err("%s(): request for already hashed, called from %pF\n",
299                        __func__, __builtin_return_address(0));
300                 return 0;
301         }
302
303         if (svc->fwmark == 0) {
304                 /*
305                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
306                  */
307                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
308                                          &svc->addr, svc->port);
309                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
310         } else {
311                 /*
312                  *  Hash it by fwmark in svc_fwm_table
313                  */
314                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
315                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
316         }
317
318         svc->flags |= IP_VS_SVC_F_HASHED;
319         /* increase its refcnt because it is referenced by the svc table */
320         atomic_inc(&svc->refcnt);
321         return 1;
322 }
323
324
325 /*
326  *      Unhashes a service from svc_table / svc_fwm_table.
327  *      Should be called with locked tables.
328  */
329 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
330 {
331         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
332                 pr_err("%s(): request for unhash flagged, called from %pF\n",
333                        __func__, __builtin_return_address(0));
334                 return 0;
335         }
336
337         if (svc->fwmark == 0) {
338                 /* Remove it from the svc_table table */
339                 list_del(&svc->s_list);
340         } else {
341                 /* Remove it from the svc_fwm_table table */
342                 list_del(&svc->f_list);
343         }
344
345         svc->flags &= ~IP_VS_SVC_F_HASHED;
346         atomic_dec(&svc->refcnt);
347         return 1;
348 }
349
350
351 /*
352  *      Get service by {netns, proto,addr,port} in the service table.
353  */
354 static inline struct ip_vs_service *
355 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
356                      const union nf_inet_addr *vaddr, __be16 vport)
357 {
358         unsigned hash;
359         struct ip_vs_service *svc;
360
361         /* Check for "full" addressed entries */
362         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
363
364         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
365                 if ((svc->af == af)
366                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
367                     && (svc->port == vport)
368                     && (svc->protocol == protocol)
369                     && net_eq(svc->net, net)) {
370                         /* HIT */
371                         return svc;
372                 }
373         }
374
375         return NULL;
376 }
377
378
379 /*
380  *      Get service by {fwmark} in the service table.
381  */
382 static inline struct ip_vs_service *
383 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
384 {
385         unsigned hash;
386         struct ip_vs_service *svc;
387
388         /* Check for fwmark addressed entries */
389         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
390
391         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
392                 if (svc->fwmark == fwmark && svc->af == af
393                     && net_eq(svc->net, net)) {
394                         /* HIT */
395                         return svc;
396                 }
397         }
398
399         return NULL;
400 }
401
402 struct ip_vs_service *
403 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
404                   const union nf_inet_addr *vaddr, __be16 vport)
405 {
406         struct ip_vs_service *svc;
407         struct netns_ipvs *ipvs = net_ipvs(net);
408
409         read_lock(&__ip_vs_svc_lock);
410
411         /*
412          *      Check the table hashed by fwmark first
413          */
414         if (fwmark) {
415                 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
416                 if (svc)
417                         goto out;
418         }
419
420         /*
421          *      Check the table hashed by <protocol,addr,port>
422          *      for "full" addressed entries
423          */
424         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
425
426         if (svc == NULL
427             && protocol == IPPROTO_TCP
428             && atomic_read(&ipvs->ftpsvc_counter)
429             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
430                 /*
431                  * Check if ftp service entry exists, the packet
432                  * might belong to FTP data connections.
433                  */
434                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
435         }
436
437         if (svc == NULL
438             && atomic_read(&ipvs->nullsvc_counter)) {
439                 /*
440                  * Check if the catch-all port (port zero) exists
441                  */
442                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
443         }
444
445   out:
446         if (svc)
447                 atomic_inc(&svc->usecnt);
448         read_unlock(&__ip_vs_svc_lock);
449
450         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
451                       fwmark, ip_vs_proto_name(protocol),
452                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
453                       svc ? "hit" : "not hit");
454
455         return svc;
456 }
457
458
459 static inline void
460 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
461 {
462         atomic_inc(&svc->refcnt);
463         dest->svc = svc;
464 }
465
466 static void
467 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
468 {
469         struct ip_vs_service *svc = dest->svc;
470
471         dest->svc = NULL;
472         if (atomic_dec_and_test(&svc->refcnt)) {
473                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
474                               svc->fwmark,
475                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
476                               ntohs(svc->port), atomic_read(&svc->usecnt));
477                 free_percpu(svc->stats.cpustats);
478                 kfree(svc);
479         }
480 }
481
482
483 /*
484  *      Returns hash value for real service
485  */
486 static inline unsigned ip_vs_rs_hashkey(int af,
487                                             const union nf_inet_addr *addr,
488                                             __be16 port)
489 {
490         register unsigned porth = ntohs(port);
491         __be32 addr_fold = addr->ip;
492
493 #ifdef CONFIG_IP_VS_IPV6
494         if (af == AF_INET6)
495                 addr_fold = addr->ip6[0]^addr->ip6[1]^
496                             addr->ip6[2]^addr->ip6[3];
497 #endif
498
499         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
500                 & IP_VS_RTAB_MASK;
501 }
502
503 /*
504  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
505  *      should be called with locked tables.
506  */
507 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
508 {
509         unsigned hash;
510
511         if (!list_empty(&dest->d_list)) {
512                 return 0;
513         }
514
515         /*
516          *      Hash by proto,addr,port,
517          *      which are the parameters of the real service.
518          */
519         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
520
521         list_add(&dest->d_list, &ipvs->rs_table[hash]);
522
523         return 1;
524 }
525
526 /*
527  *      UNhashes ip_vs_dest from rs_table.
528  *      should be called with locked tables.
529  */
530 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
531 {
532         /*
533          * Remove it from the rs_table table.
534          */
535         if (!list_empty(&dest->d_list)) {
536                 list_del(&dest->d_list);
537                 INIT_LIST_HEAD(&dest->d_list);
538         }
539
540         return 1;
541 }
542
543 /*
544  *      Lookup real service by <proto,addr,port> in the real service table.
545  */
546 struct ip_vs_dest *
547 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
548                           const union nf_inet_addr *daddr,
549                           __be16 dport)
550 {
551         struct netns_ipvs *ipvs = net_ipvs(net);
552         unsigned hash;
553         struct ip_vs_dest *dest;
554
555         /*
556          *      Check for "full" addressed entries
557          *      Return the first found entry
558          */
559         hash = ip_vs_rs_hashkey(af, daddr, dport);
560
561         read_lock(&ipvs->rs_lock);
562         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
563                 if ((dest->af == af)
564                     && ip_vs_addr_equal(af, &dest->addr, daddr)
565                     && (dest->port == dport)
566                     && ((dest->protocol == protocol) ||
567                         dest->vfwmark)) {
568                         /* HIT */
569                         read_unlock(&ipvs->rs_lock);
570                         return dest;
571                 }
572         }
573         read_unlock(&ipvs->rs_lock);
574
575         return NULL;
576 }
577
578 /*
579  *      Lookup destination by {addr,port} in the given service
580  */
581 static struct ip_vs_dest *
582 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
583                   __be16 dport)
584 {
585         struct ip_vs_dest *dest;
586
587         /*
588          * Find the destination for the given service
589          */
590         list_for_each_entry(dest, &svc->destinations, n_list) {
591                 if ((dest->af == svc->af)
592                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
593                     && (dest->port == dport)) {
594                         /* HIT */
595                         return dest;
596                 }
597         }
598
599         return NULL;
600 }
601
602 /*
603  * Find destination by {daddr,dport,vaddr,protocol}
604  * Cretaed to be used in ip_vs_process_message() in
605  * the backup synchronization daemon. It finds the
606  * destination to be bound to the received connection
607  * on the backup.
608  *
609  * ip_vs_lookup_real_service() looked promissing, but
610  * seems not working as expected.
611  */
612 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
613                                    const union nf_inet_addr *daddr,
614                                    __be16 dport,
615                                    const union nf_inet_addr *vaddr,
616                                    __be16 vport, __u16 protocol, __u32 fwmark)
617 {
618         struct ip_vs_dest *dest;
619         struct ip_vs_service *svc;
620
621         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
622         if (!svc)
623                 return NULL;
624         dest = ip_vs_lookup_dest(svc, daddr, dport);
625         if (dest)
626                 atomic_inc(&dest->refcnt);
627         ip_vs_service_put(svc);
628         return dest;
629 }
630
631 /*
632  *  Lookup dest by {svc,addr,port} in the destination trash.
633  *  The destination trash is used to hold the destinations that are removed
634  *  from the service table but are still referenced by some conn entries.
635  *  The reason to add the destination trash is when the dest is temporary
636  *  down (either by administrator or by monitor program), the dest can be
637  *  picked back from the trash, the remaining connections to the dest can
638  *  continue, and the counting information of the dest is also useful for
639  *  scheduling.
640  */
641 static struct ip_vs_dest *
642 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
643                      __be16 dport)
644 {
645         struct ip_vs_dest *dest, *nxt;
646         struct netns_ipvs *ipvs = net_ipvs(svc->net);
647
648         /*
649          * Find the destination in trash
650          */
651         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
652                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
653                               "dest->refcnt=%d\n",
654                               dest->vfwmark,
655                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
656                               ntohs(dest->port),
657                               atomic_read(&dest->refcnt));
658                 if (dest->af == svc->af &&
659                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
660                     dest->port == dport &&
661                     dest->vfwmark == svc->fwmark &&
662                     dest->protocol == svc->protocol &&
663                     (svc->fwmark ||
664                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
665                       dest->vport == svc->port))) {
666                         /* HIT */
667                         return dest;
668                 }
669
670                 /*
671                  * Try to purge the destination from trash if not referenced
672                  */
673                 if (atomic_read(&dest->refcnt) == 1) {
674                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
675                                       "from trash\n",
676                                       dest->vfwmark,
677                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
678                                       ntohs(dest->port));
679                         list_del(&dest->n_list);
680                         ip_vs_dst_reset(dest);
681                         __ip_vs_unbind_svc(dest);
682                         free_percpu(dest->stats.cpustats);
683                         kfree(dest);
684                 }
685         }
686
687         return NULL;
688 }
689
690
691 /*
692  *  Clean up all the destinations in the trash
693  *  Called by the ip_vs_control_cleanup()
694  *
695  *  When the ip_vs_control_clearup is activated by ipvs module exit,
696  *  the service tables must have been flushed and all the connections
697  *  are expired, and the refcnt of each destination in the trash must
698  *  be 1, so we simply release them here.
699  */
700 static void ip_vs_trash_cleanup(struct net *net)
701 {
702         struct ip_vs_dest *dest, *nxt;
703         struct netns_ipvs *ipvs = net_ipvs(net);
704
705         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
706                 list_del(&dest->n_list);
707                 ip_vs_dst_reset(dest);
708                 __ip_vs_unbind_svc(dest);
709                 free_percpu(dest->stats.cpustats);
710                 kfree(dest);
711         }
712 }
713
714
715 static void
716 ip_vs_zero_stats(struct ip_vs_stats *stats)
717 {
718         spin_lock_bh(&stats->lock);
719
720         memset(&stats->ustats, 0, sizeof(stats->ustats));
721         ip_vs_zero_estimator(stats);
722
723         spin_unlock_bh(&stats->lock);
724 }
725
726 /*
727  *      Update a destination in the given service
728  */
729 static void
730 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
731                     struct ip_vs_dest_user_kern *udest, int add)
732 {
733         struct netns_ipvs *ipvs = net_ipvs(svc->net);
734         int conn_flags;
735
736         /* set the weight and the flags */
737         atomic_set(&dest->weight, udest->weight);
738         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
739         conn_flags |= IP_VS_CONN_F_INACTIVE;
740
741         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
742         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
743                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
744         } else {
745                 /*
746                  *    Put the real service in rs_table if not present.
747                  *    For now only for NAT!
748                  */
749                 write_lock_bh(&ipvs->rs_lock);
750                 ip_vs_rs_hash(ipvs, dest);
751                 write_unlock_bh(&ipvs->rs_lock);
752         }
753         atomic_set(&dest->conn_flags, conn_flags);
754
755         /* bind the service */
756         if (!dest->svc) {
757                 __ip_vs_bind_svc(dest, svc);
758         } else {
759                 if (dest->svc != svc) {
760                         __ip_vs_unbind_svc(dest);
761                         ip_vs_zero_stats(&dest->stats);
762                         __ip_vs_bind_svc(dest, svc);
763                 }
764         }
765
766         /* set the dest status flags */
767         dest->flags |= IP_VS_DEST_F_AVAILABLE;
768
769         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
770                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
771         dest->u_threshold = udest->u_threshold;
772         dest->l_threshold = udest->l_threshold;
773
774         spin_lock(&dest->dst_lock);
775         ip_vs_dst_reset(dest);
776         spin_unlock(&dest->dst_lock);
777
778         if (add)
779                 ip_vs_new_estimator(svc->net, &dest->stats);
780
781         write_lock_bh(&__ip_vs_svc_lock);
782
783         /* Wait until all other svc users go away */
784         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
785
786         if (add) {
787                 list_add(&dest->n_list, &svc->destinations);
788                 svc->num_dests++;
789         }
790
791         /* call the update_service, because server weight may be changed */
792         if (svc->scheduler->update_service)
793                 svc->scheduler->update_service(svc);
794
795         write_unlock_bh(&__ip_vs_svc_lock);
796 }
797
798
799 /*
800  *      Create a destination for the given service
801  */
802 static int
803 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
804                struct ip_vs_dest **dest_p)
805 {
806         struct ip_vs_dest *dest;
807         unsigned atype;
808
809         EnterFunction(2);
810
811 #ifdef CONFIG_IP_VS_IPV6
812         if (svc->af == AF_INET6) {
813                 atype = ipv6_addr_type(&udest->addr.in6);
814                 if ((!(atype & IPV6_ADDR_UNICAST) ||
815                         atype & IPV6_ADDR_LINKLOCAL) &&
816                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
817                         return -EINVAL;
818         } else
819 #endif
820         {
821                 atype = inet_addr_type(svc->net, udest->addr.ip);
822                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
823                         return -EINVAL;
824         }
825
826         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
827         if (dest == NULL) {
828                 pr_err("%s(): no memory.\n", __func__);
829                 return -ENOMEM;
830         }
831         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
832         if (!dest->stats.cpustats) {
833                 pr_err("%s() alloc_percpu failed\n", __func__);
834                 goto err_alloc;
835         }
836
837         dest->af = svc->af;
838         dest->protocol = svc->protocol;
839         dest->vaddr = svc->addr;
840         dest->vport = svc->port;
841         dest->vfwmark = svc->fwmark;
842         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
843         dest->port = udest->port;
844
845         atomic_set(&dest->activeconns, 0);
846         atomic_set(&dest->inactconns, 0);
847         atomic_set(&dest->persistconns, 0);
848         atomic_set(&dest->refcnt, 1);
849
850         INIT_LIST_HEAD(&dest->d_list);
851         spin_lock_init(&dest->dst_lock);
852         spin_lock_init(&dest->stats.lock);
853         __ip_vs_update_dest(svc, dest, udest, 1);
854
855         *dest_p = dest;
856
857         LeaveFunction(2);
858         return 0;
859
860 err_alloc:
861         kfree(dest);
862         return -ENOMEM;
863 }
864
865
866 /*
867  *      Add a destination into an existing service
868  */
869 static int
870 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
871 {
872         struct ip_vs_dest *dest;
873         union nf_inet_addr daddr;
874         __be16 dport = udest->port;
875         int ret;
876
877         EnterFunction(2);
878
879         if (udest->weight < 0) {
880                 pr_err("%s(): server weight less than zero\n", __func__);
881                 return -ERANGE;
882         }
883
884         if (udest->l_threshold > udest->u_threshold) {
885                 pr_err("%s(): lower threshold is higher than upper threshold\n",
886                         __func__);
887                 return -ERANGE;
888         }
889
890         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
891
892         /*
893          * Check if the dest already exists in the list
894          */
895         dest = ip_vs_lookup_dest(svc, &daddr, dport);
896
897         if (dest != NULL) {
898                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
899                 return -EEXIST;
900         }
901
902         /*
903          * Check if the dest already exists in the trash and
904          * is from the same service
905          */
906         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
907
908         if (dest != NULL) {
909                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
910                               "dest->refcnt=%d, service %u/%s:%u\n",
911                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
912                               atomic_read(&dest->refcnt),
913                               dest->vfwmark,
914                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
915                               ntohs(dest->vport));
916
917                 /*
918                  * Get the destination from the trash
919                  */
920                 list_del(&dest->n_list);
921
922                 __ip_vs_update_dest(svc, dest, udest, 1);
923                 ret = 0;
924         } else {
925                 /*
926                  * Allocate and initialize the dest structure
927                  */
928                 ret = ip_vs_new_dest(svc, udest, &dest);
929         }
930         LeaveFunction(2);
931
932         return ret;
933 }
934
935
936 /*
937  *      Edit a destination in the given service
938  */
939 static int
940 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
941 {
942         struct ip_vs_dest *dest;
943         union nf_inet_addr daddr;
944         __be16 dport = udest->port;
945
946         EnterFunction(2);
947
948         if (udest->weight < 0) {
949                 pr_err("%s(): server weight less than zero\n", __func__);
950                 return -ERANGE;
951         }
952
953         if (udest->l_threshold > udest->u_threshold) {
954                 pr_err("%s(): lower threshold is higher than upper threshold\n",
955                         __func__);
956                 return -ERANGE;
957         }
958
959         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
960
961         /*
962          *  Lookup the destination list
963          */
964         dest = ip_vs_lookup_dest(svc, &daddr, dport);
965
966         if (dest == NULL) {
967                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
968                 return -ENOENT;
969         }
970
971         __ip_vs_update_dest(svc, dest, udest, 0);
972         LeaveFunction(2);
973
974         return 0;
975 }
976
977
978 /*
979  *      Delete a destination (must be already unlinked from the service)
980  */
981 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
982 {
983         struct netns_ipvs *ipvs = net_ipvs(net);
984
985         ip_vs_kill_estimator(net, &dest->stats);
986
987         /*
988          *  Remove it from the d-linked list with the real services.
989          */
990         write_lock_bh(&ipvs->rs_lock);
991         ip_vs_rs_unhash(dest);
992         write_unlock_bh(&ipvs->rs_lock);
993
994         /*
995          *  Decrease the refcnt of the dest, and free the dest
996          *  if nobody refers to it (refcnt=0). Otherwise, throw
997          *  the destination into the trash.
998          */
999         if (atomic_dec_and_test(&dest->refcnt)) {
1000                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1001                               dest->vfwmark,
1002                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1003                               ntohs(dest->port));
1004                 ip_vs_dst_reset(dest);
1005                 /* simply decrease svc->refcnt here, let the caller check
1006                    and release the service if nobody refers to it.
1007                    Only user context can release destination and service,
1008                    and only one user context can update virtual service at a
1009                    time, so the operation here is OK */
1010                 atomic_dec(&dest->svc->refcnt);
1011                 free_percpu(dest->stats.cpustats);
1012                 kfree(dest);
1013         } else {
1014                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1015                               "dest->refcnt=%d\n",
1016                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1017                               ntohs(dest->port),
1018                               atomic_read(&dest->refcnt));
1019                 list_add(&dest->n_list, &ipvs->dest_trash);
1020                 atomic_inc(&dest->refcnt);
1021         }
1022 }
1023
1024
1025 /*
1026  *      Unlink a destination from the given service
1027  */
1028 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1029                                 struct ip_vs_dest *dest,
1030                                 int svcupd)
1031 {
1032         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1033
1034         /*
1035          *  Remove it from the d-linked destination list.
1036          */
1037         list_del(&dest->n_list);
1038         svc->num_dests--;
1039
1040         /*
1041          *  Call the update_service function of its scheduler
1042          */
1043         if (svcupd && svc->scheduler->update_service)
1044                         svc->scheduler->update_service(svc);
1045 }
1046
1047
1048 /*
1049  *      Delete a destination server in the given service
1050  */
1051 static int
1052 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1053 {
1054         struct ip_vs_dest *dest;
1055         __be16 dport = udest->port;
1056
1057         EnterFunction(2);
1058
1059         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1060
1061         if (dest == NULL) {
1062                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1063                 return -ENOENT;
1064         }
1065
1066         write_lock_bh(&__ip_vs_svc_lock);
1067
1068         /*
1069          *      Wait until all other svc users go away.
1070          */
1071         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1072
1073         /*
1074          *      Unlink dest from the service
1075          */
1076         __ip_vs_unlink_dest(svc, dest, 1);
1077
1078         write_unlock_bh(&__ip_vs_svc_lock);
1079
1080         /*
1081          *      Delete the destination
1082          */
1083         __ip_vs_del_dest(svc->net, dest);
1084
1085         LeaveFunction(2);
1086
1087         return 0;
1088 }
1089
1090
1091 /*
1092  *      Add a service into the service hash table
1093  */
1094 static int
1095 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1096                   struct ip_vs_service **svc_p)
1097 {
1098         int ret = 0;
1099         struct ip_vs_scheduler *sched = NULL;
1100         struct ip_vs_pe *pe = NULL;
1101         struct ip_vs_service *svc = NULL;
1102         struct netns_ipvs *ipvs = net_ipvs(net);
1103
1104         /* increase the module use count */
1105         ip_vs_use_count_inc();
1106
1107         /* Lookup the scheduler by 'u->sched_name' */
1108         sched = ip_vs_scheduler_get(u->sched_name);
1109         if (sched == NULL) {
1110                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1111                 ret = -ENOENT;
1112                 goto out_err;
1113         }
1114
1115         if (u->pe_name && *u->pe_name) {
1116                 pe = ip_vs_pe_getbyname(u->pe_name);
1117                 if (pe == NULL) {
1118                         pr_info("persistence engine module ip_vs_pe_%s "
1119                                 "not found\n", u->pe_name);
1120                         ret = -ENOENT;
1121                         goto out_err;
1122                 }
1123         }
1124
1125 #ifdef CONFIG_IP_VS_IPV6
1126         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1127                 ret = -EINVAL;
1128                 goto out_err;
1129         }
1130 #endif
1131
1132         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1133         if (svc == NULL) {
1134                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1135                 ret = -ENOMEM;
1136                 goto out_err;
1137         }
1138         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1139         if (!svc->stats.cpustats) {
1140                 pr_err("%s() alloc_percpu failed\n", __func__);
1141                 goto out_err;
1142         }
1143
1144         /* I'm the first user of the service */
1145         atomic_set(&svc->usecnt, 0);
1146         atomic_set(&svc->refcnt, 0);
1147
1148         svc->af = u->af;
1149         svc->protocol = u->protocol;
1150         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1151         svc->port = u->port;
1152         svc->fwmark = u->fwmark;
1153         svc->flags = u->flags;
1154         svc->timeout = u->timeout * HZ;
1155         svc->netmask = u->netmask;
1156         svc->net = net;
1157
1158         INIT_LIST_HEAD(&svc->destinations);
1159         rwlock_init(&svc->sched_lock);
1160         spin_lock_init(&svc->stats.lock);
1161
1162         /* Bind the scheduler */
1163         ret = ip_vs_bind_scheduler(svc, sched);
1164         if (ret)
1165                 goto out_err;
1166         sched = NULL;
1167
1168         /* Bind the ct retriever */
1169         ip_vs_bind_pe(svc, pe);
1170         pe = NULL;
1171
1172         /* Update the virtual service counters */
1173         if (svc->port == FTPPORT)
1174                 atomic_inc(&ipvs->ftpsvc_counter);
1175         else if (svc->port == 0)
1176                 atomic_inc(&ipvs->nullsvc_counter);
1177
1178         ip_vs_new_estimator(net, &svc->stats);
1179
1180         /* Count only IPv4 services for old get/setsockopt interface */
1181         if (svc->af == AF_INET)
1182                 ipvs->num_services++;
1183
1184         /* Hash the service into the service table */
1185         write_lock_bh(&__ip_vs_svc_lock);
1186         ip_vs_svc_hash(svc);
1187         write_unlock_bh(&__ip_vs_svc_lock);
1188
1189         *svc_p = svc;
1190         return 0;
1191
1192
1193  out_err:
1194         if (svc != NULL) {
1195                 ip_vs_unbind_scheduler(svc);
1196                 if (svc->inc) {
1197                         local_bh_disable();
1198                         ip_vs_app_inc_put(svc->inc);
1199                         local_bh_enable();
1200                 }
1201                 if (svc->stats.cpustats)
1202                         free_percpu(svc->stats.cpustats);
1203                 kfree(svc);
1204         }
1205         ip_vs_scheduler_put(sched);
1206         ip_vs_pe_put(pe);
1207
1208         /* decrease the module use count */
1209         ip_vs_use_count_dec();
1210
1211         return ret;
1212 }
1213
1214
1215 /*
1216  *      Edit a service and bind it with a new scheduler
1217  */
1218 static int
1219 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1220 {
1221         struct ip_vs_scheduler *sched, *old_sched;
1222         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1223         int ret = 0;
1224
1225         /*
1226          * Lookup the scheduler, by 'u->sched_name'
1227          */
1228         sched = ip_vs_scheduler_get(u->sched_name);
1229         if (sched == NULL) {
1230                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1231                 return -ENOENT;
1232         }
1233         old_sched = sched;
1234
1235         if (u->pe_name && *u->pe_name) {
1236                 pe = ip_vs_pe_getbyname(u->pe_name);
1237                 if (pe == NULL) {
1238                         pr_info("persistence engine module ip_vs_pe_%s "
1239                                 "not found\n", u->pe_name);
1240                         ret = -ENOENT;
1241                         goto out;
1242                 }
1243                 old_pe = pe;
1244         }
1245
1246 #ifdef CONFIG_IP_VS_IPV6
1247         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1248                 ret = -EINVAL;
1249                 goto out;
1250         }
1251 #endif
1252
1253         write_lock_bh(&__ip_vs_svc_lock);
1254
1255         /*
1256          * Wait until all other svc users go away.
1257          */
1258         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1259
1260         /*
1261          * Set the flags and timeout value
1262          */
1263         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1264         svc->timeout = u->timeout * HZ;
1265         svc->netmask = u->netmask;
1266
1267         old_sched = svc->scheduler;
1268         if (sched != old_sched) {
1269                 /*
1270                  * Unbind the old scheduler
1271                  */
1272                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1273                         old_sched = sched;
1274                         goto out_unlock;
1275                 }
1276
1277                 /*
1278                  * Bind the new scheduler
1279                  */
1280                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1281                         /*
1282                          * If ip_vs_bind_scheduler fails, restore the old
1283                          * scheduler.
1284                          * The main reason of failure is out of memory.
1285                          *
1286                          * The question is if the old scheduler can be
1287                          * restored all the time. TODO: if it cannot be
1288                          * restored some time, we must delete the service,
1289                          * otherwise the system may crash.
1290                          */
1291                         ip_vs_bind_scheduler(svc, old_sched);
1292                         old_sched = sched;
1293                         goto out_unlock;
1294                 }
1295         }
1296
1297         old_pe = svc->pe;
1298         if (pe != old_pe) {
1299                 ip_vs_unbind_pe(svc);
1300                 ip_vs_bind_pe(svc, pe);
1301         }
1302
1303   out_unlock:
1304         write_unlock_bh(&__ip_vs_svc_lock);
1305   out:
1306         ip_vs_scheduler_put(old_sched);
1307         ip_vs_pe_put(old_pe);
1308         return ret;
1309 }
1310
1311
1312 /*
1313  *      Delete a service from the service list
1314  *      - The service must be unlinked, unlocked and not referenced!
1315  *      - We are called under _bh lock
1316  */
1317 static void __ip_vs_del_service(struct ip_vs_service *svc)
1318 {
1319         struct ip_vs_dest *dest, *nxt;
1320         struct ip_vs_scheduler *old_sched;
1321         struct ip_vs_pe *old_pe;
1322         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1323
1324         pr_info("%s: enter\n", __func__);
1325
1326         /* Count only IPv4 services for old get/setsockopt interface */
1327         if (svc->af == AF_INET)
1328                 ipvs->num_services--;
1329
1330         ip_vs_kill_estimator(svc->net, &svc->stats);
1331
1332         /* Unbind scheduler */
1333         old_sched = svc->scheduler;
1334         ip_vs_unbind_scheduler(svc);
1335         ip_vs_scheduler_put(old_sched);
1336
1337         /* Unbind persistence engine */
1338         old_pe = svc->pe;
1339         ip_vs_unbind_pe(svc);
1340         ip_vs_pe_put(old_pe);
1341
1342         /* Unbind app inc */
1343         if (svc->inc) {
1344                 ip_vs_app_inc_put(svc->inc);
1345                 svc->inc = NULL;
1346         }
1347
1348         /*
1349          *    Unlink the whole destination list
1350          */
1351         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1352                 __ip_vs_unlink_dest(svc, dest, 0);
1353                 __ip_vs_del_dest(svc->net, dest);
1354         }
1355
1356         /*
1357          *    Update the virtual service counters
1358          */
1359         if (svc->port == FTPPORT)
1360                 atomic_dec(&ipvs->ftpsvc_counter);
1361         else if (svc->port == 0)
1362                 atomic_dec(&ipvs->nullsvc_counter);
1363
1364         /*
1365          *    Free the service if nobody refers to it
1366          */
1367         if (atomic_read(&svc->refcnt) == 0) {
1368                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1369                               svc->fwmark,
1370                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1371                               ntohs(svc->port), atomic_read(&svc->usecnt));
1372                 free_percpu(svc->stats.cpustats);
1373                 kfree(svc);
1374         }
1375
1376         /* decrease the module use count */
1377         ip_vs_use_count_dec();
1378 }
1379
1380 /*
1381  * Unlink a service from list and try to delete it if its refcnt reached 0
1382  */
1383 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1384 {
1385         /*
1386          * Unhash it from the service table
1387          */
1388         write_lock_bh(&__ip_vs_svc_lock);
1389
1390         ip_vs_svc_unhash(svc);
1391
1392         /*
1393          * Wait until all the svc users go away.
1394          */
1395         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1396
1397         __ip_vs_del_service(svc);
1398
1399         write_unlock_bh(&__ip_vs_svc_lock);
1400 }
1401
1402 /*
1403  *      Delete a service from the service list
1404  */
1405 static int ip_vs_del_service(struct ip_vs_service *svc)
1406 {
1407         if (svc == NULL)
1408                 return -EEXIST;
1409         ip_vs_unlink_service(svc);
1410
1411         return 0;
1412 }
1413
1414
1415 /*
1416  *      Flush all the virtual services
1417  */
1418 static int ip_vs_flush(struct net *net)
1419 {
1420         int idx;
1421         struct ip_vs_service *svc, *nxt;
1422
1423         /*
1424          * Flush the service table hashed by <netns,protocol,addr,port>
1425          */
1426         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1427                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1428                                          s_list) {
1429                         if (net_eq(svc->net, net))
1430                                 ip_vs_unlink_service(svc);
1431                 }
1432         }
1433
1434         /*
1435          * Flush the service table hashed by fwmark
1436          */
1437         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1438                 list_for_each_entry_safe(svc, nxt,
1439                                          &ip_vs_svc_fwm_table[idx], f_list) {
1440                         if (net_eq(svc->net, net))
1441                                 ip_vs_unlink_service(svc);
1442                 }
1443         }
1444
1445         return 0;
1446 }
1447
1448
1449 /*
1450  *      Zero counters in a service or all services
1451  */
1452 static int ip_vs_zero_service(struct ip_vs_service *svc)
1453 {
1454         struct ip_vs_dest *dest;
1455
1456         write_lock_bh(&__ip_vs_svc_lock);
1457         list_for_each_entry(dest, &svc->destinations, n_list) {
1458                 ip_vs_zero_stats(&dest->stats);
1459         }
1460         ip_vs_zero_stats(&svc->stats);
1461         write_unlock_bh(&__ip_vs_svc_lock);
1462         return 0;
1463 }
1464
1465 static int ip_vs_zero_all(struct net *net)
1466 {
1467         int idx;
1468         struct ip_vs_service *svc;
1469
1470         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1471                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1472                         if (net_eq(svc->net, net))
1473                                 ip_vs_zero_service(svc);
1474                 }
1475         }
1476
1477         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1478                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1479                         if (net_eq(svc->net, net))
1480                                 ip_vs_zero_service(svc);
1481                 }
1482         }
1483
1484         ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1485         return 0;
1486 }
1487
1488
1489 static int
1490 proc_do_defense_mode(ctl_table *table, int write,
1491                      void __user *buffer, size_t *lenp, loff_t *ppos)
1492 {
1493         struct net *net = current->nsproxy->net_ns;
1494         int *valp = table->data;
1495         int val = *valp;
1496         int rc;
1497
1498         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1499         if (write && (*valp != val)) {
1500                 if ((*valp < 0) || (*valp > 3)) {
1501                         /* Restore the correct value */
1502                         *valp = val;
1503                 } else {
1504                         update_defense_level(net_ipvs(net));
1505                 }
1506         }
1507         return rc;
1508 }
1509
1510
1511 static int
1512 proc_do_sync_threshold(ctl_table *table, int write,
1513                        void __user *buffer, size_t *lenp, loff_t *ppos)
1514 {
1515         int *valp = table->data;
1516         int val[2];
1517         int rc;
1518
1519         /* backup the value first */
1520         memcpy(val, valp, sizeof(val));
1521
1522         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1523         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1524                 /* Restore the correct value */
1525                 memcpy(valp, val, sizeof(val));
1526         }
1527         return rc;
1528 }
1529
1530 static int
1531 proc_do_sync_mode(ctl_table *table, int write,
1532                      void __user *buffer, size_t *lenp, loff_t *ppos)
1533 {
1534         int *valp = table->data;
1535         int val = *valp;
1536         int rc;
1537
1538         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1539         if (write && (*valp != val)) {
1540                 if ((*valp < 0) || (*valp > 1)) {
1541                         /* Restore the correct value */
1542                         *valp = val;
1543                 } else {
1544                         struct net *net = current->nsproxy->net_ns;
1545                         ip_vs_sync_switch_mode(net, val);
1546                 }
1547         }
1548         return rc;
1549 }
1550
1551 /*
1552  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1553  *      Do not change order or insert new entries without
1554  *      align with netns init in __ip_vs_control_init()
1555  */
1556
1557 static struct ctl_table vs_vars[] = {
1558         {
1559                 .procname       = "amemthresh",
1560                 .maxlen         = sizeof(int),
1561                 .mode           = 0644,
1562                 .proc_handler   = proc_dointvec,
1563         },
1564         {
1565                 .procname       = "am_droprate",
1566                 .maxlen         = sizeof(int),
1567                 .mode           = 0644,
1568                 .proc_handler   = proc_dointvec,
1569         },
1570         {
1571                 .procname       = "drop_entry",
1572                 .maxlen         = sizeof(int),
1573                 .mode           = 0644,
1574                 .proc_handler   = proc_do_defense_mode,
1575         },
1576         {
1577                 .procname       = "drop_packet",
1578                 .maxlen         = sizeof(int),
1579                 .mode           = 0644,
1580                 .proc_handler   = proc_do_defense_mode,
1581         },
1582 #ifdef CONFIG_IP_VS_NFCT
1583         {
1584                 .procname       = "conntrack",
1585                 .maxlen         = sizeof(int),
1586                 .mode           = 0644,
1587                 .proc_handler   = &proc_dointvec,
1588         },
1589 #endif
1590         {
1591                 .procname       = "secure_tcp",
1592                 .maxlen         = sizeof(int),
1593                 .mode           = 0644,
1594                 .proc_handler   = proc_do_defense_mode,
1595         },
1596         {
1597                 .procname       = "snat_reroute",
1598                 .maxlen         = sizeof(int),
1599                 .mode           = 0644,
1600                 .proc_handler   = &proc_dointvec,
1601         },
1602         {
1603                 .procname       = "sync_version",
1604                 .maxlen         = sizeof(int),
1605                 .mode           = 0644,
1606                 .proc_handler   = &proc_do_sync_mode,
1607         },
1608         {
1609                 .procname       = "cache_bypass",
1610                 .maxlen         = sizeof(int),
1611                 .mode           = 0644,
1612                 .proc_handler   = proc_dointvec,
1613         },
1614         {
1615                 .procname       = "expire_nodest_conn",
1616                 .maxlen         = sizeof(int),
1617                 .mode           = 0644,
1618                 .proc_handler   = proc_dointvec,
1619         },
1620         {
1621                 .procname       = "expire_quiescent_template",
1622                 .maxlen         = sizeof(int),
1623                 .mode           = 0644,
1624                 .proc_handler   = proc_dointvec,
1625         },
1626         {
1627                 .procname       = "sync_threshold",
1628                 .maxlen         =
1629                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1630                 .mode           = 0644,
1631                 .proc_handler   = proc_do_sync_threshold,
1632         },
1633         {
1634                 .procname       = "nat_icmp_send",
1635                 .maxlen         = sizeof(int),
1636                 .mode           = 0644,
1637                 .proc_handler   = proc_dointvec,
1638         },
1639 #ifdef CONFIG_IP_VS_DEBUG
1640         {
1641                 .procname       = "debug_level",
1642                 .data           = &sysctl_ip_vs_debug_level,
1643                 .maxlen         = sizeof(int),
1644                 .mode           = 0644,
1645                 .proc_handler   = proc_dointvec,
1646         },
1647 #endif
1648 #if 0
1649         {
1650                 .procname       = "timeout_established",
1651                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1652                 .maxlen         = sizeof(int),
1653                 .mode           = 0644,
1654                 .proc_handler   = proc_dointvec_jiffies,
1655         },
1656         {
1657                 .procname       = "timeout_synsent",
1658                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1659                 .maxlen         = sizeof(int),
1660                 .mode           = 0644,
1661                 .proc_handler   = proc_dointvec_jiffies,
1662         },
1663         {
1664                 .procname       = "timeout_synrecv",
1665                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1666                 .maxlen         = sizeof(int),
1667                 .mode           = 0644,
1668                 .proc_handler   = proc_dointvec_jiffies,
1669         },
1670         {
1671                 .procname       = "timeout_finwait",
1672                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1673                 .maxlen         = sizeof(int),
1674                 .mode           = 0644,
1675                 .proc_handler   = proc_dointvec_jiffies,
1676         },
1677         {
1678                 .procname       = "timeout_timewait",
1679                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1680                 .maxlen         = sizeof(int),
1681                 .mode           = 0644,
1682                 .proc_handler   = proc_dointvec_jiffies,
1683         },
1684         {
1685                 .procname       = "timeout_close",
1686                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1687                 .maxlen         = sizeof(int),
1688                 .mode           = 0644,
1689                 .proc_handler   = proc_dointvec_jiffies,
1690         },
1691         {
1692                 .procname       = "timeout_closewait",
1693                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1694                 .maxlen         = sizeof(int),
1695                 .mode           = 0644,
1696                 .proc_handler   = proc_dointvec_jiffies,
1697         },
1698         {
1699                 .procname       = "timeout_lastack",
1700                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1701                 .maxlen         = sizeof(int),
1702                 .mode           = 0644,
1703                 .proc_handler   = proc_dointvec_jiffies,
1704         },
1705         {
1706                 .procname       = "timeout_listen",
1707                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1708                 .maxlen         = sizeof(int),
1709                 .mode           = 0644,
1710                 .proc_handler   = proc_dointvec_jiffies,
1711         },
1712         {
1713                 .procname       = "timeout_synack",
1714                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1715                 .maxlen         = sizeof(int),
1716                 .mode           = 0644,
1717                 .proc_handler   = proc_dointvec_jiffies,
1718         },
1719         {
1720                 .procname       = "timeout_udp",
1721                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1722                 .maxlen         = sizeof(int),
1723                 .mode           = 0644,
1724                 .proc_handler   = proc_dointvec_jiffies,
1725         },
1726         {
1727                 .procname       = "timeout_icmp",
1728                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1729                 .maxlen         = sizeof(int),
1730                 .mode           = 0644,
1731                 .proc_handler   = proc_dointvec_jiffies,
1732         },
1733 #endif
1734         { }
1735 };
1736
1737 const struct ctl_path net_vs_ctl_path[] = {
1738         { .procname = "net", },
1739         { .procname = "ipv4", },
1740         { .procname = "vs", },
1741         { }
1742 };
1743 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1744
1745 #ifdef CONFIG_PROC_FS
1746
1747 struct ip_vs_iter {
1748         struct seq_net_private p;  /* Do not move this, netns depends upon it*/
1749         struct list_head *table;
1750         int bucket;
1751 };
1752
1753 /*
1754  *      Write the contents of the VS rule table to a PROCfs file.
1755  *      (It is kept just for backward compatibility)
1756  */
1757 static inline const char *ip_vs_fwd_name(unsigned flags)
1758 {
1759         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1760         case IP_VS_CONN_F_LOCALNODE:
1761                 return "Local";
1762         case IP_VS_CONN_F_TUNNEL:
1763                 return "Tunnel";
1764         case IP_VS_CONN_F_DROUTE:
1765                 return "Route";
1766         default:
1767                 return "Masq";
1768         }
1769 }
1770
1771
1772 /* Get the Nth entry in the two lists */
1773 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1774 {
1775         struct net *net = seq_file_net(seq);
1776         struct ip_vs_iter *iter = seq->private;
1777         int idx;
1778         struct ip_vs_service *svc;
1779
1780         /* look in hash by protocol */
1781         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1782                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1783                         if (net_eq(svc->net, net) && pos-- == 0) {
1784                                 iter->table = ip_vs_svc_table;
1785                                 iter->bucket = idx;
1786                                 return svc;
1787                         }
1788                 }
1789         }
1790
1791         /* keep looking in fwmark */
1792         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1793                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1794                         if (net_eq(svc->net, net) && pos-- == 0) {
1795                                 iter->table = ip_vs_svc_fwm_table;
1796                                 iter->bucket = idx;
1797                                 return svc;
1798                         }
1799                 }
1800         }
1801
1802         return NULL;
1803 }
1804
1805 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1806 __acquires(__ip_vs_svc_lock)
1807 {
1808
1809         read_lock_bh(&__ip_vs_svc_lock);
1810         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1811 }
1812
1813
1814 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1815 {
1816         struct list_head *e;
1817         struct ip_vs_iter *iter;
1818         struct ip_vs_service *svc;
1819
1820         ++*pos;
1821         if (v == SEQ_START_TOKEN)
1822                 return ip_vs_info_array(seq,0);
1823
1824         svc = v;
1825         iter = seq->private;
1826
1827         if (iter->table == ip_vs_svc_table) {
1828                 /* next service in table hashed by protocol */
1829                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1830                         return list_entry(e, struct ip_vs_service, s_list);
1831
1832
1833                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1834                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1835                                             s_list) {
1836                                 return svc;
1837                         }
1838                 }
1839
1840                 iter->table = ip_vs_svc_fwm_table;
1841                 iter->bucket = -1;
1842                 goto scan_fwmark;
1843         }
1844
1845         /* next service in hashed by fwmark */
1846         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1847                 return list_entry(e, struct ip_vs_service, f_list);
1848
1849  scan_fwmark:
1850         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1851                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1852                                     f_list)
1853                         return svc;
1854         }
1855
1856         return NULL;
1857 }
1858
1859 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1860 __releases(__ip_vs_svc_lock)
1861 {
1862         read_unlock_bh(&__ip_vs_svc_lock);
1863 }
1864
1865
1866 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1867 {
1868         if (v == SEQ_START_TOKEN) {
1869                 seq_printf(seq,
1870                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1871                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
1872                 seq_puts(seq,
1873                          "Prot LocalAddress:Port Scheduler Flags\n");
1874                 seq_puts(seq,
1875                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1876         } else {
1877                 const struct ip_vs_service *svc = v;
1878                 const struct ip_vs_iter *iter = seq->private;
1879                 const struct ip_vs_dest *dest;
1880
1881                 if (iter->table == ip_vs_svc_table) {
1882 #ifdef CONFIG_IP_VS_IPV6
1883                         if (svc->af == AF_INET6)
1884                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
1885                                            ip_vs_proto_name(svc->protocol),
1886                                            &svc->addr.in6,
1887                                            ntohs(svc->port),
1888                                            svc->scheduler->name);
1889                         else
1890 #endif
1891                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
1892                                            ip_vs_proto_name(svc->protocol),
1893                                            ntohl(svc->addr.ip),
1894                                            ntohs(svc->port),
1895                                            svc->scheduler->name,
1896                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1897                 } else {
1898                         seq_printf(seq, "FWM  %08X %s %s",
1899                                    svc->fwmark, svc->scheduler->name,
1900                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1901                 }
1902
1903                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1904                         seq_printf(seq, "persistent %d %08X\n",
1905                                 svc->timeout,
1906                                 ntohl(svc->netmask));
1907                 else
1908                         seq_putc(seq, '\n');
1909
1910                 list_for_each_entry(dest, &svc->destinations, n_list) {
1911 #ifdef CONFIG_IP_VS_IPV6
1912                         if (dest->af == AF_INET6)
1913                                 seq_printf(seq,
1914                                            "  -> [%pI6]:%04X"
1915                                            "      %-7s %-6d %-10d %-10d\n",
1916                                            &dest->addr.in6,
1917                                            ntohs(dest->port),
1918                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1919                                            atomic_read(&dest->weight),
1920                                            atomic_read(&dest->activeconns),
1921                                            atomic_read(&dest->inactconns));
1922                         else
1923 #endif
1924                                 seq_printf(seq,
1925                                            "  -> %08X:%04X      "
1926                                            "%-7s %-6d %-10d %-10d\n",
1927                                            ntohl(dest->addr.ip),
1928                                            ntohs(dest->port),
1929                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1930                                            atomic_read(&dest->weight),
1931                                            atomic_read(&dest->activeconns),
1932                                            atomic_read(&dest->inactconns));
1933
1934                 }
1935         }
1936         return 0;
1937 }
1938
1939 static const struct seq_operations ip_vs_info_seq_ops = {
1940         .start = ip_vs_info_seq_start,
1941         .next  = ip_vs_info_seq_next,
1942         .stop  = ip_vs_info_seq_stop,
1943         .show  = ip_vs_info_seq_show,
1944 };
1945
1946 static int ip_vs_info_open(struct inode *inode, struct file *file)
1947 {
1948         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
1949                         sizeof(struct ip_vs_iter));
1950 }
1951
1952 static const struct file_operations ip_vs_info_fops = {
1953         .owner   = THIS_MODULE,
1954         .open    = ip_vs_info_open,
1955         .read    = seq_read,
1956         .llseek  = seq_lseek,
1957         .release = seq_release_private,
1958 };
1959
1960 #endif
1961
1962 #ifdef CONFIG_PROC_FS
1963 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1964 {
1965         struct net *net = seq_file_single_net(seq);
1966         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
1967
1968 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1969         seq_puts(seq,
1970                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1971         seq_printf(seq,
1972                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1973
1974         spin_lock_bh(&tot_stats->lock);
1975         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns,
1976                    tot_stats->ustats.inpkts, tot_stats->ustats.outpkts,
1977                    (unsigned long long) tot_stats->ustats.inbytes,
1978                    (unsigned long long) tot_stats->ustats.outbytes);
1979
1980 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1981         seq_puts(seq,
1982                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1983         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1984                         tot_stats->ustats.cps,
1985                         tot_stats->ustats.inpps,
1986                         tot_stats->ustats.outpps,
1987                         tot_stats->ustats.inbps,
1988                         tot_stats->ustats.outbps);
1989         spin_unlock_bh(&tot_stats->lock);
1990
1991         return 0;
1992 }
1993
1994 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1995 {
1996         return single_open_net(inode, file, ip_vs_stats_show);
1997 }
1998
1999 static const struct file_operations ip_vs_stats_fops = {
2000         .owner = THIS_MODULE,
2001         .open = ip_vs_stats_seq_open,
2002         .read = seq_read,
2003         .llseek = seq_lseek,
2004         .release = single_release,
2005 };
2006
2007 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2008 {
2009         struct net *net = seq_file_single_net(seq);
2010         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2011         struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2012         int i;
2013
2014 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2015         seq_puts(seq,
2016                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2017         seq_printf(seq,
2018                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2019
2020         for_each_possible_cpu(i) {
2021                 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2022                 unsigned int start;
2023                 __u64 inbytes, outbytes;
2024
2025                 do {
2026                         start = u64_stats_fetch_begin_bh(&u->syncp);
2027                         inbytes = u->ustats.inbytes;
2028                         outbytes = u->ustats.outbytes;
2029                 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2030
2031                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2032                            i, u->ustats.conns, u->ustats.inpkts,
2033                            u->ustats.outpkts, (__u64)inbytes,
2034                            (__u64)outbytes);
2035         }
2036
2037         spin_lock_bh(&tot_stats->lock);
2038         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2039                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2040                    tot_stats->ustats.outpkts,
2041                    (unsigned long long) tot_stats->ustats.inbytes,
2042                    (unsigned long long) tot_stats->ustats.outbytes);
2043
2044 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2045         seq_puts(seq,
2046                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2047         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2048                         tot_stats->ustats.cps,
2049                         tot_stats->ustats.inpps,
2050                         tot_stats->ustats.outpps,
2051                         tot_stats->ustats.inbps,
2052                         tot_stats->ustats.outbps);
2053         spin_unlock_bh(&tot_stats->lock);
2054
2055         return 0;
2056 }
2057
2058 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2059 {
2060         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2061 }
2062
2063 static const struct file_operations ip_vs_stats_percpu_fops = {
2064         .owner = THIS_MODULE,
2065         .open = ip_vs_stats_percpu_seq_open,
2066         .read = seq_read,
2067         .llseek = seq_lseek,
2068         .release = single_release,
2069 };
2070 #endif
2071
2072 /*
2073  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2074  */
2075 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2076 {
2077 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2078         struct ip_vs_proto_data *pd;
2079 #endif
2080
2081         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2082                   u->tcp_timeout,
2083                   u->tcp_fin_timeout,
2084                   u->udp_timeout);
2085
2086 #ifdef CONFIG_IP_VS_PROTO_TCP
2087         if (u->tcp_timeout) {
2088                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2089                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2090                         = u->tcp_timeout * HZ;
2091         }
2092
2093         if (u->tcp_fin_timeout) {
2094                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2095                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2096                         = u->tcp_fin_timeout * HZ;
2097         }
2098 #endif
2099
2100 #ifdef CONFIG_IP_VS_PROTO_UDP
2101         if (u->udp_timeout) {
2102                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2103                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2104                         = u->udp_timeout * HZ;
2105         }
2106 #endif
2107         return 0;
2108 }
2109
2110
2111 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2112 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2113 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2114                                  sizeof(struct ip_vs_dest_user))
2115 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2116 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2117 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2118
2119 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2120         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2121         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2122         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2123         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2124         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2125         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2126         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2127         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2128         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2129         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2130         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2131 };
2132
2133 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2134                                   struct ip_vs_service_user *usvc_compat)
2135 {
2136         memset(usvc, 0, sizeof(*usvc));
2137
2138         usvc->af                = AF_INET;
2139         usvc->protocol          = usvc_compat->protocol;
2140         usvc->addr.ip           = usvc_compat->addr;
2141         usvc->port              = usvc_compat->port;
2142         usvc->fwmark            = usvc_compat->fwmark;
2143
2144         /* Deep copy of sched_name is not needed here */
2145         usvc->sched_name        = usvc_compat->sched_name;
2146
2147         usvc->flags             = usvc_compat->flags;
2148         usvc->timeout           = usvc_compat->timeout;
2149         usvc->netmask           = usvc_compat->netmask;
2150 }
2151
2152 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2153                                    struct ip_vs_dest_user *udest_compat)
2154 {
2155         memset(udest, 0, sizeof(*udest));
2156
2157         udest->addr.ip          = udest_compat->addr;
2158         udest->port             = udest_compat->port;
2159         udest->conn_flags       = udest_compat->conn_flags;
2160         udest->weight           = udest_compat->weight;
2161         udest->u_threshold      = udest_compat->u_threshold;
2162         udest->l_threshold      = udest_compat->l_threshold;
2163 }
2164
2165 static int
2166 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2167 {
2168         struct net *net = sock_net(sk);
2169         int ret;
2170         unsigned char arg[MAX_ARG_LEN];
2171         struct ip_vs_service_user *usvc_compat;
2172         struct ip_vs_service_user_kern usvc;
2173         struct ip_vs_service *svc;
2174         struct ip_vs_dest_user *udest_compat;
2175         struct ip_vs_dest_user_kern udest;
2176
2177         if (!capable(CAP_NET_ADMIN))
2178                 return -EPERM;
2179
2180         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2181                 return -EINVAL;
2182         if (len < 0 || len >  MAX_ARG_LEN)
2183                 return -EINVAL;
2184         if (len != set_arglen[SET_CMDID(cmd)]) {
2185                 pr_err("set_ctl: len %u != %u\n",
2186                        len, set_arglen[SET_CMDID(cmd)]);
2187                 return -EINVAL;
2188         }
2189
2190         if (copy_from_user(arg, user, len) != 0)
2191                 return -EFAULT;
2192
2193         /* increase the module use count */
2194         ip_vs_use_count_inc();
2195
2196         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2197                 ret = -ERESTARTSYS;
2198                 goto out_dec;
2199         }
2200
2201         if (cmd == IP_VS_SO_SET_FLUSH) {
2202                 /* Flush the virtual service */
2203                 ret = ip_vs_flush(net);
2204                 goto out_unlock;
2205         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2206                 /* Set timeout values for (tcp tcpfin udp) */
2207                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2208                 goto out_unlock;
2209         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2210                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2211                 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2212                                         dm->syncid);
2213                 goto out_unlock;
2214         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2215                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2216                 ret = stop_sync_thread(net, dm->state);
2217                 goto out_unlock;
2218         }
2219
2220         usvc_compat = (struct ip_vs_service_user *)arg;
2221         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2222
2223         /* We only use the new structs internally, so copy userspace compat
2224          * structs to extended internal versions */
2225         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2226         ip_vs_copy_udest_compat(&udest, udest_compat);
2227
2228         if (cmd == IP_VS_SO_SET_ZERO) {
2229                 /* if no service address is set, zero counters in all */
2230                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2231                         ret = ip_vs_zero_all(net);
2232                         goto out_unlock;
2233                 }
2234         }
2235
2236         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2237         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2238             usvc.protocol != IPPROTO_SCTP) {
2239                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2240                        usvc.protocol, &usvc.addr.ip,
2241                        ntohs(usvc.port), usvc.sched_name);
2242                 ret = -EFAULT;
2243                 goto out_unlock;
2244         }
2245
2246         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2247         if (usvc.fwmark == 0)
2248                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2249                                            &usvc.addr, usvc.port);
2250         else
2251                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2252
2253         if (cmd != IP_VS_SO_SET_ADD
2254             && (svc == NULL || svc->protocol != usvc.protocol)) {
2255                 ret = -ESRCH;
2256                 goto out_unlock;
2257         }
2258
2259         switch (cmd) {
2260         case IP_VS_SO_SET_ADD:
2261                 if (svc != NULL)
2262                         ret = -EEXIST;
2263                 else
2264                         ret = ip_vs_add_service(net, &usvc, &svc);
2265                 break;
2266         case IP_VS_SO_SET_EDIT:
2267                 ret = ip_vs_edit_service(svc, &usvc);
2268                 break;
2269         case IP_VS_SO_SET_DEL:
2270                 ret = ip_vs_del_service(svc);
2271                 if (!ret)
2272                         goto out_unlock;
2273                 break;
2274         case IP_VS_SO_SET_ZERO:
2275                 ret = ip_vs_zero_service(svc);
2276                 break;
2277         case IP_VS_SO_SET_ADDDEST:
2278                 ret = ip_vs_add_dest(svc, &udest);
2279                 break;
2280         case IP_VS_SO_SET_EDITDEST:
2281                 ret = ip_vs_edit_dest(svc, &udest);
2282                 break;
2283         case IP_VS_SO_SET_DELDEST:
2284                 ret = ip_vs_del_dest(svc, &udest);
2285                 break;
2286         default:
2287                 ret = -EINVAL;
2288         }
2289
2290   out_unlock:
2291         mutex_unlock(&__ip_vs_mutex);
2292   out_dec:
2293         /* decrease the module use count */
2294         ip_vs_use_count_dec();
2295
2296         return ret;
2297 }
2298
2299
2300 static void
2301 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2302 {
2303         spin_lock_bh(&src->lock);
2304         memcpy(dst, &src->ustats, sizeof(*dst));
2305         spin_unlock_bh(&src->lock);
2306 }
2307
2308 static void
2309 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2310 {
2311         dst->protocol = src->protocol;
2312         dst->addr = src->addr.ip;
2313         dst->port = src->port;
2314         dst->fwmark = src->fwmark;
2315         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2316         dst->flags = src->flags;
2317         dst->timeout = src->timeout / HZ;
2318         dst->netmask = src->netmask;
2319         dst->num_dests = src->num_dests;
2320         ip_vs_copy_stats(&dst->stats, &src->stats);
2321 }
2322
2323 static inline int
2324 __ip_vs_get_service_entries(struct net *net,
2325                             const struct ip_vs_get_services *get,
2326                             struct ip_vs_get_services __user *uptr)
2327 {
2328         int idx, count=0;
2329         struct ip_vs_service *svc;
2330         struct ip_vs_service_entry entry;
2331         int ret = 0;
2332
2333         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2334                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2335                         /* Only expose IPv4 entries to old interface */
2336                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2337                                 continue;
2338
2339                         if (count >= get->num_services)
2340                                 goto out;
2341                         memset(&entry, 0, sizeof(entry));
2342                         ip_vs_copy_service(&entry, svc);
2343                         if (copy_to_user(&uptr->entrytable[count],
2344                                          &entry, sizeof(entry))) {
2345                                 ret = -EFAULT;
2346                                 goto out;
2347                         }
2348                         count++;
2349                 }
2350         }
2351
2352         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2353                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2354                         /* Only expose IPv4 entries to old interface */
2355                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2356                                 continue;
2357
2358                         if (count >= get->num_services)
2359                                 goto out;
2360                         memset(&entry, 0, sizeof(entry));
2361                         ip_vs_copy_service(&entry, svc);
2362                         if (copy_to_user(&uptr->entrytable[count],
2363                                          &entry, sizeof(entry))) {
2364                                 ret = -EFAULT;
2365                                 goto out;
2366                         }
2367                         count++;
2368                 }
2369         }
2370   out:
2371         return ret;
2372 }
2373
2374 static inline int
2375 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2376                          struct ip_vs_get_dests __user *uptr)
2377 {
2378         struct ip_vs_service *svc;
2379         union nf_inet_addr addr = { .ip = get->addr };
2380         int ret = 0;
2381
2382         if (get->fwmark)
2383                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2384         else
2385                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2386                                            get->port);
2387
2388         if (svc) {
2389                 int count = 0;
2390                 struct ip_vs_dest *dest;
2391                 struct ip_vs_dest_entry entry;
2392
2393                 list_for_each_entry(dest, &svc->destinations, n_list) {
2394                         if (count >= get->num_dests)
2395                                 break;
2396
2397                         entry.addr = dest->addr.ip;
2398                         entry.port = dest->port;
2399                         entry.conn_flags = atomic_read(&dest->conn_flags);
2400                         entry.weight = atomic_read(&dest->weight);
2401                         entry.u_threshold = dest->u_threshold;
2402                         entry.l_threshold = dest->l_threshold;
2403                         entry.activeconns = atomic_read(&dest->activeconns);
2404                         entry.inactconns = atomic_read(&dest->inactconns);
2405                         entry.persistconns = atomic_read(&dest->persistconns);
2406                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2407                         if (copy_to_user(&uptr->entrytable[count],
2408                                          &entry, sizeof(entry))) {
2409                                 ret = -EFAULT;
2410                                 break;
2411                         }
2412                         count++;
2413                 }
2414         } else
2415                 ret = -ESRCH;
2416         return ret;
2417 }
2418
2419 static inline void
2420 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2421 {
2422 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2423         struct ip_vs_proto_data *pd;
2424 #endif
2425
2426 #ifdef CONFIG_IP_VS_PROTO_TCP
2427         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2428         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2429         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2430 #endif
2431 #ifdef CONFIG_IP_VS_PROTO_UDP
2432         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2433         u->udp_timeout =
2434                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2435 #endif
2436 }
2437
2438
2439 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2440 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2441 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2442 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2443 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2444 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2445 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2446
2447 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2448         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2449         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2450         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2451         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2452         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2453         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2454         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2455 };
2456
2457 static int
2458 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2459 {
2460         unsigned char arg[128];
2461         int ret = 0;
2462         unsigned int copylen;
2463         struct net *net = sock_net(sk);
2464         struct netns_ipvs *ipvs = net_ipvs(net);
2465
2466         BUG_ON(!net);
2467         if (!capable(CAP_NET_ADMIN))
2468                 return -EPERM;
2469
2470         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2471                 return -EINVAL;
2472
2473         if (*len < get_arglen[GET_CMDID(cmd)]) {
2474                 pr_err("get_ctl: len %u < %u\n",
2475                        *len, get_arglen[GET_CMDID(cmd)]);
2476                 return -EINVAL;
2477         }
2478
2479         copylen = get_arglen[GET_CMDID(cmd)];
2480         if (copylen > 128)
2481                 return -EINVAL;
2482
2483         if (copy_from_user(arg, user, copylen) != 0)
2484                 return -EFAULT;
2485
2486         if (mutex_lock_interruptible(&__ip_vs_mutex))
2487                 return -ERESTARTSYS;
2488
2489         switch (cmd) {
2490         case IP_VS_SO_GET_VERSION:
2491         {
2492                 char buf[64];
2493
2494                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2495                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2496                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2497                         ret = -EFAULT;
2498                         goto out;
2499                 }
2500                 *len = strlen(buf)+1;
2501         }
2502         break;
2503
2504         case IP_VS_SO_GET_INFO:
2505         {
2506                 struct ip_vs_getinfo info;
2507                 info.version = IP_VS_VERSION_CODE;
2508                 info.size = ip_vs_conn_tab_size;
2509                 info.num_services = ipvs->num_services;
2510                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2511                         ret = -EFAULT;
2512         }
2513         break;
2514
2515         case IP_VS_SO_GET_SERVICES:
2516         {
2517                 struct ip_vs_get_services *get;
2518                 int size;
2519
2520                 get = (struct ip_vs_get_services *)arg;
2521                 size = sizeof(*get) +
2522                         sizeof(struct ip_vs_service_entry) * get->num_services;
2523                 if (*len != size) {
2524                         pr_err("length: %u != %u\n", *len, size);
2525                         ret = -EINVAL;
2526                         goto out;
2527                 }
2528                 ret = __ip_vs_get_service_entries(net, get, user);
2529         }
2530         break;
2531
2532         case IP_VS_SO_GET_SERVICE:
2533         {
2534                 struct ip_vs_service_entry *entry;
2535                 struct ip_vs_service *svc;
2536                 union nf_inet_addr addr;
2537
2538                 entry = (struct ip_vs_service_entry *)arg;
2539                 addr.ip = entry->addr;
2540                 if (entry->fwmark)
2541                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2542                 else
2543                         svc = __ip_vs_service_find(net, AF_INET,
2544                                                    entry->protocol, &addr,
2545                                                    entry->port);
2546                 if (svc) {
2547                         ip_vs_copy_service(entry, svc);
2548                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2549                                 ret = -EFAULT;
2550                 } else
2551                         ret = -ESRCH;
2552         }
2553         break;
2554
2555         case IP_VS_SO_GET_DESTS:
2556         {
2557                 struct ip_vs_get_dests *get;
2558                 int size;
2559
2560                 get = (struct ip_vs_get_dests *)arg;
2561                 size = sizeof(*get) +
2562                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2563                 if (*len != size) {
2564                         pr_err("length: %u != %u\n", *len, size);
2565                         ret = -EINVAL;
2566                         goto out;
2567                 }
2568                 ret = __ip_vs_get_dest_entries(net, get, user);
2569         }
2570         break;
2571
2572         case IP_VS_SO_GET_TIMEOUT:
2573         {
2574                 struct ip_vs_timeout_user t;
2575
2576                 __ip_vs_get_timeouts(net, &t);
2577                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2578                         ret = -EFAULT;
2579         }
2580         break;
2581
2582         case IP_VS_SO_GET_DAEMON:
2583         {
2584                 struct ip_vs_daemon_user d[2];
2585
2586                 memset(&d, 0, sizeof(d));
2587                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2588                         d[0].state = IP_VS_STATE_MASTER;
2589                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2590                                 sizeof(d[0].mcast_ifn));
2591                         d[0].syncid = ipvs->master_syncid;
2592                 }
2593                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2594                         d[1].state = IP_VS_STATE_BACKUP;
2595                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2596                                 sizeof(d[1].mcast_ifn));
2597                         d[1].syncid = ipvs->backup_syncid;
2598                 }
2599                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2600                         ret = -EFAULT;
2601         }
2602         break;
2603
2604         default:
2605                 ret = -EINVAL;
2606         }
2607
2608   out:
2609         mutex_unlock(&__ip_vs_mutex);
2610         return ret;
2611 }
2612
2613
2614 static struct nf_sockopt_ops ip_vs_sockopts = {
2615         .pf             = PF_INET,
2616         .set_optmin     = IP_VS_BASE_CTL,
2617         .set_optmax     = IP_VS_SO_SET_MAX+1,
2618         .set            = do_ip_vs_set_ctl,
2619         .get_optmin     = IP_VS_BASE_CTL,
2620         .get_optmax     = IP_VS_SO_GET_MAX+1,
2621         .get            = do_ip_vs_get_ctl,
2622         .owner          = THIS_MODULE,
2623 };
2624
2625 /*
2626  * Generic Netlink interface
2627  */
2628
2629 /* IPVS genetlink family */
2630 static struct genl_family ip_vs_genl_family = {
2631         .id             = GENL_ID_GENERATE,
2632         .hdrsize        = 0,
2633         .name           = IPVS_GENL_NAME,
2634         .version        = IPVS_GENL_VERSION,
2635         .maxattr        = IPVS_CMD_MAX,
2636         .netnsok        = true,         /* Make ipvsadm to work on netns */
2637 };
2638
2639 /* Policy used for first-level command attributes */
2640 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2641         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2642         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2643         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2644         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2645         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2646         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2647 };
2648
2649 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2650 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2651         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2652         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2653                                             .len = IP_VS_IFNAME_MAXLEN },
2654         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2655 };
2656
2657 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2658 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2659         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2660         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2661         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2662                                             .len = sizeof(union nf_inet_addr) },
2663         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2664         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2665         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2666                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2667         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2668                                             .len = IP_VS_PENAME_MAXLEN },
2669         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2670                                             .len = sizeof(struct ip_vs_flags) },
2671         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2672         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2673         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2674 };
2675
2676 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2677 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2678         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2679                                             .len = sizeof(union nf_inet_addr) },
2680         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2681         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2682         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2683         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2684         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2685         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2686         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2687         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2688         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2689 };
2690
2691 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2692                                  struct ip_vs_stats *stats)
2693 {
2694         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2695         if (!nl_stats)
2696                 return -EMSGSIZE;
2697
2698         spin_lock_bh(&stats->lock);
2699
2700         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
2701         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
2702         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
2703         NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
2704         NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
2705         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
2706         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
2707         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
2708         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
2709         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
2710
2711         spin_unlock_bh(&stats->lock);
2712
2713         nla_nest_end(skb, nl_stats);
2714
2715         return 0;
2716
2717 nla_put_failure:
2718         spin_unlock_bh(&stats->lock);
2719         nla_nest_cancel(skb, nl_stats);
2720         return -EMSGSIZE;
2721 }
2722
2723 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2724                                    struct ip_vs_service *svc)
2725 {
2726         struct nlattr *nl_service;
2727         struct ip_vs_flags flags = { .flags = svc->flags,
2728                                      .mask = ~0 };
2729
2730         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2731         if (!nl_service)
2732                 return -EMSGSIZE;
2733
2734         NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2735
2736         if (svc->fwmark) {
2737                 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2738         } else {
2739                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2740                 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2741                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2742         }
2743
2744         NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2745         if (svc->pe)
2746                 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
2747         NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2748         NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2749         NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2750
2751         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2752                 goto nla_put_failure;
2753
2754         nla_nest_end(skb, nl_service);
2755
2756         return 0;
2757
2758 nla_put_failure:
2759         nla_nest_cancel(skb, nl_service);
2760         return -EMSGSIZE;
2761 }
2762
2763 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2764                                    struct ip_vs_service *svc,
2765                                    struct netlink_callback *cb)
2766 {
2767         void *hdr;
2768
2769         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2770                           &ip_vs_genl_family, NLM_F_MULTI,
2771                           IPVS_CMD_NEW_SERVICE);
2772         if (!hdr)
2773                 return -EMSGSIZE;
2774
2775         if (ip_vs_genl_fill_service(skb, svc) < 0)
2776                 goto nla_put_failure;
2777
2778         return genlmsg_end(skb, hdr);
2779
2780 nla_put_failure:
2781         genlmsg_cancel(skb, hdr);
2782         return -EMSGSIZE;
2783 }
2784
2785 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2786                                     struct netlink_callback *cb)
2787 {
2788         int idx = 0, i;
2789         int start = cb->args[0];
2790         struct ip_vs_service *svc;
2791         struct net *net = skb_sknet(skb);
2792
2793         mutex_lock(&__ip_vs_mutex);
2794         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2795                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2796                         if (++idx <= start || !net_eq(svc->net, net))
2797                                 continue;
2798                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2799                                 idx--;
2800                                 goto nla_put_failure;
2801                         }
2802                 }
2803         }
2804
2805         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2806                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2807                         if (++idx <= start || !net_eq(svc->net, net))
2808                                 continue;
2809                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2810                                 idx--;
2811                                 goto nla_put_failure;
2812                         }
2813                 }
2814         }
2815
2816 nla_put_failure:
2817         mutex_unlock(&__ip_vs_mutex);
2818         cb->args[0] = idx;
2819
2820         return skb->len;
2821 }
2822
2823 static int ip_vs_genl_parse_service(struct net *net,
2824                                     struct ip_vs_service_user_kern *usvc,
2825                                     struct nlattr *nla, int full_entry,
2826                                     struct ip_vs_service **ret_svc)
2827 {
2828         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2829         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2830         struct ip_vs_service *svc;
2831
2832         /* Parse mandatory identifying service fields first */
2833         if (nla == NULL ||
2834             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2835                 return -EINVAL;
2836
2837         nla_af          = attrs[IPVS_SVC_ATTR_AF];
2838         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
2839         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
2840         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
2841         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
2842
2843         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2844                 return -EINVAL;
2845
2846         memset(usvc, 0, sizeof(*usvc));
2847
2848         usvc->af = nla_get_u16(nla_af);
2849 #ifdef CONFIG_IP_VS_IPV6
2850         if (usvc->af != AF_INET && usvc->af != AF_INET6)
2851 #else
2852         if (usvc->af != AF_INET)
2853 #endif
2854                 return -EAFNOSUPPORT;
2855
2856         if (nla_fwmark) {
2857                 usvc->protocol = IPPROTO_TCP;
2858                 usvc->fwmark = nla_get_u32(nla_fwmark);
2859         } else {
2860                 usvc->protocol = nla_get_u16(nla_protocol);
2861                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2862                 usvc->port = nla_get_u16(nla_port);
2863                 usvc->fwmark = 0;
2864         }
2865
2866         if (usvc->fwmark)
2867                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
2868         else
2869                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
2870                                            &usvc->addr, usvc->port);
2871         *ret_svc = svc;
2872
2873         /* If a full entry was requested, check for the additional fields */
2874         if (full_entry) {
2875                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
2876                               *nla_netmask;
2877                 struct ip_vs_flags flags;
2878
2879                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2880                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
2881                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2882                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2883                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2884
2885                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2886                         return -EINVAL;
2887
2888                 nla_memcpy(&flags, nla_flags, sizeof(flags));
2889
2890                 /* prefill flags from service if it already exists */
2891                 if (svc)
2892                         usvc->flags = svc->flags;
2893
2894                 /* set new flags from userland */
2895                 usvc->flags = (usvc->flags & ~flags.mask) |
2896                               (flags.flags & flags.mask);
2897                 usvc->sched_name = nla_data(nla_sched);
2898                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
2899                 usvc->timeout = nla_get_u32(nla_timeout);
2900                 usvc->netmask = nla_get_u32(nla_netmask);
2901         }
2902
2903         return 0;
2904 }
2905
2906 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
2907                                                      struct nlattr *nla)
2908 {
2909         struct ip_vs_service_user_kern usvc;
2910         struct ip_vs_service *svc;
2911         int ret;
2912
2913         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
2914         return ret ? ERR_PTR(ret) : svc;
2915 }
2916
2917 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
2918 {
2919         struct nlattr *nl_dest;
2920
2921         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
2922         if (!nl_dest)
2923                 return -EMSGSIZE;
2924
2925         NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
2926         NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
2927
2928         NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
2929                     atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
2930         NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
2931         NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
2932         NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
2933         NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
2934                     atomic_read(&dest->activeconns));
2935         NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
2936                     atomic_read(&dest->inactconns));
2937         NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
2938                     atomic_read(&dest->persistconns));
2939
2940         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
2941                 goto nla_put_failure;
2942
2943         nla_nest_end(skb, nl_dest);
2944
2945         return 0;
2946
2947 nla_put_failure:
2948         nla_nest_cancel(skb, nl_dest);
2949         return -EMSGSIZE;
2950 }
2951
2952 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2953                                 struct netlink_callback *cb)
2954 {
2955         void *hdr;
2956
2957         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2958                           &ip_vs_genl_family, NLM_F_MULTI,
2959                           IPVS_CMD_NEW_DEST);
2960         if (!hdr)
2961                 return -EMSGSIZE;
2962
2963         if (ip_vs_genl_fill_dest(skb, dest) < 0)
2964                 goto nla_put_failure;
2965
2966         return genlmsg_end(skb, hdr);
2967
2968 nla_put_failure:
2969         genlmsg_cancel(skb, hdr);
2970         return -EMSGSIZE;
2971 }
2972
2973 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2974                                  struct netlink_callback *cb)
2975 {
2976         int idx = 0;
2977         int start = cb->args[0];
2978         struct ip_vs_service *svc;
2979         struct ip_vs_dest *dest;
2980         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2981         struct net *net = skb_sknet(skb);
2982
2983         mutex_lock(&__ip_vs_mutex);
2984
2985         /* Try to find the service for which to dump destinations */
2986         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
2987                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
2988                 goto out_err;
2989
2990
2991         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
2992         if (IS_ERR(svc) || svc == NULL)
2993                 goto out_err;
2994
2995         /* Dump the destinations */
2996         list_for_each_entry(dest, &svc->destinations, n_list) {
2997                 if (++idx <= start)
2998                         continue;
2999                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3000                         idx--;
3001                         goto nla_put_failure;
3002                 }
3003         }
3004
3005 nla_put_failure:
3006         cb->args[0] = idx;
3007
3008 out_err:
3009         mutex_unlock(&__ip_vs_mutex);
3010
3011         return skb->len;
3012 }
3013
3014 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3015                                  struct nlattr *nla, int full_entry)
3016 {
3017         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3018         struct nlattr *nla_addr, *nla_port;
3019
3020         /* Parse mandatory identifying destination fields first */
3021         if (nla == NULL ||
3022             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3023                 return -EINVAL;
3024
3025         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3026         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3027
3028         if (!(nla_addr && nla_port))
3029                 return -EINVAL;
3030
3031         memset(udest, 0, sizeof(*udest));
3032
3033         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3034         udest->port = nla_get_u16(nla_port);
3035
3036         /* If a full entry was requested, check for the additional fields */
3037         if (full_entry) {
3038                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3039                               *nla_l_thresh;
3040
3041                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3042                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3043                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3044                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3045
3046                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3047                         return -EINVAL;
3048
3049                 udest->conn_flags = nla_get_u32(nla_fwd)
3050                                     & IP_VS_CONN_F_FWD_MASK;
3051                 udest->weight = nla_get_u32(nla_weight);
3052                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3053                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3054         }
3055
3056         return 0;
3057 }
3058
3059 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3060                                   const char *mcast_ifn, __be32 syncid)
3061 {
3062         struct nlattr *nl_daemon;
3063
3064         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3065         if (!nl_daemon)
3066                 return -EMSGSIZE;
3067
3068         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
3069         NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
3070         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
3071
3072         nla_nest_end(skb, nl_daemon);
3073
3074         return 0;
3075
3076 nla_put_failure:
3077         nla_nest_cancel(skb, nl_daemon);
3078         return -EMSGSIZE;
3079 }
3080
3081 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3082                                   const char *mcast_ifn, __be32 syncid,
3083                                   struct netlink_callback *cb)
3084 {
3085         void *hdr;
3086         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
3087                           &ip_vs_genl_family, NLM_F_MULTI,
3088                           IPVS_CMD_NEW_DAEMON);
3089         if (!hdr)
3090                 return -EMSGSIZE;
3091
3092         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3093                 goto nla_put_failure;
3094
3095         return genlmsg_end(skb, hdr);
3096
3097 nla_put_failure:
3098         genlmsg_cancel(skb, hdr);
3099         return -EMSGSIZE;
3100 }
3101
3102 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3103                                    struct netlink_callback *cb)
3104 {
3105         struct net *net = skb_net(skb);
3106         struct netns_ipvs *ipvs = net_ipvs(net);
3107
3108         mutex_lock(&__ip_vs_mutex);
3109         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3110                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3111                                            ipvs->master_mcast_ifn,
3112                                            ipvs->master_syncid, cb) < 0)
3113                         goto nla_put_failure;
3114
3115                 cb->args[0] = 1;
3116         }
3117
3118         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3119                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3120                                            ipvs->backup_mcast_ifn,
3121                                            ipvs->backup_syncid, cb) < 0)
3122                         goto nla_put_failure;
3123
3124                 cb->args[1] = 1;
3125         }
3126
3127 nla_put_failure:
3128         mutex_unlock(&__ip_vs_mutex);
3129
3130         return skb->len;
3131 }
3132
3133 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3134 {
3135         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3136               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3137               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3138                 return -EINVAL;
3139
3140         return start_sync_thread(net,
3141                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3142                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3143                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3144 }
3145
3146 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3147 {
3148         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3149                 return -EINVAL;
3150
3151         return stop_sync_thread(net,
3152                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3153 }
3154
3155 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3156 {
3157         struct ip_vs_timeout_user t;
3158
3159         __ip_vs_get_timeouts(net, &t);
3160
3161         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3162                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3163
3164         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3165                 t.tcp_fin_timeout =
3166                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3167
3168         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3169                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3170
3171         return ip_vs_set_timeout(net, &t);
3172 }
3173
3174 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3175 {
3176         struct ip_vs_service *svc = NULL;
3177         struct ip_vs_service_user_kern usvc;
3178         struct ip_vs_dest_user_kern udest;
3179         int ret = 0, cmd;
3180         int need_full_svc = 0, need_full_dest = 0;
3181         struct net *net;
3182         struct netns_ipvs *ipvs;
3183
3184         net = skb_sknet(skb);
3185         ipvs = net_ipvs(net);
3186         cmd = info->genlhdr->cmd;
3187
3188         mutex_lock(&__ip_vs_mutex);
3189
3190         if (cmd == IPVS_CMD_FLUSH) {
3191                 ret = ip_vs_flush(net);
3192                 goto out;
3193         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3194                 ret = ip_vs_genl_set_config(net, info->attrs);
3195                 goto out;
3196         } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3197                    cmd == IPVS_CMD_DEL_DAEMON) {
3198
3199                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3200
3201                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3202                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3203                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3204                                      ip_vs_daemon_policy)) {
3205                         ret = -EINVAL;
3206                         goto out;
3207                 }
3208
3209                 if (cmd == IPVS_CMD_NEW_DAEMON)
3210                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3211                 else
3212                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3213                 goto out;
3214         } else if (cmd == IPVS_CMD_ZERO &&
3215                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3216                 ret = ip_vs_zero_all(net);
3217                 goto out;
3218         }
3219
3220         /* All following commands require a service argument, so check if we
3221          * received a valid one. We need a full service specification when
3222          * adding / editing a service. Only identifying members otherwise. */
3223         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3224                 need_full_svc = 1;
3225
3226         ret = ip_vs_genl_parse_service(net, &usvc,
3227                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3228                                        need_full_svc, &svc);
3229         if (ret)
3230                 goto out;
3231
3232         /* Unless we're adding a new service, the service must already exist */
3233         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3234                 ret = -ESRCH;
3235                 goto out;
3236         }
3237
3238         /* Destination commands require a valid destination argument. For
3239          * adding / editing a destination, we need a full destination
3240          * specification. */
3241         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3242             cmd == IPVS_CMD_DEL_DEST) {
3243                 if (cmd != IPVS_CMD_DEL_DEST)
3244                         need_full_dest = 1;
3245
3246                 ret = ip_vs_genl_parse_dest(&udest,
3247                                             info->attrs[IPVS_CMD_ATTR_DEST],
3248                                             need_full_dest);
3249                 if (ret)
3250                         goto out;
3251         }
3252
3253         switch (cmd) {
3254         case IPVS_CMD_NEW_SERVICE:
3255                 if (svc == NULL)
3256                         ret = ip_vs_add_service(net, &usvc, &svc);
3257                 else
3258                         ret = -EEXIST;
3259                 break;
3260         case IPVS_CMD_SET_SERVICE:
3261                 ret = ip_vs_edit_service(svc, &usvc);
3262                 break;
3263         case IPVS_CMD_DEL_SERVICE:
3264                 ret = ip_vs_del_service(svc);
3265                 /* do not use svc, it can be freed */
3266                 break;
3267         case IPVS_CMD_NEW_DEST:
3268                 ret = ip_vs_add_dest(svc, &udest);
3269                 break;
3270         case IPVS_CMD_SET_DEST:
3271                 ret = ip_vs_edit_dest(svc, &udest);
3272                 break;
3273         case IPVS_CMD_DEL_DEST:
3274                 ret = ip_vs_del_dest(svc, &udest);
3275                 break;
3276         case IPVS_CMD_ZERO:
3277                 ret = ip_vs_zero_service(svc);
3278                 break;
3279         default:
3280                 ret = -EINVAL;
3281         }
3282
3283 out:
3284         mutex_unlock(&__ip_vs_mutex);
3285
3286         return ret;
3287 }
3288
3289 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3290 {
3291         struct sk_buff *msg;
3292         void *reply;
3293         int ret, cmd, reply_cmd;
3294         struct net *net;
3295         struct netns_ipvs *ipvs;
3296
3297         net = skb_sknet(skb);
3298         ipvs = net_ipvs(net);
3299         cmd = info->genlhdr->cmd;
3300
3301         if (cmd == IPVS_CMD_GET_SERVICE)
3302                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3303         else if (cmd == IPVS_CMD_GET_INFO)
3304                 reply_cmd = IPVS_CMD_SET_INFO;
3305         else if (cmd == IPVS_CMD_GET_CONFIG)
3306                 reply_cmd = IPVS_CMD_SET_CONFIG;
3307         else {
3308                 pr_err("unknown Generic Netlink command\n");
3309                 return -EINVAL;
3310         }
3311
3312         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3313         if (!msg)
3314                 return -ENOMEM;
3315
3316         mutex_lock(&__ip_vs_mutex);
3317
3318         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3319         if (reply == NULL)
3320                 goto nla_put_failure;
3321
3322         switch (cmd) {
3323         case IPVS_CMD_GET_SERVICE:
3324         {
3325                 struct ip_vs_service *svc;
3326
3327                 svc = ip_vs_genl_find_service(net,
3328                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3329                 if (IS_ERR(svc)) {
3330                         ret = PTR_ERR(svc);
3331                         goto out_err;
3332                 } else if (svc) {
3333                         ret = ip_vs_genl_fill_service(msg, svc);
3334                         if (ret)
3335                                 goto nla_put_failure;
3336                 } else {
3337                         ret = -ESRCH;
3338                         goto out_err;
3339                 }
3340
3341                 break;
3342         }
3343
3344         case IPVS_CMD_GET_CONFIG:
3345         {
3346                 struct ip_vs_timeout_user t;
3347
3348                 __ip_vs_get_timeouts(net, &t);
3349 #ifdef CONFIG_IP_VS_PROTO_TCP
3350                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3351                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3352                             t.tcp_fin_timeout);
3353 #endif
3354 #ifdef CONFIG_IP_VS_PROTO_UDP
3355                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
3356 #endif
3357
3358                 break;
3359         }
3360
3361         case IPVS_CMD_GET_INFO:
3362                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
3363                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3364                             ip_vs_conn_tab_size);
3365                 break;
3366         }
3367
3368         genlmsg_end(msg, reply);
3369         ret = genlmsg_reply(msg, info);
3370         goto out;
3371
3372 nla_put_failure:
3373         pr_err("not enough space in Netlink message\n");
3374         ret = -EMSGSIZE;
3375
3376 out_err:
3377         nlmsg_free(msg);
3378 out:
3379         mutex_unlock(&__ip_vs_mutex);
3380
3381         return ret;
3382 }
3383
3384
3385 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3386         {
3387                 .cmd    = IPVS_CMD_NEW_SERVICE,
3388                 .flags  = GENL_ADMIN_PERM,
3389                 .policy = ip_vs_cmd_policy,
3390                 .doit   = ip_vs_genl_set_cmd,
3391         },
3392         {
3393                 .cmd    = IPVS_CMD_SET_SERVICE,
3394                 .flags  = GENL_ADMIN_PERM,
3395                 .policy = ip_vs_cmd_policy,
3396                 .doit   = ip_vs_genl_set_cmd,
3397         },
3398         {
3399                 .cmd    = IPVS_CMD_DEL_SERVICE,
3400                 .flags  = GENL_ADMIN_PERM,
3401                 .policy = ip_vs_cmd_policy,
3402                 .doit   = ip_vs_genl_set_cmd,
3403         },
3404         {
3405                 .cmd    = IPVS_CMD_GET_SERVICE,
3406                 .flags  = GENL_ADMIN_PERM,
3407                 .doit   = ip_vs_genl_get_cmd,
3408                 .dumpit = ip_vs_genl_dump_services,
3409                 .policy = ip_vs_cmd_policy,
3410         },
3411         {
3412                 .cmd    = IPVS_CMD_NEW_DEST,
3413                 .flags  = GENL_ADMIN_PERM,
3414                 .policy = ip_vs_cmd_policy,
3415                 .doit   = ip_vs_genl_set_cmd,
3416         },
3417         {
3418                 .cmd    = IPVS_CMD_SET_DEST,
3419                 .flags  = GENL_ADMIN_PERM,
3420                 .policy = ip_vs_cmd_policy,
3421                 .doit   = ip_vs_genl_set_cmd,
3422         },
3423         {
3424                 .cmd    = IPVS_CMD_DEL_DEST,
3425                 .flags  = GENL_ADMIN_PERM,
3426                 .policy = ip_vs_cmd_policy,
3427                 .doit   = ip_vs_genl_set_cmd,
3428         },
3429         {
3430                 .cmd    = IPVS_CMD_GET_DEST,
3431                 .flags  = GENL_ADMIN_PERM,
3432                 .policy = ip_vs_cmd_policy,
3433                 .dumpit = ip_vs_genl_dump_dests,
3434         },
3435         {
3436                 .cmd    = IPVS_CMD_NEW_DAEMON,
3437                 .flags  = GENL_ADMIN_PERM,
3438                 .policy = ip_vs_cmd_policy,
3439                 .doit   = ip_vs_genl_set_cmd,
3440         },
3441         {
3442                 .cmd    = IPVS_CMD_DEL_DAEMON,
3443                 .flags  = GENL_ADMIN_PERM,
3444                 .policy = ip_vs_cmd_policy,
3445                 .doit   = ip_vs_genl_set_cmd,
3446         },
3447         {
3448                 .cmd    = IPVS_CMD_GET_DAEMON,
3449                 .flags  = GENL_ADMIN_PERM,
3450                 .dumpit = ip_vs_genl_dump_daemons,
3451         },
3452         {
3453                 .cmd    = IPVS_CMD_SET_CONFIG,
3454                 .flags  = GENL_ADMIN_PERM,
3455                 .policy = ip_vs_cmd_policy,
3456                 .doit   = ip_vs_genl_set_cmd,
3457         },
3458         {
3459                 .cmd    = IPVS_CMD_GET_CONFIG,
3460                 .flags  = GENL_ADMIN_PERM,
3461                 .doit   = ip_vs_genl_get_cmd,
3462         },
3463         {
3464                 .cmd    = IPVS_CMD_GET_INFO,
3465                 .flags  = GENL_ADMIN_PERM,
3466                 .doit   = ip_vs_genl_get_cmd,
3467         },
3468         {
3469                 .cmd    = IPVS_CMD_ZERO,
3470                 .flags  = GENL_ADMIN_PERM,
3471                 .policy = ip_vs_cmd_policy,
3472                 .doit   = ip_vs_genl_set_cmd,
3473         },
3474         {
3475                 .cmd    = IPVS_CMD_FLUSH,
3476                 .flags  = GENL_ADMIN_PERM,
3477                 .doit   = ip_vs_genl_set_cmd,
3478         },
3479 };
3480
3481 static int __init ip_vs_genl_register(void)
3482 {
3483         return genl_register_family_with_ops(&ip_vs_genl_family,
3484                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3485 }
3486
3487 static void ip_vs_genl_unregister(void)
3488 {
3489         genl_unregister_family(&ip_vs_genl_family);
3490 }
3491
3492 /* End of Generic Netlink interface definitions */
3493
/*
 * Per-netns init/exit functions.
 */
3497 int __net_init __ip_vs_control_init(struct net *net)
3498 {
3499         int idx;
3500         struct netns_ipvs *ipvs = net_ipvs(net);
3501         struct ctl_table *tbl;
3502
3503         atomic_set(&ipvs->dropentry, 0);
3504         spin_lock_init(&ipvs->dropentry_lock);
3505         spin_lock_init(&ipvs->droppacket_lock);
3506         spin_lock_init(&ipvs->securetcp_lock);
3507         ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
3508
3509         /* Initialize rs_table */
3510         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3511                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3512
3513         INIT_LIST_HEAD(&ipvs->dest_trash);
3514         atomic_set(&ipvs->ftpsvc_counter, 0);
3515         atomic_set(&ipvs->nullsvc_counter, 0);
3516
3517         /* procfs stats */
3518         ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3519         if (!ipvs->tot_stats.cpustats) {
3520                 pr_err("%s() alloc_percpu failed\n", __func__);
3521                 goto err_alloc;
3522         }
3523         spin_lock_init(&ipvs->tot_stats.lock);
3524
3525         proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
3526         proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
3527         proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
3528                              &ip_vs_stats_percpu_fops);
3529
3530         if (!net_eq(net, &init_net)) {
3531                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3532                 if (tbl == NULL)
3533                         goto err_dup;
3534         } else
3535                 tbl = vs_vars;
3536         /* Initialize sysctl defaults */
3537         idx = 0;
3538         ipvs->sysctl_amemthresh = 1024;
3539         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3540         ipvs->sysctl_am_droprate = 10;
3541         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3542         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3543         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3544 #ifdef CONFIG_IP_VS_NFCT
3545         tbl[idx++].data = &ipvs->sysctl_conntrack;
3546 #endif
3547         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3548         ipvs->sysctl_snat_reroute = 1;
3549         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3550         ipvs->sysctl_sync_ver = 1;
3551         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3552         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3553         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3554         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3555         ipvs->sysctl_sync_threshold[0] = 3;
3556         ipvs->sysctl_sync_threshold[1] = 50;
3557         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3558         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3559         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3560
3561
3562 #ifdef CONFIG_SYSCTL
3563         ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
3564                                                      tbl);
3565         if (ipvs->sysctl_hdr == NULL) {
3566                 if (!net_eq(net, &init_net))
3567                         kfree(tbl);
3568                 goto err_dup;
3569         }
3570 #endif
3571         ip_vs_new_estimator(net, &ipvs->tot_stats);
3572         ipvs->sysctl_tbl = tbl;
3573         /* Schedule defense work */
3574         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3575         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3576         return 0;
3577
3578 err_dup:
3579         free_percpu(ipvs->tot_stats.cpustats);
3580 err_alloc:
3581         return -ENOMEM;
3582 }
3583
3584 static void __net_exit __ip_vs_control_cleanup(struct net *net)
3585 {
3586         struct netns_ipvs *ipvs = net_ipvs(net);
3587
3588         ip_vs_trash_cleanup(net);
3589         ip_vs_kill_estimator(net, &ipvs->tot_stats);
3590         cancel_delayed_work_sync(&ipvs->defense_work);
3591         cancel_work_sync(&ipvs->defense_work.work);
3592 #ifdef CONFIG_SYSCTL
3593         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3594 #endif
3595         proc_net_remove(net, "ip_vs_stats_percpu");
3596         proc_net_remove(net, "ip_vs_stats");
3597         proc_net_remove(net, "ip_vs");
3598         free_percpu(ipvs->tot_stats.cpustats);
3599 }
3600
3601 static struct pernet_operations ipvs_control_ops = {
3602         .init = __ip_vs_control_init,
3603         .exit = __ip_vs_control_cleanup,
3604 };
3605
3606 int __init ip_vs_control_init(void)
3607 {
3608         int idx;
3609         int ret;
3610
3611         EnterFunction(2);
3612
3613         /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3614         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
3615                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3616                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3617         }
3618
3619         ret = register_pernet_subsys(&ipvs_control_ops);
3620         if (ret) {
3621                 pr_err("cannot register namespace.\n");
3622                 goto err;
3623         }
3624
3625         smp_wmb();      /* Do we really need it now ? */
3626
3627         ret = nf_register_sockopt(&ip_vs_sockopts);
3628         if (ret) {
3629                 pr_err("cannot register sockopt.\n");
3630                 goto err_net;
3631         }
3632
3633         ret = ip_vs_genl_register();
3634         if (ret) {
3635                 pr_err("cannot register Generic Netlink interface.\n");
3636                 nf_unregister_sockopt(&ip_vs_sockopts);
3637                 goto err_net;
3638         }
3639
3640         LeaveFunction(2);
3641         return 0;
3642
3643 err_net:
3644         unregister_pernet_subsys(&ipvs_control_ops);
3645 err:
3646         return ret;
3647 }
3648
3649
3650 void ip_vs_control_cleanup(void)
3651 {
3652         EnterFunction(2);
3653         unregister_pernet_subsys(&ipvs_control_ops);
3654         ip_vs_genl_unregister();
3655         nf_unregister_sockopt(&ip_vs_sockopts);
3656         LeaveFunction(2);
3657 }