f0369d665088ded9d109ce831684c90117b937c5
[linux-2.6.git] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
/*
 * Mutex serializing the IPVS sockopt handlers.  A sleeping lock is used
 * because [gs]etsockopt may sleep.
 */
static DEFINE_MUTEX(__ip_vs_mutex);

/* Reader/writer lock protecting the virtual service tables. */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
/* Current debug level; starts at 0 (static storage is zero-initialised). */
static int sysctl_ip_vs_debug_level;

/* Return the current IPVS debug level. */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
70 #endif
71
72 #ifdef CONFIG_IP_VS_IPV6
73 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
74 static int __ip_vs_addr_is_local_v6(struct net *net,
75                                     const struct in6_addr *addr)
76 {
77         struct rt6_info *rt;
78         struct flowi fl = {
79                 .oif = 0,
80                 .fl6_dst = *addr,
81                 .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
82         };
83
84         rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
85         if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
86                         return 1;
87
88         return 0;
89 }
90 #endif
91 /*
92  *      update_defense_level is called from keventd and from sysctl,
93  *      so it needs to protect itself from softirqs
94  */
95 static void update_defense_level(struct netns_ipvs *ipvs)
96 {
97         struct sysinfo i;
98         static int old_secure_tcp = 0;
99         int availmem;
100         int nomem;
101         int to_change = -1;
102
103         /* we only count free and buffered memory (in pages) */
104         si_meminfo(&i);
105         availmem = i.freeram + i.bufferram;
106         /* however in linux 2.5 the i.bufferram is total page cache size,
107            we need adjust it */
108         /* si_swapinfo(&i); */
109         /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111         nomem = (availmem < ipvs->sysctl_amemthresh);
112
113         local_bh_disable();
114
115         /* drop_entry */
116         spin_lock(&ipvs->dropentry_lock);
117         switch (ipvs->sysctl_drop_entry) {
118         case 0:
119                 atomic_set(&ipvs->dropentry, 0);
120                 break;
121         case 1:
122                 if (nomem) {
123                         atomic_set(&ipvs->dropentry, 1);
124                         ipvs->sysctl_drop_entry = 2;
125                 } else {
126                         atomic_set(&ipvs->dropentry, 0);
127                 }
128                 break;
129         case 2:
130                 if (nomem) {
131                         atomic_set(&ipvs->dropentry, 1);
132                 } else {
133                         atomic_set(&ipvs->dropentry, 0);
134                         ipvs->sysctl_drop_entry = 1;
135                 };
136                 break;
137         case 3:
138                 atomic_set(&ipvs->dropentry, 1);
139                 break;
140         }
141         spin_unlock(&ipvs->dropentry_lock);
142
143         /* drop_packet */
144         spin_lock(&ipvs->droppacket_lock);
145         switch (ipvs->sysctl_drop_packet) {
146         case 0:
147                 ipvs->drop_rate = 0;
148                 break;
149         case 1:
150                 if (nomem) {
151                         ipvs->drop_rate = ipvs->drop_counter
152                                 = ipvs->sysctl_amemthresh /
153                                 (ipvs->sysctl_amemthresh-availmem);
154                         ipvs->sysctl_drop_packet = 2;
155                 } else {
156                         ipvs->drop_rate = 0;
157                 }
158                 break;
159         case 2:
160                 if (nomem) {
161                         ipvs->drop_rate = ipvs->drop_counter
162                                 = ipvs->sysctl_amemthresh /
163                                 (ipvs->sysctl_amemthresh-availmem);
164                 } else {
165                         ipvs->drop_rate = 0;
166                         ipvs->sysctl_drop_packet = 1;
167                 }
168                 break;
169         case 3:
170                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
171                 break;
172         }
173         spin_unlock(&ipvs->droppacket_lock);
174
175         /* secure_tcp */
176         spin_lock(&ipvs->securetcp_lock);
177         switch (ipvs->sysctl_secure_tcp) {
178         case 0:
179                 if (old_secure_tcp >= 2)
180                         to_change = 0;
181                 break;
182         case 1:
183                 if (nomem) {
184                         if (old_secure_tcp < 2)
185                                 to_change = 1;
186                         ipvs->sysctl_secure_tcp = 2;
187                 } else {
188                         if (old_secure_tcp >= 2)
189                                 to_change = 0;
190                 }
191                 break;
192         case 2:
193                 if (nomem) {
194                         if (old_secure_tcp < 2)
195                                 to_change = 1;
196                 } else {
197                         if (old_secure_tcp >= 2)
198                                 to_change = 0;
199                         ipvs->sysctl_secure_tcp = 1;
200                 }
201                 break;
202         case 3:
203                 if (old_secure_tcp < 2)
204                         to_change = 1;
205                 break;
206         }
207         old_secure_tcp = ipvs->sysctl_secure_tcp;
208         if (to_change >= 0)
209                 ip_vs_protocol_timeout_change(ipvs,
210                                               ipvs->sysctl_secure_tcp > 1);
211         spin_unlock(&ipvs->securetcp_lock);
212
213         local_bh_enable();
214 }
215
216
217 /*
218  *      Timer for checking the defense
219  */
220 #define DEFENSE_TIMER_PERIOD    1*HZ
221
222 static void defense_work_handler(struct work_struct *work)
223 {
224         struct netns_ipvs *ipvs =
225                 container_of(work, struct netns_ipvs, defense_work.work);
226
227         update_defense_level(ipvs);
228         if (atomic_read(&ipvs->dropentry))
229                 ip_vs_random_dropentry(ipvs->net);
230         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
231 }
232
233 int
234 ip_vs_use_count_inc(void)
235 {
236         return try_module_get(THIS_MODULE);
237 }
238
239 void
240 ip_vs_use_count_dec(void)
241 {
242         module_put(THIS_MODULE);
243 }
244
245
246 /*
247  *      Hash table: for virtual service lookups
248  */
249 #define IP_VS_SVC_TAB_BITS 8
250 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
251 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
252
253 /* the service table hashed by <protocol, addr, port> */
254 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
255 /* the service table hashed by fwmark */
256 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
257
258
259 /*
260  *      Returns hash value for virtual service
261  */
262 static inline unsigned
263 ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
264                   const union nf_inet_addr *addr, __be16 port)
265 {
266         register unsigned porth = ntohs(port);
267         __be32 addr_fold = addr->ip;
268
269 #ifdef CONFIG_IP_VS_IPV6
270         if (af == AF_INET6)
271                 addr_fold = addr->ip6[0]^addr->ip6[1]^
272                             addr->ip6[2]^addr->ip6[3];
273 #endif
274         addr_fold ^= ((size_t)net>>8);
275
276         return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
277                 & IP_VS_SVC_TAB_MASK;
278 }
279
280 /*
281  *      Returns hash value of fwmark for virtual service lookup
282  */
283 static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
284 {
285         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
286 }
287
288 /*
289  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
290  *      or in the ip_vs_svc_fwm_table by fwmark.
291  *      Should be called with locked tables.
292  */
293 static int ip_vs_svc_hash(struct ip_vs_service *svc)
294 {
295         unsigned hash;
296
297         if (svc->flags & IP_VS_SVC_F_HASHED) {
298                 pr_err("%s(): request for already hashed, called from %pF\n",
299                        __func__, __builtin_return_address(0));
300                 return 0;
301         }
302
303         if (svc->fwmark == 0) {
304                 /*
305                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
306                  */
307                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
308                                          &svc->addr, svc->port);
309                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
310         } else {
311                 /*
312                  *  Hash it by fwmark in svc_fwm_table
313                  */
314                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
315                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
316         }
317
318         svc->flags |= IP_VS_SVC_F_HASHED;
319         /* increase its refcnt because it is referenced by the svc table */
320         atomic_inc(&svc->refcnt);
321         return 1;
322 }
323
324
325 /*
326  *      Unhashes a service from svc_table / svc_fwm_table.
327  *      Should be called with locked tables.
328  */
329 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
330 {
331         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
332                 pr_err("%s(): request for unhash flagged, called from %pF\n",
333                        __func__, __builtin_return_address(0));
334                 return 0;
335         }
336
337         if (svc->fwmark == 0) {
338                 /* Remove it from the svc_table table */
339                 list_del(&svc->s_list);
340         } else {
341                 /* Remove it from the svc_fwm_table table */
342                 list_del(&svc->f_list);
343         }
344
345         svc->flags &= ~IP_VS_SVC_F_HASHED;
346         atomic_dec(&svc->refcnt);
347         return 1;
348 }
349
350
351 /*
352  *      Get service by {netns, proto,addr,port} in the service table.
353  */
354 static inline struct ip_vs_service *
355 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
356                      const union nf_inet_addr *vaddr, __be16 vport)
357 {
358         unsigned hash;
359         struct ip_vs_service *svc;
360
361         /* Check for "full" addressed entries */
362         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
363
364         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
365                 if ((svc->af == af)
366                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
367                     && (svc->port == vport)
368                     && (svc->protocol == protocol)
369                     && net_eq(svc->net, net)) {
370                         /* HIT */
371                         return svc;
372                 }
373         }
374
375         return NULL;
376 }
377
378
379 /*
380  *      Get service by {fwmark} in the service table.
381  */
382 static inline struct ip_vs_service *
383 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
384 {
385         unsigned hash;
386         struct ip_vs_service *svc;
387
388         /* Check for fwmark addressed entries */
389         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
390
391         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
392                 if (svc->fwmark == fwmark && svc->af == af
393                     && net_eq(svc->net, net)) {
394                         /* HIT */
395                         return svc;
396                 }
397         }
398
399         return NULL;
400 }
401
402 struct ip_vs_service *
403 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
404                   const union nf_inet_addr *vaddr, __be16 vport)
405 {
406         struct ip_vs_service *svc;
407         struct netns_ipvs *ipvs = net_ipvs(net);
408
409         read_lock(&__ip_vs_svc_lock);
410
411         /*
412          *      Check the table hashed by fwmark first
413          */
414         if (fwmark) {
415                 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
416                 if (svc)
417                         goto out;
418         }
419
420         /*
421          *      Check the table hashed by <protocol,addr,port>
422          *      for "full" addressed entries
423          */
424         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
425
426         if (svc == NULL
427             && protocol == IPPROTO_TCP
428             && atomic_read(&ipvs->ftpsvc_counter)
429             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
430                 /*
431                  * Check if ftp service entry exists, the packet
432                  * might belong to FTP data connections.
433                  */
434                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
435         }
436
437         if (svc == NULL
438             && atomic_read(&ipvs->nullsvc_counter)) {
439                 /*
440                  * Check if the catch-all port (port zero) exists
441                  */
442                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
443         }
444
445   out:
446         if (svc)
447                 atomic_inc(&svc->usecnt);
448         read_unlock(&__ip_vs_svc_lock);
449
450         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
451                       fwmark, ip_vs_proto_name(protocol),
452                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
453                       svc ? "hit" : "not hit");
454
455         return svc;
456 }
457
458
459 static inline void
460 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
461 {
462         atomic_inc(&svc->refcnt);
463         dest->svc = svc;
464 }
465
466 static void
467 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
468 {
469         struct ip_vs_service *svc = dest->svc;
470
471         dest->svc = NULL;
472         if (atomic_dec_and_test(&svc->refcnt)) {
473                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
474                               svc->fwmark,
475                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
476                               ntohs(svc->port), atomic_read(&svc->usecnt));
477                 free_percpu(svc->stats.cpustats);
478                 kfree(svc);
479         }
480 }
481
482
483 /*
484  *      Returns hash value for real service
485  */
486 static inline unsigned ip_vs_rs_hashkey(int af,
487                                             const union nf_inet_addr *addr,
488                                             __be16 port)
489 {
490         register unsigned porth = ntohs(port);
491         __be32 addr_fold = addr->ip;
492
493 #ifdef CONFIG_IP_VS_IPV6
494         if (af == AF_INET6)
495                 addr_fold = addr->ip6[0]^addr->ip6[1]^
496                             addr->ip6[2]^addr->ip6[3];
497 #endif
498
499         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
500                 & IP_VS_RTAB_MASK;
501 }
502
503 /*
504  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
505  *      should be called with locked tables.
506  */
507 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
508 {
509         unsigned hash;
510
511         if (!list_empty(&dest->d_list)) {
512                 return 0;
513         }
514
515         /*
516          *      Hash by proto,addr,port,
517          *      which are the parameters of the real service.
518          */
519         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
520
521         list_add(&dest->d_list, &ipvs->rs_table[hash]);
522
523         return 1;
524 }
525
526 /*
527  *      UNhashes ip_vs_dest from rs_table.
528  *      should be called with locked tables.
529  */
530 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
531 {
532         /*
533          * Remove it from the rs_table table.
534          */
535         if (!list_empty(&dest->d_list)) {
536                 list_del(&dest->d_list);
537                 INIT_LIST_HEAD(&dest->d_list);
538         }
539
540         return 1;
541 }
542
543 /*
544  *      Lookup real service by <proto,addr,port> in the real service table.
545  */
546 struct ip_vs_dest *
547 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
548                           const union nf_inet_addr *daddr,
549                           __be16 dport)
550 {
551         struct netns_ipvs *ipvs = net_ipvs(net);
552         unsigned hash;
553         struct ip_vs_dest *dest;
554
555         /*
556          *      Check for "full" addressed entries
557          *      Return the first found entry
558          */
559         hash = ip_vs_rs_hashkey(af, daddr, dport);
560
561         read_lock(&ipvs->rs_lock);
562         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
563                 if ((dest->af == af)
564                     && ip_vs_addr_equal(af, &dest->addr, daddr)
565                     && (dest->port == dport)
566                     && ((dest->protocol == protocol) ||
567                         dest->vfwmark)) {
568                         /* HIT */
569                         read_unlock(&ipvs->rs_lock);
570                         return dest;
571                 }
572         }
573         read_unlock(&ipvs->rs_lock);
574
575         return NULL;
576 }
577
578 /*
579  *      Lookup destination by {addr,port} in the given service
580  */
581 static struct ip_vs_dest *
582 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
583                   __be16 dport)
584 {
585         struct ip_vs_dest *dest;
586
587         /*
588          * Find the destination for the given service
589          */
590         list_for_each_entry(dest, &svc->destinations, n_list) {
591                 if ((dest->af == svc->af)
592                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
593                     && (dest->port == dport)) {
594                         /* HIT */
595                         return dest;
596                 }
597         }
598
599         return NULL;
600 }
601
602 /*
603  * Find destination by {daddr,dport,vaddr,protocol}
604  * Cretaed to be used in ip_vs_process_message() in
605  * the backup synchronization daemon. It finds the
606  * destination to be bound to the received connection
607  * on the backup.
608  *
609  * ip_vs_lookup_real_service() looked promissing, but
610  * seems not working as expected.
611  */
612 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
613                                    const union nf_inet_addr *daddr,
614                                    __be16 dport,
615                                    const union nf_inet_addr *vaddr,
616                                    __be16 vport, __u16 protocol, __u32 fwmark)
617 {
618         struct ip_vs_dest *dest;
619         struct ip_vs_service *svc;
620
621         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
622         if (!svc)
623                 return NULL;
624         dest = ip_vs_lookup_dest(svc, daddr, dport);
625         if (dest)
626                 atomic_inc(&dest->refcnt);
627         ip_vs_service_put(svc);
628         return dest;
629 }
630
631 /*
632  *  Lookup dest by {svc,addr,port} in the destination trash.
633  *  The destination trash is used to hold the destinations that are removed
634  *  from the service table but are still referenced by some conn entries.
635  *  The reason to add the destination trash is when the dest is temporary
636  *  down (either by administrator or by monitor program), the dest can be
637  *  picked back from the trash, the remaining connections to the dest can
638  *  continue, and the counting information of the dest is also useful for
639  *  scheduling.
640  */
641 static struct ip_vs_dest *
642 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
643                      __be16 dport)
644 {
645         struct ip_vs_dest *dest, *nxt;
646         struct netns_ipvs *ipvs = net_ipvs(svc->net);
647
648         /*
649          * Find the destination in trash
650          */
651         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
652                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
653                               "dest->refcnt=%d\n",
654                               dest->vfwmark,
655                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
656                               ntohs(dest->port),
657                               atomic_read(&dest->refcnt));
658                 if (dest->af == svc->af &&
659                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
660                     dest->port == dport &&
661                     dest->vfwmark == svc->fwmark &&
662                     dest->protocol == svc->protocol &&
663                     (svc->fwmark ||
664                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
665                       dest->vport == svc->port))) {
666                         /* HIT */
667                         return dest;
668                 }
669
670                 /*
671                  * Try to purge the destination from trash if not referenced
672                  */
673                 if (atomic_read(&dest->refcnt) == 1) {
674                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
675                                       "from trash\n",
676                                       dest->vfwmark,
677                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
678                                       ntohs(dest->port));
679                         list_del(&dest->n_list);
680                         ip_vs_dst_reset(dest);
681                         __ip_vs_unbind_svc(dest);
682                         free_percpu(dest->stats.cpustats);
683                         kfree(dest);
684                 }
685         }
686
687         return NULL;
688 }
689
690
691 /*
692  *  Clean up all the destinations in the trash
693  *  Called by the ip_vs_control_cleanup()
694  *
695  *  When the ip_vs_control_clearup is activated by ipvs module exit,
696  *  the service tables must have been flushed and all the connections
697  *  are expired, and the refcnt of each destination in the trash must
698  *  be 1, so we simply release them here.
699  */
700 static void ip_vs_trash_cleanup(struct net *net)
701 {
702         struct ip_vs_dest *dest, *nxt;
703         struct netns_ipvs *ipvs = net_ipvs(net);
704
705         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
706                 list_del(&dest->n_list);
707                 ip_vs_dst_reset(dest);
708                 __ip_vs_unbind_svc(dest);
709                 free_percpu(dest->stats.cpustats);
710                 kfree(dest);
711         }
712 }
713
714
715 static void
716 ip_vs_zero_stats(struct ip_vs_stats *stats)
717 {
718         spin_lock_bh(&stats->lock);
719
720         memset(&stats->ustats, 0, sizeof(stats->ustats));
721         ip_vs_zero_estimator(stats);
722
723         spin_unlock_bh(&stats->lock);
724 }
725
726 /*
727  *      Update a destination in the given service
728  */
729 static void
730 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
731                     struct ip_vs_dest_user_kern *udest, int add)
732 {
733         struct netns_ipvs *ipvs = net_ipvs(svc->net);
734         int conn_flags;
735
736         /* set the weight and the flags */
737         atomic_set(&dest->weight, udest->weight);
738         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
739         conn_flags |= IP_VS_CONN_F_INACTIVE;
740
741         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
742         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
743                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
744         } else {
745                 /*
746                  *    Put the real service in rs_table if not present.
747                  *    For now only for NAT!
748                  */
749                 write_lock_bh(&ipvs->rs_lock);
750                 ip_vs_rs_hash(ipvs, dest);
751                 write_unlock_bh(&ipvs->rs_lock);
752         }
753         atomic_set(&dest->conn_flags, conn_flags);
754
755         /* bind the service */
756         if (!dest->svc) {
757                 __ip_vs_bind_svc(dest, svc);
758         } else {
759                 if (dest->svc != svc) {
760                         __ip_vs_unbind_svc(dest);
761                         ip_vs_zero_stats(&dest->stats);
762                         __ip_vs_bind_svc(dest, svc);
763                 }
764         }
765
766         /* set the dest status flags */
767         dest->flags |= IP_VS_DEST_F_AVAILABLE;
768
769         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
770                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
771         dest->u_threshold = udest->u_threshold;
772         dest->l_threshold = udest->l_threshold;
773
774         spin_lock(&dest->dst_lock);
775         ip_vs_dst_reset(dest);
776         spin_unlock(&dest->dst_lock);
777
778         if (add)
779                 ip_vs_new_estimator(svc->net, &dest->stats);
780
781         write_lock_bh(&__ip_vs_svc_lock);
782
783         /* Wait until all other svc users go away */
784         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
785
786         if (add) {
787                 list_add(&dest->n_list, &svc->destinations);
788                 svc->num_dests++;
789         }
790
791         /* call the update_service, because server weight may be changed */
792         if (svc->scheduler->update_service)
793                 svc->scheduler->update_service(svc);
794
795         write_unlock_bh(&__ip_vs_svc_lock);
796 }
797
798
799 /*
800  *      Create a destination for the given service
801  */
802 static int
803 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
804                struct ip_vs_dest **dest_p)
805 {
806         struct ip_vs_dest *dest;
807         unsigned atype;
808
809         EnterFunction(2);
810
811 #ifdef CONFIG_IP_VS_IPV6
812         if (svc->af == AF_INET6) {
813                 atype = ipv6_addr_type(&udest->addr.in6);
814                 if ((!(atype & IPV6_ADDR_UNICAST) ||
815                         atype & IPV6_ADDR_LINKLOCAL) &&
816                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
817                         return -EINVAL;
818         } else
819 #endif
820         {
821                 atype = inet_addr_type(svc->net, udest->addr.ip);
822                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
823                         return -EINVAL;
824         }
825
826         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
827         if (dest == NULL) {
828                 pr_err("%s(): no memory.\n", __func__);
829                 return -ENOMEM;
830         }
831         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
832         if (!dest->stats.cpustats) {
833                 pr_err("%s() alloc_percpu failed\n", __func__);
834                 goto err_alloc;
835         }
836
837         dest->af = svc->af;
838         dest->protocol = svc->protocol;
839         dest->vaddr = svc->addr;
840         dest->vport = svc->port;
841         dest->vfwmark = svc->fwmark;
842         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
843         dest->port = udest->port;
844
845         atomic_set(&dest->activeconns, 0);
846         atomic_set(&dest->inactconns, 0);
847         atomic_set(&dest->persistconns, 0);
848         atomic_set(&dest->refcnt, 1);
849
850         INIT_LIST_HEAD(&dest->d_list);
851         spin_lock_init(&dest->dst_lock);
852         spin_lock_init(&dest->stats.lock);
853         __ip_vs_update_dest(svc, dest, udest, 1);
854
855         *dest_p = dest;
856
857         LeaveFunction(2);
858         return 0;
859
860 err_alloc:
861         kfree(dest);
862         return -ENOMEM;
863 }
864
865
866 /*
867  *      Add a destination into an existing service
868  */
869 static int
870 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
871 {
872         struct ip_vs_dest *dest;
873         union nf_inet_addr daddr;
874         __be16 dport = udest->port;
875         int ret;
876
877         EnterFunction(2);
878
879         if (udest->weight < 0) {
880                 pr_err("%s(): server weight less than zero\n", __func__);
881                 return -ERANGE;
882         }
883
884         if (udest->l_threshold > udest->u_threshold) {
885                 pr_err("%s(): lower threshold is higher than upper threshold\n",
886                         __func__);
887                 return -ERANGE;
888         }
889
890         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
891
892         /*
893          * Check if the dest already exists in the list
894          */
895         dest = ip_vs_lookup_dest(svc, &daddr, dport);
896
897         if (dest != NULL) {
898                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
899                 return -EEXIST;
900         }
901
902         /*
903          * Check if the dest already exists in the trash and
904          * is from the same service
905          */
906         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
907
908         if (dest != NULL) {
909                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
910                               "dest->refcnt=%d, service %u/%s:%u\n",
911                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
912                               atomic_read(&dest->refcnt),
913                               dest->vfwmark,
914                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
915                               ntohs(dest->vport));
916
917                 /*
918                  * Get the destination from the trash
919                  */
920                 list_del(&dest->n_list);
921
922                 __ip_vs_update_dest(svc, dest, udest, 1);
923                 ret = 0;
924         } else {
925                 /*
926                  * Allocate and initialize the dest structure
927                  */
928                 ret = ip_vs_new_dest(svc, udest, &dest);
929         }
930         LeaveFunction(2);
931
932         return ret;
933 }
934
935
936 /*
937  *      Edit a destination in the given service
938  */
939 static int
940 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
941 {
942         struct ip_vs_dest *dest;
943         union nf_inet_addr daddr;
944         __be16 dport = udest->port;
945
946         EnterFunction(2);
947
948         if (udest->weight < 0) {
949                 pr_err("%s(): server weight less than zero\n", __func__);
950                 return -ERANGE;
951         }
952
953         if (udest->l_threshold > udest->u_threshold) {
954                 pr_err("%s(): lower threshold is higher than upper threshold\n",
955                         __func__);
956                 return -ERANGE;
957         }
958
959         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
960
961         /*
962          *  Lookup the destination list
963          */
964         dest = ip_vs_lookup_dest(svc, &daddr, dport);
965
966         if (dest == NULL) {
967                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
968                 return -ENOENT;
969         }
970
971         __ip_vs_update_dest(svc, dest, udest, 0);
972         LeaveFunction(2);
973
974         return 0;
975 }
976
977
978 /*
979  *      Delete a destination (must be already unlinked from the service)
980  */
981 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
982 {
983         struct netns_ipvs *ipvs = net_ipvs(net);
984
985         ip_vs_kill_estimator(net, &dest->stats);
986
987         /*
988          *  Remove it from the d-linked list with the real services.
989          */
990         write_lock_bh(&ipvs->rs_lock);
991         ip_vs_rs_unhash(dest);
992         write_unlock_bh(&ipvs->rs_lock);
993
994         /*
995          *  Decrease the refcnt of the dest, and free the dest
996          *  if nobody refers to it (refcnt=0). Otherwise, throw
997          *  the destination into the trash.
998          */
999         if (atomic_dec_and_test(&dest->refcnt)) {
1000                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1001                               dest->vfwmark,
1002                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1003                               ntohs(dest->port));
1004                 ip_vs_dst_reset(dest);
1005                 /* simply decrease svc->refcnt here, let the caller check
1006                    and release the service if nobody refers to it.
1007                    Only user context can release destination and service,
1008                    and only one user context can update virtual service at a
1009                    time, so the operation here is OK */
1010                 atomic_dec(&dest->svc->refcnt);
1011                 free_percpu(dest->stats.cpustats);
1012                 kfree(dest);
1013         } else {
1014                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1015                               "dest->refcnt=%d\n",
1016                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1017                               ntohs(dest->port),
1018                               atomic_read(&dest->refcnt));
1019                 list_add(&dest->n_list, &ipvs->dest_trash);
1020                 atomic_inc(&dest->refcnt);
1021         }
1022 }
1023
1024
1025 /*
1026  *      Unlink a destination from the given service
1027  */
1028 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1029                                 struct ip_vs_dest *dest,
1030                                 int svcupd)
1031 {
1032         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1033
1034         /*
1035          *  Remove it from the d-linked destination list.
1036          */
1037         list_del(&dest->n_list);
1038         svc->num_dests--;
1039
1040         /*
1041          *  Call the update_service function of its scheduler
1042          */
1043         if (svcupd && svc->scheduler->update_service)
1044                         svc->scheduler->update_service(svc);
1045 }
1046
1047
1048 /*
1049  *      Delete a destination server in the given service
1050  */
1051 static int
1052 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1053 {
1054         struct ip_vs_dest *dest;
1055         __be16 dport = udest->port;
1056
1057         EnterFunction(2);
1058
1059         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1060
1061         if (dest == NULL) {
1062                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1063                 return -ENOENT;
1064         }
1065
1066         write_lock_bh(&__ip_vs_svc_lock);
1067
1068         /*
1069          *      Wait until all other svc users go away.
1070          */
1071         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1072
1073         /*
1074          *      Unlink dest from the service
1075          */
1076         __ip_vs_unlink_dest(svc, dest, 1);
1077
1078         write_unlock_bh(&__ip_vs_svc_lock);
1079
1080         /*
1081          *      Delete the destination
1082          */
1083         __ip_vs_del_dest(svc->net, dest);
1084
1085         LeaveFunction(2);
1086
1087         return 0;
1088 }
1089
1090
1091 /*
1092  *      Add a service into the service hash table
1093  */
1094 static int
1095 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1096                   struct ip_vs_service **svc_p)
1097 {
1098         int ret = 0;
1099         struct ip_vs_scheduler *sched = NULL;
1100         struct ip_vs_pe *pe = NULL;
1101         struct ip_vs_service *svc = NULL;
1102         struct netns_ipvs *ipvs = net_ipvs(net);
1103
1104         /* increase the module use count */
1105         ip_vs_use_count_inc();
1106
1107         /* Lookup the scheduler by 'u->sched_name' */
1108         sched = ip_vs_scheduler_get(u->sched_name);
1109         if (sched == NULL) {
1110                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1111                 ret = -ENOENT;
1112                 goto out_err;
1113         }
1114
1115         if (u->pe_name && *u->pe_name) {
1116                 pe = ip_vs_pe_getbyname(u->pe_name);
1117                 if (pe == NULL) {
1118                         pr_info("persistence engine module ip_vs_pe_%s "
1119                                 "not found\n", u->pe_name);
1120                         ret = -ENOENT;
1121                         goto out_err;
1122                 }
1123         }
1124
1125 #ifdef CONFIG_IP_VS_IPV6
1126         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1127                 ret = -EINVAL;
1128                 goto out_err;
1129         }
1130 #endif
1131
1132         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1133         if (svc == NULL) {
1134                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1135                 ret = -ENOMEM;
1136                 goto out_err;
1137         }
1138         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1139         if (!svc->stats.cpustats) {
1140                 pr_err("%s() alloc_percpu failed\n", __func__);
1141                 goto out_err;
1142         }
1143
1144         /* I'm the first user of the service */
1145         atomic_set(&svc->usecnt, 0);
1146         atomic_set(&svc->refcnt, 0);
1147
1148         svc->af = u->af;
1149         svc->protocol = u->protocol;
1150         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1151         svc->port = u->port;
1152         svc->fwmark = u->fwmark;
1153         svc->flags = u->flags;
1154         svc->timeout = u->timeout * HZ;
1155         svc->netmask = u->netmask;
1156         svc->net = net;
1157
1158         INIT_LIST_HEAD(&svc->destinations);
1159         rwlock_init(&svc->sched_lock);
1160         spin_lock_init(&svc->stats.lock);
1161
1162         /* Bind the scheduler */
1163         ret = ip_vs_bind_scheduler(svc, sched);
1164         if (ret)
1165                 goto out_err;
1166         sched = NULL;
1167
1168         /* Bind the ct retriever */
1169         ip_vs_bind_pe(svc, pe);
1170         pe = NULL;
1171
1172         /* Update the virtual service counters */
1173         if (svc->port == FTPPORT)
1174                 atomic_inc(&ipvs->ftpsvc_counter);
1175         else if (svc->port == 0)
1176                 atomic_inc(&ipvs->nullsvc_counter);
1177
1178         ip_vs_new_estimator(net, &svc->stats);
1179
1180         /* Count only IPv4 services for old get/setsockopt interface */
1181         if (svc->af == AF_INET)
1182                 ipvs->num_services++;
1183
1184         /* Hash the service into the service table */
1185         write_lock_bh(&__ip_vs_svc_lock);
1186         ip_vs_svc_hash(svc);
1187         write_unlock_bh(&__ip_vs_svc_lock);
1188
1189         *svc_p = svc;
1190         return 0;
1191
1192
1193  out_err:
1194         if (svc != NULL) {
1195                 ip_vs_unbind_scheduler(svc);
1196                 if (svc->inc) {
1197                         local_bh_disable();
1198                         ip_vs_app_inc_put(svc->inc);
1199                         local_bh_enable();
1200                 }
1201                 if (svc->stats.cpustats)
1202                         free_percpu(svc->stats.cpustats);
1203                 kfree(svc);
1204         }
1205         ip_vs_scheduler_put(sched);
1206         ip_vs_pe_put(pe);
1207
1208         /* decrease the module use count */
1209         ip_vs_use_count_dec();
1210
1211         return ret;
1212 }
1213
1214
1215 /*
1216  *      Edit a service and bind it with a new scheduler
1217  */
1218 static int
1219 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1220 {
1221         struct ip_vs_scheduler *sched, *old_sched;
1222         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1223         int ret = 0;
1224
1225         /*
1226          * Lookup the scheduler, by 'u->sched_name'
1227          */
1228         sched = ip_vs_scheduler_get(u->sched_name);
1229         if (sched == NULL) {
1230                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1231                 return -ENOENT;
1232         }
1233         old_sched = sched;
1234
1235         if (u->pe_name && *u->pe_name) {
1236                 pe = ip_vs_pe_getbyname(u->pe_name);
1237                 if (pe == NULL) {
1238                         pr_info("persistence engine module ip_vs_pe_%s "
1239                                 "not found\n", u->pe_name);
1240                         ret = -ENOENT;
1241                         goto out;
1242                 }
1243                 old_pe = pe;
1244         }
1245
1246 #ifdef CONFIG_IP_VS_IPV6
1247         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1248                 ret = -EINVAL;
1249                 goto out;
1250         }
1251 #endif
1252
1253         write_lock_bh(&__ip_vs_svc_lock);
1254
1255         /*
1256          * Wait until all other svc users go away.
1257          */
1258         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1259
1260         /*
1261          * Set the flags and timeout value
1262          */
1263         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1264         svc->timeout = u->timeout * HZ;
1265         svc->netmask = u->netmask;
1266
1267         old_sched = svc->scheduler;
1268         if (sched != old_sched) {
1269                 /*
1270                  * Unbind the old scheduler
1271                  */
1272                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1273                         old_sched = sched;
1274                         goto out_unlock;
1275                 }
1276
1277                 /*
1278                  * Bind the new scheduler
1279                  */
1280                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1281                         /*
1282                          * If ip_vs_bind_scheduler fails, restore the old
1283                          * scheduler.
1284                          * The main reason of failure is out of memory.
1285                          *
1286                          * The question is if the old scheduler can be
1287                          * restored all the time. TODO: if it cannot be
1288                          * restored some time, we must delete the service,
1289                          * otherwise the system may crash.
1290                          */
1291                         ip_vs_bind_scheduler(svc, old_sched);
1292                         old_sched = sched;
1293                         goto out_unlock;
1294                 }
1295         }
1296
1297         old_pe = svc->pe;
1298         if (pe != old_pe) {
1299                 ip_vs_unbind_pe(svc);
1300                 ip_vs_bind_pe(svc, pe);
1301         }
1302
1303   out_unlock:
1304         write_unlock_bh(&__ip_vs_svc_lock);
1305   out:
1306         ip_vs_scheduler_put(old_sched);
1307         ip_vs_pe_put(old_pe);
1308         return ret;
1309 }
1310
1311
1312 /*
1313  *      Delete a service from the service list
1314  *      - The service must be unlinked, unlocked and not referenced!
1315  *      - We are called under _bh lock
1316  */
1317 static void __ip_vs_del_service(struct ip_vs_service *svc)
1318 {
1319         struct ip_vs_dest *dest, *nxt;
1320         struct ip_vs_scheduler *old_sched;
1321         struct ip_vs_pe *old_pe;
1322         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1323
1324         pr_info("%s: enter\n", __func__);
1325
1326         /* Count only IPv4 services for old get/setsockopt interface */
1327         if (svc->af == AF_INET)
1328                 ipvs->num_services--;
1329
1330         ip_vs_kill_estimator(svc->net, &svc->stats);
1331
1332         /* Unbind scheduler */
1333         old_sched = svc->scheduler;
1334         ip_vs_unbind_scheduler(svc);
1335         ip_vs_scheduler_put(old_sched);
1336
1337         /* Unbind persistence engine */
1338         old_pe = svc->pe;
1339         ip_vs_unbind_pe(svc);
1340         ip_vs_pe_put(old_pe);
1341
1342         /* Unbind app inc */
1343         if (svc->inc) {
1344                 ip_vs_app_inc_put(svc->inc);
1345                 svc->inc = NULL;
1346         }
1347
1348         /*
1349          *    Unlink the whole destination list
1350          */
1351         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1352                 __ip_vs_unlink_dest(svc, dest, 0);
1353                 __ip_vs_del_dest(svc->net, dest);
1354         }
1355
1356         /*
1357          *    Update the virtual service counters
1358          */
1359         if (svc->port == FTPPORT)
1360                 atomic_dec(&ipvs->ftpsvc_counter);
1361         else if (svc->port == 0)
1362                 atomic_dec(&ipvs->nullsvc_counter);
1363
1364         /*
1365          *    Free the service if nobody refers to it
1366          */
1367         if (atomic_read(&svc->refcnt) == 0) {
1368                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1369                               svc->fwmark,
1370                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1371                               ntohs(svc->port), atomic_read(&svc->usecnt));
1372                 free_percpu(svc->stats.cpustats);
1373                 kfree(svc);
1374         }
1375
1376         /* decrease the module use count */
1377         ip_vs_use_count_dec();
1378 }
1379
/*
 * Unlink a service from the service table and delete it; the service
 * memory itself is only freed by __ip_vs_del_service() when its refcnt
 * is 0.
 *
 * Everything runs with __ip_vs_svc_lock write-held (bottom halves off).
 */
static void ip_vs_unlink_service(struct ip_vs_service *svc)
{
        /*
         * Unhash it from the service table
         */
        write_lock_bh(&__ip_vs_svc_lock);

        ip_vs_svc_unhash(svc);

        /*
         * Wait until all the svc users go away.
         */
        IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);

        /* Release everything attached to the service */
        __ip_vs_del_service(svc);

        write_unlock_bh(&__ip_vs_svc_lock);
}
1401
1402 /*
1403  *      Delete a service from the service list
1404  */
1405 static int ip_vs_del_service(struct ip_vs_service *svc)
1406 {
1407         if (svc == NULL)
1408                 return -EEXIST;
1409         ip_vs_unlink_service(svc);
1410
1411         return 0;
1412 }
1413
1414
1415 /*
1416  *      Flush all the virtual services
1417  */
1418 static int ip_vs_flush(struct net *net)
1419 {
1420         int idx;
1421         struct ip_vs_service *svc, *nxt;
1422
1423         /*
1424          * Flush the service table hashed by <netns,protocol,addr,port>
1425          */
1426         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1427                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1428                                          s_list) {
1429                         if (net_eq(svc->net, net))
1430                                 ip_vs_unlink_service(svc);
1431                 }
1432         }
1433
1434         /*
1435          * Flush the service table hashed by fwmark
1436          */
1437         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1438                 list_for_each_entry_safe(svc, nxt,
1439                                          &ip_vs_svc_fwm_table[idx], f_list) {
1440                         if (net_eq(svc->net, net))
1441                                 ip_vs_unlink_service(svc);
1442                 }
1443         }
1444
1445         return 0;
1446 }
1447
1448
1449 /*
1450  *      Zero counters in a service or all services
1451  */
1452 static int ip_vs_zero_service(struct ip_vs_service *svc)
1453 {
1454         struct ip_vs_dest *dest;
1455
1456         write_lock_bh(&__ip_vs_svc_lock);
1457         list_for_each_entry(dest, &svc->destinations, n_list) {
1458                 ip_vs_zero_stats(&dest->stats);
1459         }
1460         ip_vs_zero_stats(&svc->stats);
1461         write_unlock_bh(&__ip_vs_svc_lock);
1462         return 0;
1463 }
1464
1465 static int ip_vs_zero_all(struct net *net)
1466 {
1467         int idx;
1468         struct ip_vs_service *svc;
1469
1470         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1471                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1472                         if (net_eq(svc->net, net))
1473                                 ip_vs_zero_service(svc);
1474                 }
1475         }
1476
1477         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1478                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1479                         if (net_eq(svc->net, net))
1480                                 ip_vs_zero_service(svc);
1481                 }
1482         }
1483
1484         ip_vs_zero_stats(net_ipvs(net)->tot_stats);
1485         return 0;
1486 }
1487
1488
1489 static int
1490 proc_do_defense_mode(ctl_table *table, int write,
1491                      void __user *buffer, size_t *lenp, loff_t *ppos)
1492 {
1493         struct net *net = current->nsproxy->net_ns;
1494         int *valp = table->data;
1495         int val = *valp;
1496         int rc;
1497
1498         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1499         if (write && (*valp != val)) {
1500                 if ((*valp < 0) || (*valp > 3)) {
1501                         /* Restore the correct value */
1502                         *valp = val;
1503                 } else {
1504                         update_defense_level(net_ipvs(net));
1505                 }
1506         }
1507         return rc;
1508 }
1509
1510
1511 static int
1512 proc_do_sync_threshold(ctl_table *table, int write,
1513                        void __user *buffer, size_t *lenp, loff_t *ppos)
1514 {
1515         int *valp = table->data;
1516         int val[2];
1517         int rc;
1518
1519         /* backup the value first */
1520         memcpy(val, valp, sizeof(val));
1521
1522         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1523         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1524                 /* Restore the correct value */
1525                 memcpy(valp, val, sizeof(val));
1526         }
1527         return rc;
1528 }
1529
1530 static int
1531 proc_do_sync_mode(ctl_table *table, int write,
1532                      void __user *buffer, size_t *lenp, loff_t *ppos)
1533 {
1534         int *valp = table->data;
1535         int val = *valp;
1536         int rc;
1537
1538         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1539         if (write && (*valp != val)) {
1540                 if ((*valp < 0) || (*valp > 1)) {
1541                         /* Restore the correct value */
1542                         *valp = val;
1543                 } else {
1544                         struct net *net = current->nsproxy->net_ns;
1545                         ip_vs_sync_switch_mode(net, val);
1546                 }
1547         }
1548         return rc;
1549 }
1550
/*
 *      IPVS sysctl table (under /proc/sys/net/ipv4/vs/).
 *      Do not reorder or insert entries without making the matching
 *      change to the netns init in __ip_vs_control_init().
 *
 *      Apart from "debug_level" no entry sets .data in this table;
 *      presumably it is filled in per netns by that init code - verify
 *      there before relying on it.
 */

static struct ctl_table vs_vars[] = {
        {
                .procname       = "amemthresh",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "am_droprate",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /* range-checked by proc_do_defense_mode */
                .procname       = "drop_entry",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_do_defense_mode,
        },
        {
                /* range-checked by proc_do_defense_mode */
                .procname       = "drop_packet",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_do_defense_mode,
        },
#ifdef CONFIG_IP_VS_NFCT
        {
                .procname       = "conntrack",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
#endif
        {
                /* range-checked by proc_do_defense_mode */
                .procname       = "secure_tcp",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_do_defense_mode,
        },
        {
                .procname       = "snat_reroute",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                /* only 0 and 1 are accepted, see proc_do_sync_mode */
                .procname       = "sync_version",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_do_sync_mode,
        },
        {
                .procname       = "cache_bypass",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "expire_nodest_conn",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "expire_quiescent_template",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /* sized from the netns field; validated as a pair */
                .procname       = "sync_threshold",
                .maxlen         =
                        sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
                .mode           = 0644,
                .proc_handler   = proc_do_sync_threshold,
        },
        {
                .procname       = "nat_icmp_send",
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
#ifdef CONFIG_IP_VS_DEBUG
        {
                /* the only entry whose storage is set statically here */
                .procname       = "debug_level",
                .data           = &sysctl_ip_vs_debug_level,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
#endif
        /* The timeout entries below are compiled out */
#if 0
        {
                .procname       = "timeout_established",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_synsent",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_synrecv",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_finwait",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_timewait",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_close",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_closewait",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_lastack",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_listen",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_synack",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_udp",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "timeout_icmp",
                .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
#endif
        /* empty entry terminates the table */
        { }
};
1736
/*
 *      sysctl path "net/ipv4/vs" under which the IPVS tables live.
 *      Exported (GPL only) so other modules can use the same path.
 */
const struct ctl_path net_vs_ctl_path[] = {
        { .procname = "net", },
        { .procname = "ipv4", },
        { .procname = "vs", },
        /* empty entry terminates the path */
        { }
};
EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1744
1745 #ifdef CONFIG_PROC_FS
1746
/*
 *      Iterator state for the service listing in procfs: remembers which
 *      service table is being walked and the current hash bucket.
 */
struct ip_vs_iter {
        struct seq_net_private p;  /* Do not move this, netns depends upon it */
        struct list_head *table;   /* ip_vs_svc_table or ip_vs_svc_fwm_table */
        int bucket;                /* index of the current bucket in 'table' */
};
1752
1753 /*
1754  *      Write the contents of the VS rule table to a PROCfs file.
1755  *      (It is kept just for backward compatibility)
1756  */
1757 static inline const char *ip_vs_fwd_name(unsigned flags)
1758 {
1759         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1760         case IP_VS_CONN_F_LOCALNODE:
1761                 return "Local";
1762         case IP_VS_CONN_F_TUNNEL:
1763                 return "Tunnel";
1764         case IP_VS_CONN_F_DROUTE:
1765                 return "Route";
1766         default:
1767                 return "Masq";
1768         }
1769 }
1770
1771
1772 /* Get the Nth entry in the two lists */
1773 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1774 {
1775         struct net *net = seq_file_net(seq);
1776         struct ip_vs_iter *iter = seq->private;
1777         int idx;
1778         struct ip_vs_service *svc;
1779
1780         /* look in hash by protocol */
1781         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1782                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1783                         if (net_eq(svc->net, net) && pos-- == 0) {
1784                                 iter->table = ip_vs_svc_table;
1785                                 iter->bucket = idx;
1786                                 return svc;
1787                         }
1788                 }
1789         }
1790
1791         /* keep looking in fwmark */
1792         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1793                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1794                         if (net_eq(svc->net, net) && pos-- == 0) {
1795                                 iter->table = ip_vs_svc_fwm_table;
1796                                 iter->bucket = idx;
1797                                 return svc;
1798                         }
1799                 }
1800         }
1801
1802         return NULL;
1803 }
1804
1805 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1806 __acquires(__ip_vs_svc_lock)
1807 {
1808
1809         read_lock_bh(&__ip_vs_svc_lock);
1810         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1811 }
1812
1813
1814 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1815 {
1816         struct list_head *e;
1817         struct ip_vs_iter *iter;
1818         struct ip_vs_service *svc;
1819
1820         ++*pos;
1821         if (v == SEQ_START_TOKEN)
1822                 return ip_vs_info_array(seq,0);
1823
1824         svc = v;
1825         iter = seq->private;
1826
1827         if (iter->table == ip_vs_svc_table) {
1828                 /* next service in table hashed by protocol */
1829                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1830                         return list_entry(e, struct ip_vs_service, s_list);
1831
1832
1833                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1834                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1835                                             s_list) {
1836                                 return svc;
1837                         }
1838                 }
1839
1840                 iter->table = ip_vs_svc_fwm_table;
1841                 iter->bucket = -1;
1842                 goto scan_fwmark;
1843         }
1844
1845         /* next service in hashed by fwmark */
1846         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1847                 return list_entry(e, struct ip_vs_service, f_list);
1848
1849  scan_fwmark:
1850         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1851                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1852                                     f_list)
1853                         return svc;
1854         }
1855
1856         return NULL;
1857 }
1858
1859 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1860 __releases(__ip_vs_svc_lock)
1861 {
1862         read_unlock_bh(&__ip_vs_svc_lock);
1863 }
1864
1865
1866 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1867 {
1868         if (v == SEQ_START_TOKEN) {
1869                 seq_printf(seq,
1870                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1871                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
1872                 seq_puts(seq,
1873                          "Prot LocalAddress:Port Scheduler Flags\n");
1874                 seq_puts(seq,
1875                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1876         } else {
1877                 const struct ip_vs_service *svc = v;
1878                 const struct ip_vs_iter *iter = seq->private;
1879                 const struct ip_vs_dest *dest;
1880
1881                 if (iter->table == ip_vs_svc_table) {
1882 #ifdef CONFIG_IP_VS_IPV6
1883                         if (svc->af == AF_INET6)
1884                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
1885                                            ip_vs_proto_name(svc->protocol),
1886                                            &svc->addr.in6,
1887                                            ntohs(svc->port),
1888                                            svc->scheduler->name);
1889                         else
1890 #endif
1891                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
1892                                            ip_vs_proto_name(svc->protocol),
1893                                            ntohl(svc->addr.ip),
1894                                            ntohs(svc->port),
1895                                            svc->scheduler->name,
1896                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1897                 } else {
1898                         seq_printf(seq, "FWM  %08X %s %s",
1899                                    svc->fwmark, svc->scheduler->name,
1900                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1901                 }
1902
1903                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1904                         seq_printf(seq, "persistent %d %08X\n",
1905                                 svc->timeout,
1906                                 ntohl(svc->netmask));
1907                 else
1908                         seq_putc(seq, '\n');
1909
1910                 list_for_each_entry(dest, &svc->destinations, n_list) {
1911 #ifdef CONFIG_IP_VS_IPV6
1912                         if (dest->af == AF_INET6)
1913                                 seq_printf(seq,
1914                                            "  -> [%pI6]:%04X"
1915                                            "      %-7s %-6d %-10d %-10d\n",
1916                                            &dest->addr.in6,
1917                                            ntohs(dest->port),
1918                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1919                                            atomic_read(&dest->weight),
1920                                            atomic_read(&dest->activeconns),
1921                                            atomic_read(&dest->inactconns));
1922                         else
1923 #endif
1924                                 seq_printf(seq,
1925                                            "  -> %08X:%04X      "
1926                                            "%-7s %-6d %-10d %-10d\n",
1927                                            ntohl(dest->addr.ip),
1928                                            ntohs(dest->port),
1929                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1930                                            atomic_read(&dest->weight),
1931                                            atomic_read(&dest->activeconns),
1932                                            atomic_read(&dest->inactconns));
1933
1934                 }
1935         }
1936         return 0;
1937 }
1938
1939 static const struct seq_operations ip_vs_info_seq_ops = {
1940         .start = ip_vs_info_seq_start,
1941         .next  = ip_vs_info_seq_next,
1942         .stop  = ip_vs_info_seq_stop,
1943         .show  = ip_vs_info_seq_show,
1944 };
1945
1946 static int ip_vs_info_open(struct inode *inode, struct file *file)
1947 {
1948         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
1949                         sizeof(struct ip_vs_iter));
1950 }
1951
1952 static const struct file_operations ip_vs_info_fops = {
1953         .owner   = THIS_MODULE,
1954         .open    = ip_vs_info_open,
1955         .read    = seq_read,
1956         .llseek  = seq_lseek,
1957         .release = seq_release_private,
1958 };
1959
1960 #endif
1961
1962 #ifdef CONFIG_PROC_FS
1963 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1964 {
1965         struct net *net = seq_file_single_net(seq);
1966         struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
1967
1968 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1969         seq_puts(seq,
1970                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1971         seq_printf(seq,
1972                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1973
1974         spin_lock_bh(&tot_stats->lock);
1975         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns,
1976                    tot_stats->ustats.inpkts, tot_stats->ustats.outpkts,
1977                    (unsigned long long) tot_stats->ustats.inbytes,
1978                    (unsigned long long) tot_stats->ustats.outbytes);
1979
1980 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1981         seq_puts(seq,
1982                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1983         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1984                         tot_stats->ustats.cps,
1985                         tot_stats->ustats.inpps,
1986                         tot_stats->ustats.outpps,
1987                         tot_stats->ustats.inbps,
1988                         tot_stats->ustats.outbps);
1989         spin_unlock_bh(&tot_stats->lock);
1990
1991         return 0;
1992 }
1993
1994 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1995 {
1996         return single_open_net(inode, file, ip_vs_stats_show);
1997 }
1998
1999 static const struct file_operations ip_vs_stats_fops = {
2000         .owner = THIS_MODULE,
2001         .open = ip_vs_stats_seq_open,
2002         .read = seq_read,
2003         .llseek = seq_lseek,
2004         .release = single_release,
2005 };
2006
2007 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2008 {
2009         struct net *net = seq_file_single_net(seq);
2010         struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
2011         int i;
2012
2013 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2014         seq_puts(seq,
2015                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2016         seq_printf(seq,
2017                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2018
2019         for_each_possible_cpu(i) {
2020                 struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i);
2021                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2022                             i, u->ustats.conns, u->ustats.inpkts,
2023                             u->ustats.outpkts, (__u64)u->ustats.inbytes,
2024                             (__u64)u->ustats.outbytes);
2025         }
2026
2027         spin_lock_bh(&tot_stats->lock);
2028         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2029                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2030                    tot_stats->ustats.outpkts,
2031                    (unsigned long long) tot_stats->ustats.inbytes,
2032                    (unsigned long long) tot_stats->ustats.outbytes);
2033
2034 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2035         seq_puts(seq,
2036                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2037         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2038                         tot_stats->ustats.cps,
2039                         tot_stats->ustats.inpps,
2040                         tot_stats->ustats.outpps,
2041                         tot_stats->ustats.inbps,
2042                         tot_stats->ustats.outbps);
2043         spin_unlock_bh(&tot_stats->lock);
2044
2045         return 0;
2046 }
2047
2048 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2049 {
2050         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2051 }
2052
2053 static const struct file_operations ip_vs_stats_percpu_fops = {
2054         .owner = THIS_MODULE,
2055         .open = ip_vs_stats_percpu_seq_open,
2056         .read = seq_read,
2057         .llseek = seq_lseek,
2058         .release = single_release,
2059 };
2060 #endif
2061
2062 /*
2063  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2064  */
2065 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2066 {
2067 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2068         struct ip_vs_proto_data *pd;
2069 #endif
2070
2071         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2072                   u->tcp_timeout,
2073                   u->tcp_fin_timeout,
2074                   u->udp_timeout);
2075
2076 #ifdef CONFIG_IP_VS_PROTO_TCP
2077         if (u->tcp_timeout) {
2078                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2079                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2080                         = u->tcp_timeout * HZ;
2081         }
2082
2083         if (u->tcp_fin_timeout) {
2084                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2085                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2086                         = u->tcp_fin_timeout * HZ;
2087         }
2088 #endif
2089
2090 #ifdef CONFIG_IP_VS_PROTO_UDP
2091         if (u->udp_timeout) {
2092                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2093                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2094                         = u->udp_timeout * HZ;
2095         }
2096 #endif
2097         return 0;
2098 }
2099
2100
2101 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2102 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2103 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2104                                  sizeof(struct ip_vs_dest_user))
2105 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2106 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2107 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2108
2109 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2110         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2111         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2112         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2113         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2114         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2115         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2116         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2117         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2118         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2119         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2120         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2121 };
2122
2123 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2124                                   struct ip_vs_service_user *usvc_compat)
2125 {
2126         memset(usvc, 0, sizeof(*usvc));
2127
2128         usvc->af                = AF_INET;
2129         usvc->protocol          = usvc_compat->protocol;
2130         usvc->addr.ip           = usvc_compat->addr;
2131         usvc->port              = usvc_compat->port;
2132         usvc->fwmark            = usvc_compat->fwmark;
2133
2134         /* Deep copy of sched_name is not needed here */
2135         usvc->sched_name        = usvc_compat->sched_name;
2136
2137         usvc->flags             = usvc_compat->flags;
2138         usvc->timeout           = usvc_compat->timeout;
2139         usvc->netmask           = usvc_compat->netmask;
2140 }
2141
2142 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2143                                    struct ip_vs_dest_user *udest_compat)
2144 {
2145         memset(udest, 0, sizeof(*udest));
2146
2147         udest->addr.ip          = udest_compat->addr;
2148         udest->port             = udest_compat->port;
2149         udest->conn_flags       = udest_compat->conn_flags;
2150         udest->weight           = udest_compat->weight;
2151         udest->u_threshold      = udest_compat->u_threshold;
2152         udest->l_threshold      = udest_compat->l_threshold;
2153 }
2154
2155 static int
2156 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2157 {
2158         struct net *net = sock_net(sk);
2159         int ret;
2160         unsigned char arg[MAX_ARG_LEN];
2161         struct ip_vs_service_user *usvc_compat;
2162         struct ip_vs_service_user_kern usvc;
2163         struct ip_vs_service *svc;
2164         struct ip_vs_dest_user *udest_compat;
2165         struct ip_vs_dest_user_kern udest;
2166
2167         if (!capable(CAP_NET_ADMIN))
2168                 return -EPERM;
2169
2170         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2171                 return -EINVAL;
2172         if (len < 0 || len >  MAX_ARG_LEN)
2173                 return -EINVAL;
2174         if (len != set_arglen[SET_CMDID(cmd)]) {
2175                 pr_err("set_ctl: len %u != %u\n",
2176                        len, set_arglen[SET_CMDID(cmd)]);
2177                 return -EINVAL;
2178         }
2179
2180         if (copy_from_user(arg, user, len) != 0)
2181                 return -EFAULT;
2182
2183         /* increase the module use count */
2184         ip_vs_use_count_inc();
2185
2186         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2187                 ret = -ERESTARTSYS;
2188                 goto out_dec;
2189         }
2190
2191         if (cmd == IP_VS_SO_SET_FLUSH) {
2192                 /* Flush the virtual service */
2193                 ret = ip_vs_flush(net);
2194                 goto out_unlock;
2195         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2196                 /* Set timeout values for (tcp tcpfin udp) */
2197                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2198                 goto out_unlock;
2199         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2200                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2201                 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2202                                         dm->syncid);
2203                 goto out_unlock;
2204         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2205                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2206                 ret = stop_sync_thread(net, dm->state);
2207                 goto out_unlock;
2208         }
2209
2210         usvc_compat = (struct ip_vs_service_user *)arg;
2211         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2212
2213         /* We only use the new structs internally, so copy userspace compat
2214          * structs to extended internal versions */
2215         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2216         ip_vs_copy_udest_compat(&udest, udest_compat);
2217
2218         if (cmd == IP_VS_SO_SET_ZERO) {
2219                 /* if no service address is set, zero counters in all */
2220                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2221                         ret = ip_vs_zero_all(net);
2222                         goto out_unlock;
2223                 }
2224         }
2225
2226         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2227         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2228             usvc.protocol != IPPROTO_SCTP) {
2229                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2230                        usvc.protocol, &usvc.addr.ip,
2231                        ntohs(usvc.port), usvc.sched_name);
2232                 ret = -EFAULT;
2233                 goto out_unlock;
2234         }
2235
2236         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2237         if (usvc.fwmark == 0)
2238                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2239                                            &usvc.addr, usvc.port);
2240         else
2241                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2242
2243         if (cmd != IP_VS_SO_SET_ADD
2244             && (svc == NULL || svc->protocol != usvc.protocol)) {
2245                 ret = -ESRCH;
2246                 goto out_unlock;
2247         }
2248
2249         switch (cmd) {
2250         case IP_VS_SO_SET_ADD:
2251                 if (svc != NULL)
2252                         ret = -EEXIST;
2253                 else
2254                         ret = ip_vs_add_service(net, &usvc, &svc);
2255                 break;
2256         case IP_VS_SO_SET_EDIT:
2257                 ret = ip_vs_edit_service(svc, &usvc);
2258                 break;
2259         case IP_VS_SO_SET_DEL:
2260                 ret = ip_vs_del_service(svc);
2261                 if (!ret)
2262                         goto out_unlock;
2263                 break;
2264         case IP_VS_SO_SET_ZERO:
2265                 ret = ip_vs_zero_service(svc);
2266                 break;
2267         case IP_VS_SO_SET_ADDDEST:
2268                 ret = ip_vs_add_dest(svc, &udest);
2269                 break;
2270         case IP_VS_SO_SET_EDITDEST:
2271                 ret = ip_vs_edit_dest(svc, &udest);
2272                 break;
2273         case IP_VS_SO_SET_DELDEST:
2274                 ret = ip_vs_del_dest(svc, &udest);
2275                 break;
2276         default:
2277                 ret = -EINVAL;
2278         }
2279
2280   out_unlock:
2281         mutex_unlock(&__ip_vs_mutex);
2282   out_dec:
2283         /* decrease the module use count */
2284         ip_vs_use_count_dec();
2285
2286         return ret;
2287 }
2288
2289
2290 static void
2291 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2292 {
2293         spin_lock_bh(&src->lock);
2294         memcpy(dst, &src->ustats, sizeof(*dst));
2295         spin_unlock_bh(&src->lock);
2296 }
2297
2298 static void
2299 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2300 {
2301         dst->protocol = src->protocol;
2302         dst->addr = src->addr.ip;
2303         dst->port = src->port;
2304         dst->fwmark = src->fwmark;
2305         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2306         dst->flags = src->flags;
2307         dst->timeout = src->timeout / HZ;
2308         dst->netmask = src->netmask;
2309         dst->num_dests = src->num_dests;
2310         ip_vs_copy_stats(&dst->stats, &src->stats);
2311 }
2312
2313 static inline int
2314 __ip_vs_get_service_entries(struct net *net,
2315                             const struct ip_vs_get_services *get,
2316                             struct ip_vs_get_services __user *uptr)
2317 {
2318         int idx, count=0;
2319         struct ip_vs_service *svc;
2320         struct ip_vs_service_entry entry;
2321         int ret = 0;
2322
2323         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2324                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2325                         /* Only expose IPv4 entries to old interface */
2326                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2327                                 continue;
2328
2329                         if (count >= get->num_services)
2330                                 goto out;
2331                         memset(&entry, 0, sizeof(entry));
2332                         ip_vs_copy_service(&entry, svc);
2333                         if (copy_to_user(&uptr->entrytable[count],
2334                                          &entry, sizeof(entry))) {
2335                                 ret = -EFAULT;
2336                                 goto out;
2337                         }
2338                         count++;
2339                 }
2340         }
2341
2342         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2343                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2344                         /* Only expose IPv4 entries to old interface */
2345                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2346                                 continue;
2347
2348                         if (count >= get->num_services)
2349                                 goto out;
2350                         memset(&entry, 0, sizeof(entry));
2351                         ip_vs_copy_service(&entry, svc);
2352                         if (copy_to_user(&uptr->entrytable[count],
2353                                          &entry, sizeof(entry))) {
2354                                 ret = -EFAULT;
2355                                 goto out;
2356                         }
2357                         count++;
2358                 }
2359         }
2360   out:
2361         return ret;
2362 }
2363
2364 static inline int
2365 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2366                          struct ip_vs_get_dests __user *uptr)
2367 {
2368         struct ip_vs_service *svc;
2369         union nf_inet_addr addr = { .ip = get->addr };
2370         int ret = 0;
2371
2372         if (get->fwmark)
2373                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2374         else
2375                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2376                                            get->port);
2377
2378         if (svc) {
2379                 int count = 0;
2380                 struct ip_vs_dest *dest;
2381                 struct ip_vs_dest_entry entry;
2382
2383                 list_for_each_entry(dest, &svc->destinations, n_list) {
2384                         if (count >= get->num_dests)
2385                                 break;
2386
2387                         entry.addr = dest->addr.ip;
2388                         entry.port = dest->port;
2389                         entry.conn_flags = atomic_read(&dest->conn_flags);
2390                         entry.weight = atomic_read(&dest->weight);
2391                         entry.u_threshold = dest->u_threshold;
2392                         entry.l_threshold = dest->l_threshold;
2393                         entry.activeconns = atomic_read(&dest->activeconns);
2394                         entry.inactconns = atomic_read(&dest->inactconns);
2395                         entry.persistconns = atomic_read(&dest->persistconns);
2396                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2397                         if (copy_to_user(&uptr->entrytable[count],
2398                                          &entry, sizeof(entry))) {
2399                                 ret = -EFAULT;
2400                                 break;
2401                         }
2402                         count++;
2403                 }
2404         } else
2405                 ret = -ESRCH;
2406         return ret;
2407 }
2408
2409 static inline void
2410 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2411 {
2412 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2413         struct ip_vs_proto_data *pd;
2414 #endif
2415
2416 #ifdef CONFIG_IP_VS_PROTO_TCP
2417         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2418         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2419         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2420 #endif
2421 #ifdef CONFIG_IP_VS_PROTO_UDP
2422         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2423         u->udp_timeout =
2424                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2425 #endif
2426 }
2427
2428
2429 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2430 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2431 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2432 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2433 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2434 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2435 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2436
2437 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2438         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2439         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2440         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2441         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2442         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2443         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2444         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2445 };
2446
2447 static int
2448 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2449 {
2450         unsigned char arg[128];
2451         int ret = 0;
2452         unsigned int copylen;
2453         struct net *net = sock_net(sk);
2454         struct netns_ipvs *ipvs = net_ipvs(net);
2455
2456         BUG_ON(!net);
2457         if (!capable(CAP_NET_ADMIN))
2458                 return -EPERM;
2459
2460         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2461                 return -EINVAL;
2462
2463         if (*len < get_arglen[GET_CMDID(cmd)]) {
2464                 pr_err("get_ctl: len %u < %u\n",
2465                        *len, get_arglen[GET_CMDID(cmd)]);
2466                 return -EINVAL;
2467         }
2468
2469         copylen = get_arglen[GET_CMDID(cmd)];
2470         if (copylen > 128)
2471                 return -EINVAL;
2472
2473         if (copy_from_user(arg, user, copylen) != 0)
2474                 return -EFAULT;
2475
2476         if (mutex_lock_interruptible(&__ip_vs_mutex))
2477                 return -ERESTARTSYS;
2478
2479         switch (cmd) {
2480         case IP_VS_SO_GET_VERSION:
2481         {
2482                 char buf[64];
2483
2484                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2485                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2486                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2487                         ret = -EFAULT;
2488                         goto out;
2489                 }
2490                 *len = strlen(buf)+1;
2491         }
2492         break;
2493
2494         case IP_VS_SO_GET_INFO:
2495         {
2496                 struct ip_vs_getinfo info;
2497                 info.version = IP_VS_VERSION_CODE;
2498                 info.size = ip_vs_conn_tab_size;
2499                 info.num_services = ipvs->num_services;
2500                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2501                         ret = -EFAULT;
2502         }
2503         break;
2504
2505         case IP_VS_SO_GET_SERVICES:
2506         {
2507                 struct ip_vs_get_services *get;
2508                 int size;
2509
2510                 get = (struct ip_vs_get_services *)arg;
2511                 size = sizeof(*get) +
2512                         sizeof(struct ip_vs_service_entry) * get->num_services;
2513                 if (*len != size) {
2514                         pr_err("length: %u != %u\n", *len, size);
2515                         ret = -EINVAL;
2516                         goto out;
2517                 }
2518                 ret = __ip_vs_get_service_entries(net, get, user);
2519         }
2520         break;
2521
2522         case IP_VS_SO_GET_SERVICE:
2523         {
2524                 struct ip_vs_service_entry *entry;
2525                 struct ip_vs_service *svc;
2526                 union nf_inet_addr addr;
2527
2528                 entry = (struct ip_vs_service_entry *)arg;
2529                 addr.ip = entry->addr;
2530                 if (entry->fwmark)
2531                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2532                 else
2533                         svc = __ip_vs_service_find(net, AF_INET,
2534                                                    entry->protocol, &addr,
2535                                                    entry->port);
2536                 if (svc) {
2537                         ip_vs_copy_service(entry, svc);
2538                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2539                                 ret = -EFAULT;
2540                 } else
2541                         ret = -ESRCH;
2542         }
2543         break;
2544
2545         case IP_VS_SO_GET_DESTS:
2546         {
2547                 struct ip_vs_get_dests *get;
2548                 int size;
2549
2550                 get = (struct ip_vs_get_dests *)arg;
2551                 size = sizeof(*get) +
2552                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2553                 if (*len != size) {
2554                         pr_err("length: %u != %u\n", *len, size);
2555                         ret = -EINVAL;
2556                         goto out;
2557                 }
2558                 ret = __ip_vs_get_dest_entries(net, get, user);
2559         }
2560         break;
2561
2562         case IP_VS_SO_GET_TIMEOUT:
2563         {
2564                 struct ip_vs_timeout_user t;
2565
2566                 __ip_vs_get_timeouts(net, &t);
2567                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2568                         ret = -EFAULT;
2569         }
2570         break;
2571
2572         case IP_VS_SO_GET_DAEMON:
2573         {
2574                 struct ip_vs_daemon_user d[2];
2575
2576                 memset(&d, 0, sizeof(d));
2577                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2578                         d[0].state = IP_VS_STATE_MASTER;
2579                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2580                                 sizeof(d[0].mcast_ifn));
2581                         d[0].syncid = ipvs->master_syncid;
2582                 }
2583                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2584                         d[1].state = IP_VS_STATE_BACKUP;
2585                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2586                                 sizeof(d[1].mcast_ifn));
2587                         d[1].syncid = ipvs->backup_syncid;
2588                 }
2589                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2590                         ret = -EFAULT;
2591         }
2592         break;
2593
2594         default:
2595                 ret = -EINVAL;
2596         }
2597
2598   out:
2599         mutex_unlock(&__ip_vs_mutex);
2600         return ret;
2601 }
2602
2603
2604 static struct nf_sockopt_ops ip_vs_sockopts = {
2605         .pf             = PF_INET,
2606         .set_optmin     = IP_VS_BASE_CTL,
2607         .set_optmax     = IP_VS_SO_SET_MAX+1,
2608         .set            = do_ip_vs_set_ctl,
2609         .get_optmin     = IP_VS_BASE_CTL,
2610         .get_optmax     = IP_VS_SO_GET_MAX+1,
2611         .get            = do_ip_vs_get_ctl,
2612         .owner          = THIS_MODULE,
2613 };
2614
2615 /*
2616  * Generic Netlink interface
2617  */
2618
2619 /* IPVS genetlink family */
2620 static struct genl_family ip_vs_genl_family = {
2621         .id             = GENL_ID_GENERATE,
2622         .hdrsize        = 0,
2623         .name           = IPVS_GENL_NAME,
2624         .version        = IPVS_GENL_VERSION,
2625         .maxattr        = IPVS_CMD_MAX,
2626         .netnsok        = true,         /* Make ipvsadm to work on netns */
2627 };
2628
2629 /* Policy used for first-level command attributes */
2630 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2631         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2632         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2633         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2634         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2635         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2636         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2637 };
2638
2639 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2640 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2641         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2642         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2643                                             .len = IP_VS_IFNAME_MAXLEN },
2644         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2645 };
2646
2647 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2648 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2649         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2650         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2651         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2652                                             .len = sizeof(union nf_inet_addr) },
2653         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2654         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2655         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2656                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2657         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2658                                             .len = IP_VS_PENAME_MAXLEN },
2659         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2660                                             .len = sizeof(struct ip_vs_flags) },
2661         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2662         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2663         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2664 };
2665
2666 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2667 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2668         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2669                                             .len = sizeof(union nf_inet_addr) },
2670         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2671         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2672         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2673         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2674         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2675         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2676         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2677         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2678         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2679 };
2680
2681 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2682                                  struct ip_vs_stats *stats)
2683 {
2684         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2685         if (!nl_stats)
2686                 return -EMSGSIZE;
2687
2688         spin_lock_bh(&stats->lock);
2689
2690         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
2691         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
2692         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
2693         NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
2694         NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
2695         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
2696         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
2697         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
2698         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
2699         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
2700
2701         spin_unlock_bh(&stats->lock);
2702
2703         nla_nest_end(skb, nl_stats);
2704
2705         return 0;
2706
2707 nla_put_failure:
2708         spin_unlock_bh(&stats->lock);
2709         nla_nest_cancel(skb, nl_stats);
2710         return -EMSGSIZE;
2711 }
2712
2713 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2714                                    struct ip_vs_service *svc)
2715 {
2716         struct nlattr *nl_service;
2717         struct ip_vs_flags flags = { .flags = svc->flags,
2718                                      .mask = ~0 };
2719
2720         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2721         if (!nl_service)
2722                 return -EMSGSIZE;
2723
2724         NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2725
2726         if (svc->fwmark) {
2727                 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2728         } else {
2729                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2730                 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2731                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2732         }
2733
2734         NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2735         if (svc->pe)
2736                 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
2737         NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2738         NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2739         NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2740
2741         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2742                 goto nla_put_failure;
2743
2744         nla_nest_end(skb, nl_service);
2745
2746         return 0;
2747
2748 nla_put_failure:
2749         nla_nest_cancel(skb, nl_service);
2750         return -EMSGSIZE;
2751 }
2752
2753 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2754                                    struct ip_vs_service *svc,
2755                                    struct netlink_callback *cb)
2756 {
2757         void *hdr;
2758
2759         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2760                           &ip_vs_genl_family, NLM_F_MULTI,
2761                           IPVS_CMD_NEW_SERVICE);
2762         if (!hdr)
2763                 return -EMSGSIZE;
2764
2765         if (ip_vs_genl_fill_service(skb, svc) < 0)
2766                 goto nla_put_failure;
2767
2768         return genlmsg_end(skb, hdr);
2769
2770 nla_put_failure:
2771         genlmsg_cancel(skb, hdr);
2772         return -EMSGSIZE;
2773 }
2774
2775 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2776                                     struct netlink_callback *cb)
2777 {
2778         int idx = 0, i;
2779         int start = cb->args[0];
2780         struct ip_vs_service *svc;
2781         struct net *net = skb_sknet(skb);
2782
2783         mutex_lock(&__ip_vs_mutex);
2784         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2785                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2786                         if (++idx <= start || !net_eq(svc->net, net))
2787                                 continue;
2788                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2789                                 idx--;
2790                                 goto nla_put_failure;
2791                         }
2792                 }
2793         }
2794
2795         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2796                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2797                         if (++idx <= start || !net_eq(svc->net, net))
2798                                 continue;
2799                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2800                                 idx--;
2801                                 goto nla_put_failure;
2802                         }
2803                 }
2804         }
2805
2806 nla_put_failure:
2807         mutex_unlock(&__ip_vs_mutex);
2808         cb->args[0] = idx;
2809
2810         return skb->len;
2811 }
2812
2813 static int ip_vs_genl_parse_service(struct net *net,
2814                                     struct ip_vs_service_user_kern *usvc,
2815                                     struct nlattr *nla, int full_entry,
2816                                     struct ip_vs_service **ret_svc)
2817 {
2818         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2819         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2820         struct ip_vs_service *svc;
2821
2822         /* Parse mandatory identifying service fields first */
2823         if (nla == NULL ||
2824             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2825                 return -EINVAL;
2826
2827         nla_af          = attrs[IPVS_SVC_ATTR_AF];
2828         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
2829         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
2830         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
2831         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
2832
2833         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2834                 return -EINVAL;
2835
2836         memset(usvc, 0, sizeof(*usvc));
2837
2838         usvc->af = nla_get_u16(nla_af);
2839 #ifdef CONFIG_IP_VS_IPV6
2840         if (usvc->af != AF_INET && usvc->af != AF_INET6)
2841 #else
2842         if (usvc->af != AF_INET)
2843 #endif
2844                 return -EAFNOSUPPORT;
2845
2846         if (nla_fwmark) {
2847                 usvc->protocol = IPPROTO_TCP;
2848                 usvc->fwmark = nla_get_u32(nla_fwmark);
2849         } else {
2850                 usvc->protocol = nla_get_u16(nla_protocol);
2851                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2852                 usvc->port = nla_get_u16(nla_port);
2853                 usvc->fwmark = 0;
2854         }
2855
2856         if (usvc->fwmark)
2857                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
2858         else
2859                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
2860                                            &usvc->addr, usvc->port);
2861         *ret_svc = svc;
2862
2863         /* If a full entry was requested, check for the additional fields */
2864         if (full_entry) {
2865                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
2866                               *nla_netmask;
2867                 struct ip_vs_flags flags;
2868
2869                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2870                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
2871                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2872                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2873                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2874
2875                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2876                         return -EINVAL;
2877
2878                 nla_memcpy(&flags, nla_flags, sizeof(flags));
2879
2880                 /* prefill flags from service if it already exists */
2881                 if (svc)
2882                         usvc->flags = svc->flags;
2883
2884                 /* set new flags from userland */
2885                 usvc->flags = (usvc->flags & ~flags.mask) |
2886                               (flags.flags & flags.mask);
2887                 usvc->sched_name = nla_data(nla_sched);
2888                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
2889                 usvc->timeout = nla_get_u32(nla_timeout);
2890                 usvc->netmask = nla_get_u32(nla_netmask);
2891         }
2892
2893         return 0;
2894 }
2895
2896 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
2897                                                      struct nlattr *nla)
2898 {
2899         struct ip_vs_service_user_kern usvc;
2900         struct ip_vs_service *svc;
2901         int ret;
2902
2903         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
2904         return ret ? ERR_PTR(ret) : svc;
2905 }
2906
2907 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
2908 {
2909         struct nlattr *nl_dest;
2910
2911         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
2912         if (!nl_dest)
2913                 return -EMSGSIZE;
2914
2915         NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
2916         NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
2917
2918         NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
2919                     atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
2920         NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
2921         NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
2922         NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
2923         NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
2924                     atomic_read(&dest->activeconns));
2925         NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
2926                     atomic_read(&dest->inactconns));
2927         NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
2928                     atomic_read(&dest->persistconns));
2929
2930         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
2931                 goto nla_put_failure;
2932
2933         nla_nest_end(skb, nl_dest);
2934
2935         return 0;
2936
2937 nla_put_failure:
2938         nla_nest_cancel(skb, nl_dest);
2939         return -EMSGSIZE;
2940 }
2941
2942 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2943                                 struct netlink_callback *cb)
2944 {
2945         void *hdr;
2946
2947         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2948                           &ip_vs_genl_family, NLM_F_MULTI,
2949                           IPVS_CMD_NEW_DEST);
2950         if (!hdr)
2951                 return -EMSGSIZE;
2952
2953         if (ip_vs_genl_fill_dest(skb, dest) < 0)
2954                 goto nla_put_failure;
2955
2956         return genlmsg_end(skb, hdr);
2957
2958 nla_put_failure:
2959         genlmsg_cancel(skb, hdr);
2960         return -EMSGSIZE;
2961 }
2962
2963 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2964                                  struct netlink_callback *cb)
2965 {
2966         int idx = 0;
2967         int start = cb->args[0];
2968         struct ip_vs_service *svc;
2969         struct ip_vs_dest *dest;
2970         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2971         struct net *net = skb_sknet(skb);
2972
2973         mutex_lock(&__ip_vs_mutex);
2974
2975         /* Try to find the service for which to dump destinations */
2976         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
2977                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
2978                 goto out_err;
2979
2980
2981         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
2982         if (IS_ERR(svc) || svc == NULL)
2983                 goto out_err;
2984
2985         /* Dump the destinations */
2986         list_for_each_entry(dest, &svc->destinations, n_list) {
2987                 if (++idx <= start)
2988                         continue;
2989                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
2990                         idx--;
2991                         goto nla_put_failure;
2992                 }
2993         }
2994
2995 nla_put_failure:
2996         cb->args[0] = idx;
2997
2998 out_err:
2999         mutex_unlock(&__ip_vs_mutex);
3000
3001         return skb->len;
3002 }
3003
3004 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3005                                  struct nlattr *nla, int full_entry)
3006 {
3007         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3008         struct nlattr *nla_addr, *nla_port;
3009
3010         /* Parse mandatory identifying destination fields first */
3011         if (nla == NULL ||
3012             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3013                 return -EINVAL;
3014
3015         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3016         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3017
3018         if (!(nla_addr && nla_port))
3019                 return -EINVAL;
3020
3021         memset(udest, 0, sizeof(*udest));
3022
3023         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3024         udest->port = nla_get_u16(nla_port);
3025
3026         /* If a full entry was requested, check for the additional fields */
3027         if (full_entry) {
3028                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3029                               *nla_l_thresh;
3030
3031                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3032                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3033                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3034                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3035
3036                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3037                         return -EINVAL;
3038
3039                 udest->conn_flags = nla_get_u32(nla_fwd)
3040                                     & IP_VS_CONN_F_FWD_MASK;
3041                 udest->weight = nla_get_u32(nla_weight);
3042                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3043                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3044         }
3045
3046         return 0;
3047 }
3048
3049 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3050                                   const char *mcast_ifn, __be32 syncid)
3051 {
3052         struct nlattr *nl_daemon;
3053
3054         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3055         if (!nl_daemon)
3056                 return -EMSGSIZE;
3057
3058         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
3059         NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
3060         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
3061
3062         nla_nest_end(skb, nl_daemon);
3063
3064         return 0;
3065
3066 nla_put_failure:
3067         nla_nest_cancel(skb, nl_daemon);
3068         return -EMSGSIZE;
3069 }
3070
3071 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3072                                   const char *mcast_ifn, __be32 syncid,
3073                                   struct netlink_callback *cb)
3074 {
3075         void *hdr;
3076         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
3077                           &ip_vs_genl_family, NLM_F_MULTI,
3078                           IPVS_CMD_NEW_DAEMON);
3079         if (!hdr)
3080                 return -EMSGSIZE;
3081
3082         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3083                 goto nla_put_failure;
3084
3085         return genlmsg_end(skb, hdr);
3086
3087 nla_put_failure:
3088         genlmsg_cancel(skb, hdr);
3089         return -EMSGSIZE;
3090 }
3091
3092 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3093                                    struct netlink_callback *cb)
3094 {
3095         struct net *net = skb_net(skb);
3096         struct netns_ipvs *ipvs = net_ipvs(net);
3097
3098         mutex_lock(&__ip_vs_mutex);
3099         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3100                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3101                                            ipvs->master_mcast_ifn,
3102                                            ipvs->master_syncid, cb) < 0)
3103                         goto nla_put_failure;
3104
3105                 cb->args[0] = 1;
3106         }
3107
3108         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3109                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3110                                            ipvs->backup_mcast_ifn,
3111                                            ipvs->backup_syncid, cb) < 0)
3112                         goto nla_put_failure;
3113
3114                 cb->args[1] = 1;
3115         }
3116
3117 nla_put_failure:
3118         mutex_unlock(&__ip_vs_mutex);
3119
3120         return skb->len;
3121 }
3122
3123 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3124 {
3125         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3126               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3127               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3128                 return -EINVAL;
3129
3130         return start_sync_thread(net,
3131                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3132                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3133                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3134 }
3135
3136 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3137 {
3138         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3139                 return -EINVAL;
3140
3141         return stop_sync_thread(net,
3142                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3143 }
3144
3145 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3146 {
3147         struct ip_vs_timeout_user t;
3148
3149         __ip_vs_get_timeouts(net, &t);
3150
3151         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3152                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3153
3154         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3155                 t.tcp_fin_timeout =
3156                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3157
3158         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3159                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3160
3161         return ip_vs_set_timeout(net, &t);
3162 }
3163
3164 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3165 {
3166         struct ip_vs_service *svc = NULL;
3167         struct ip_vs_service_user_kern usvc;
3168         struct ip_vs_dest_user_kern udest;
3169         int ret = 0, cmd;
3170         int need_full_svc = 0, need_full_dest = 0;
3171         struct net *net;
3172         struct netns_ipvs *ipvs;
3173
3174         net = skb_sknet(skb);
3175         ipvs = net_ipvs(net);
3176         cmd = info->genlhdr->cmd;
3177
3178         mutex_lock(&__ip_vs_mutex);
3179
3180         if (cmd == IPVS_CMD_FLUSH) {
3181                 ret = ip_vs_flush(net);
3182                 goto out;
3183         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3184                 ret = ip_vs_genl_set_config(net, info->attrs);
3185                 goto out;
3186         } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3187                    cmd == IPVS_CMD_DEL_DAEMON) {
3188
3189                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3190
3191                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3192                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3193                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3194                                      ip_vs_daemon_policy)) {
3195                         ret = -EINVAL;
3196                         goto out;
3197                 }
3198
3199                 if (cmd == IPVS_CMD_NEW_DAEMON)
3200                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3201                 else
3202                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3203                 goto out;
3204         } else if (cmd == IPVS_CMD_ZERO &&
3205                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3206                 ret = ip_vs_zero_all(net);
3207                 goto out;
3208         }
3209
3210         /* All following commands require a service argument, so check if we
3211          * received a valid one. We need a full service specification when
3212          * adding / editing a service. Only identifying members otherwise. */
3213         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3214                 need_full_svc = 1;
3215
3216         ret = ip_vs_genl_parse_service(net, &usvc,
3217                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3218                                        need_full_svc, &svc);
3219         if (ret)
3220                 goto out;
3221
3222         /* Unless we're adding a new service, the service must already exist */
3223         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3224                 ret = -ESRCH;
3225                 goto out;
3226         }
3227
3228         /* Destination commands require a valid destination argument. For
3229          * adding / editing a destination, we need a full destination
3230          * specification. */
3231         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3232             cmd == IPVS_CMD_DEL_DEST) {
3233                 if (cmd != IPVS_CMD_DEL_DEST)
3234                         need_full_dest = 1;
3235
3236                 ret = ip_vs_genl_parse_dest(&udest,
3237                                             info->attrs[IPVS_CMD_ATTR_DEST],
3238                                             need_full_dest);
3239                 if (ret)
3240                         goto out;
3241         }
3242
3243         switch (cmd) {
3244         case IPVS_CMD_NEW_SERVICE:
3245                 if (svc == NULL)
3246                         ret = ip_vs_add_service(net, &usvc, &svc);
3247                 else
3248                         ret = -EEXIST;
3249                 break;
3250         case IPVS_CMD_SET_SERVICE:
3251                 ret = ip_vs_edit_service(svc, &usvc);
3252                 break;
3253         case IPVS_CMD_DEL_SERVICE:
3254                 ret = ip_vs_del_service(svc);
3255                 /* do not use svc, it can be freed */
3256                 break;
3257         case IPVS_CMD_NEW_DEST:
3258                 ret = ip_vs_add_dest(svc, &udest);
3259                 break;
3260         case IPVS_CMD_SET_DEST:
3261                 ret = ip_vs_edit_dest(svc, &udest);
3262                 break;
3263         case IPVS_CMD_DEL_DEST:
3264                 ret = ip_vs_del_dest(svc, &udest);
3265                 break;
3266         case IPVS_CMD_ZERO:
3267                 ret = ip_vs_zero_service(svc);
3268                 break;
3269         default:
3270                 ret = -EINVAL;
3271         }
3272
3273 out:
3274         mutex_unlock(&__ip_vs_mutex);
3275
3276         return ret;
3277 }
3278
3279 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3280 {
3281         struct sk_buff *msg;
3282         void *reply;
3283         int ret, cmd, reply_cmd;
3284         struct net *net;
3285         struct netns_ipvs *ipvs;
3286
3287         net = skb_sknet(skb);
3288         ipvs = net_ipvs(net);
3289         cmd = info->genlhdr->cmd;
3290
3291         if (cmd == IPVS_CMD_GET_SERVICE)
3292                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3293         else if (cmd == IPVS_CMD_GET_INFO)
3294                 reply_cmd = IPVS_CMD_SET_INFO;
3295         else if (cmd == IPVS_CMD_GET_CONFIG)
3296                 reply_cmd = IPVS_CMD_SET_CONFIG;
3297         else {
3298                 pr_err("unknown Generic Netlink command\n");
3299                 return -EINVAL;
3300         }
3301
3302         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3303         if (!msg)
3304                 return -ENOMEM;
3305
3306         mutex_lock(&__ip_vs_mutex);
3307
3308         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3309         if (reply == NULL)
3310                 goto nla_put_failure;
3311
3312         switch (cmd) {
3313         case IPVS_CMD_GET_SERVICE:
3314         {
3315                 struct ip_vs_service *svc;
3316
3317                 svc = ip_vs_genl_find_service(net,
3318                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3319                 if (IS_ERR(svc)) {
3320                         ret = PTR_ERR(svc);
3321                         goto out_err;
3322                 } else if (svc) {
3323                         ret = ip_vs_genl_fill_service(msg, svc);
3324                         if (ret)
3325                                 goto nla_put_failure;
3326                 } else {
3327                         ret = -ESRCH;
3328                         goto out_err;
3329                 }
3330
3331                 break;
3332         }
3333
3334         case IPVS_CMD_GET_CONFIG:
3335         {
3336                 struct ip_vs_timeout_user t;
3337
3338                 __ip_vs_get_timeouts(net, &t);
3339 #ifdef CONFIG_IP_VS_PROTO_TCP
3340                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3341                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3342                             t.tcp_fin_timeout);
3343 #endif
3344 #ifdef CONFIG_IP_VS_PROTO_UDP
3345                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
3346 #endif
3347
3348                 break;
3349         }
3350
3351         case IPVS_CMD_GET_INFO:
3352                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
3353                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3354                             ip_vs_conn_tab_size);
3355                 break;
3356         }
3357
3358         genlmsg_end(msg, reply);
3359         ret = genlmsg_reply(msg, info);
3360         goto out;
3361
3362 nla_put_failure:
3363         pr_err("not enough space in Netlink message\n");
3364         ret = -EMSGSIZE;
3365
3366 out_err:
3367         nlmsg_free(msg);
3368 out:
3369         mutex_unlock(&__ip_vs_mutex);
3370
3371         return ret;
3372 }
3373
3374
3375 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3376         {
3377                 .cmd    = IPVS_CMD_NEW_SERVICE,
3378                 .flags  = GENL_ADMIN_PERM,
3379                 .policy = ip_vs_cmd_policy,
3380                 .doit   = ip_vs_genl_set_cmd,
3381         },
3382         {
3383                 .cmd    = IPVS_CMD_SET_SERVICE,
3384                 .flags  = GENL_ADMIN_PERM,
3385                 .policy = ip_vs_cmd_policy,
3386                 .doit   = ip_vs_genl_set_cmd,
3387         },
3388         {
3389                 .cmd    = IPVS_CMD_DEL_SERVICE,
3390                 .flags  = GENL_ADMIN_PERM,
3391                 .policy = ip_vs_cmd_policy,
3392                 .doit   = ip_vs_genl_set_cmd,
3393         },
3394         {
3395                 .cmd    = IPVS_CMD_GET_SERVICE,
3396                 .flags  = GENL_ADMIN_PERM,
3397                 .doit   = ip_vs_genl_get_cmd,
3398                 .dumpit = ip_vs_genl_dump_services,
3399                 .policy = ip_vs_cmd_policy,
3400         },
3401         {
3402                 .cmd    = IPVS_CMD_NEW_DEST,
3403                 .flags  = GENL_ADMIN_PERM,
3404                 .policy = ip_vs_cmd_policy,
3405                 .doit   = ip_vs_genl_set_cmd,
3406         },
3407         {
3408                 .cmd    = IPVS_CMD_SET_DEST,
3409                 .flags  = GENL_ADMIN_PERM,
3410                 .policy = ip_vs_cmd_policy,
3411                 .doit   = ip_vs_genl_set_cmd,
3412         },
3413         {
3414                 .cmd    = IPVS_CMD_DEL_DEST,
3415                 .flags  = GENL_ADMIN_PERM,
3416                 .policy = ip_vs_cmd_policy,
3417                 .doit   = ip_vs_genl_set_cmd,
3418         },
3419         {
3420                 .cmd    = IPVS_CMD_GET_DEST,
3421                 .flags  = GENL_ADMIN_PERM,
3422                 .policy = ip_vs_cmd_policy,
3423                 .dumpit = ip_vs_genl_dump_dests,
3424         },
3425         {
3426                 .cmd    = IPVS_CMD_NEW_DAEMON,
3427                 .flags  = GENL_ADMIN_PERM,
3428                 .policy = ip_vs_cmd_policy,
3429                 .doit   = ip_vs_genl_set_cmd,
3430         },
3431         {
3432                 .cmd    = IPVS_CMD_DEL_DAEMON,
3433                 .flags  = GENL_ADMIN_PERM,
3434                 .policy = ip_vs_cmd_policy,
3435                 .doit   = ip_vs_genl_set_cmd,
3436         },
3437         {
3438                 .cmd    = IPVS_CMD_GET_DAEMON,
3439                 .flags  = GENL_ADMIN_PERM,
3440                 .dumpit = ip_vs_genl_dump_daemons,
3441         },
3442         {
3443                 .cmd    = IPVS_CMD_SET_CONFIG,
3444                 .flags  = GENL_ADMIN_PERM,
3445                 .policy = ip_vs_cmd_policy,
3446                 .doit   = ip_vs_genl_set_cmd,
3447         },
3448         {
3449                 .cmd    = IPVS_CMD_GET_CONFIG,
3450                 .flags  = GENL_ADMIN_PERM,
3451                 .doit   = ip_vs_genl_get_cmd,
3452         },
3453         {
3454                 .cmd    = IPVS_CMD_GET_INFO,
3455                 .flags  = GENL_ADMIN_PERM,
3456                 .doit   = ip_vs_genl_get_cmd,
3457         },
3458         {
3459                 .cmd    = IPVS_CMD_ZERO,
3460                 .flags  = GENL_ADMIN_PERM,
3461                 .policy = ip_vs_cmd_policy,
3462                 .doit   = ip_vs_genl_set_cmd,
3463         },
3464         {
3465                 .cmd    = IPVS_CMD_FLUSH,
3466                 .flags  = GENL_ADMIN_PERM,
3467                 .doit   = ip_vs_genl_set_cmd,
3468         },
3469 };
3470
3471 static int __init ip_vs_genl_register(void)
3472 {
3473         return genl_register_family_with_ops(&ip_vs_genl_family,
3474                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3475 }
3476
3477 static void ip_vs_genl_unregister(void)
3478 {
3479         genl_unregister_family(&ip_vs_genl_family);
3480 }
3481
3482 /* End of Generic Netlink interface definitions */
3483
/*
 * Per-netns init/exit functions.
 */
3487 int __net_init __ip_vs_control_init(struct net *net)
3488 {
3489         int idx;
3490         struct netns_ipvs *ipvs = net_ipvs(net);
3491         struct ctl_table *tbl;
3492
3493         atomic_set(&ipvs->dropentry, 0);
3494         spin_lock_init(&ipvs->dropentry_lock);
3495         spin_lock_init(&ipvs->droppacket_lock);
3496         spin_lock_init(&ipvs->securetcp_lock);
3497         ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
3498
3499         /* Initialize rs_table */
3500         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3501                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3502
3503         INIT_LIST_HEAD(&ipvs->dest_trash);
3504         atomic_set(&ipvs->ftpsvc_counter, 0);
3505         atomic_set(&ipvs->nullsvc_counter, 0);
3506
3507         /* procfs stats */
3508         ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
3509         if (ipvs->tot_stats == NULL) {
3510                 pr_err("%s(): no memory.\n", __func__);
3511                 return -ENOMEM;
3512         }
3513         ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3514         if (!ipvs->cpustats) {
3515                 pr_err("%s() alloc_percpu failed\n", __func__);
3516                 goto err_alloc;
3517         }
3518         spin_lock_init(&ipvs->tot_stats->lock);
3519
3520         proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
3521         proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
3522         proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
3523                              &ip_vs_stats_percpu_fops);
3524
3525         if (!net_eq(net, &init_net)) {
3526                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3527                 if (tbl == NULL)
3528                         goto err_dup;
3529         } else
3530                 tbl = vs_vars;
3531         /* Initialize sysctl defaults */
3532         idx = 0;
3533         ipvs->sysctl_amemthresh = 1024;
3534         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3535         ipvs->sysctl_am_droprate = 10;
3536         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3537         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3538         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3539 #ifdef CONFIG_IP_VS_NFCT
3540         tbl[idx++].data = &ipvs->sysctl_conntrack;
3541 #endif
3542         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3543         ipvs->sysctl_snat_reroute = 1;
3544         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3545         ipvs->sysctl_sync_ver = 1;
3546         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3547         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3548         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3549         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3550         ipvs->sysctl_sync_threshold[0] = 3;
3551         ipvs->sysctl_sync_threshold[1] = 50;
3552         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3553         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3554         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3555
3556
3557 #ifdef CONFIG_SYSCTL
3558         ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
3559                                                      tbl);
3560         if (ipvs->sysctl_hdr == NULL) {
3561                 if (!net_eq(net, &init_net))
3562                         kfree(tbl);
3563                 goto err_dup;
3564         }
3565 #endif
3566         ip_vs_new_estimator(net, ipvs->tot_stats);
3567         ipvs->sysctl_tbl = tbl;
3568         /* Schedule defense work */
3569         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3570         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3571         return 0;
3572
3573 err_dup:
3574         free_percpu(ipvs->cpustats);
3575 err_alloc:
3576         kfree(ipvs->tot_stats);
3577         return -ENOMEM;
3578 }
3579
3580 static void __net_exit __ip_vs_control_cleanup(struct net *net)
3581 {
3582         struct netns_ipvs *ipvs = net_ipvs(net);
3583
3584         ip_vs_trash_cleanup(net);
3585         ip_vs_kill_estimator(net, ipvs->tot_stats);
3586         cancel_delayed_work_sync(&ipvs->defense_work);
3587         cancel_work_sync(&ipvs->defense_work.work);
3588 #ifdef CONFIG_SYSCTL
3589         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3590 #endif
3591         proc_net_remove(net, "ip_vs_stats_percpu");
3592         proc_net_remove(net, "ip_vs_stats");
3593         proc_net_remove(net, "ip_vs");
3594         free_percpu(ipvs->cpustats);
3595         kfree(ipvs->tot_stats);
3596 }
3597
3598 static struct pernet_operations ipvs_control_ops = {
3599         .init = __ip_vs_control_init,
3600         .exit = __ip_vs_control_cleanup,
3601 };
3602
3603 int __init ip_vs_control_init(void)
3604 {
3605         int idx;
3606         int ret;
3607
3608         EnterFunction(2);
3609
3610         /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3611         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
3612                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3613                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3614         }
3615
3616         ret = register_pernet_subsys(&ipvs_control_ops);
3617         if (ret) {
3618                 pr_err("cannot register namespace.\n");
3619                 goto err;
3620         }
3621
3622         smp_wmb();      /* Do we really need it now ? */
3623
3624         ret = nf_register_sockopt(&ip_vs_sockopts);
3625         if (ret) {
3626                 pr_err("cannot register sockopt.\n");
3627                 goto err_net;
3628         }
3629
3630         ret = ip_vs_genl_register();
3631         if (ret) {
3632                 pr_err("cannot register Generic Netlink interface.\n");
3633                 nf_unregister_sockopt(&ip_vs_sockopts);
3634                 goto err_net;
3635         }
3636
3637         LeaveFunction(2);
3638         return 0;
3639
3640 err_net:
3641         unregister_pernet_subsys(&ipvs_control_ops);
3642 err:
3643         return ret;
3644 }
3645
3646
3647 void ip_vs_control_cleanup(void)
3648 {
3649         EnterFunction(2);
3650         unregister_pernet_subsys(&ipvs_control_ops);
3651         ip_vs_genl_unregister();
3652         nf_unregister_sockopt(&ip_vs_sockopts);
3653         LeaveFunction(2);
3654 }