IPVS: netns, final patch enabling network name space.
[linux-2.6.git] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
/* Mutex serializing the IPVS sockopt handlers; [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);

/* Reader/writer lock protecting the virtual service hash tables. */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity; only built when CONFIG_IP_VS_DEBUG is enabled. */
static int sysctl_ip_vs_debug_level;

/* Return the current IPVS debug level. */
int ip_vs_get_debug_level(void)
{
	int level = sysctl_ip_vs_debug_level;

	return level;
}
70 #endif
71
72 #ifdef CONFIG_IP_VS_IPV6
73 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
74 static int __ip_vs_addr_is_local_v6(struct net *net,
75                                     const struct in6_addr *addr)
76 {
77         struct rt6_info *rt;
78         struct flowi fl = {
79                 .oif = 0,
80                 .fl6_dst = *addr,
81                 .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
82         };
83
84         rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
85         if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
86                         return 1;
87
88         return 0;
89 }
90 #endif
91 /*
92  *      update_defense_level is called from keventd and from sysctl,
93  *      so it needs to protect itself from softirqs
94  */
95 static void update_defense_level(struct netns_ipvs *ipvs)
96 {
97         struct sysinfo i;
98         static int old_secure_tcp = 0;
99         int availmem;
100         int nomem;
101         int to_change = -1;
102
103         /* we only count free and buffered memory (in pages) */
104         si_meminfo(&i);
105         availmem = i.freeram + i.bufferram;
106         /* however in linux 2.5 the i.bufferram is total page cache size,
107            we need adjust it */
108         /* si_swapinfo(&i); */
109         /* availmem = availmem - (i.totalswap - i.freeswap); */
110
111         nomem = (availmem < ipvs->sysctl_amemthresh);
112
113         local_bh_disable();
114
115         /* drop_entry */
116         spin_lock(&ipvs->dropentry_lock);
117         switch (ipvs->sysctl_drop_entry) {
118         case 0:
119                 atomic_set(&ipvs->dropentry, 0);
120                 break;
121         case 1:
122                 if (nomem) {
123                         atomic_set(&ipvs->dropentry, 1);
124                         ipvs->sysctl_drop_entry = 2;
125                 } else {
126                         atomic_set(&ipvs->dropentry, 0);
127                 }
128                 break;
129         case 2:
130                 if (nomem) {
131                         atomic_set(&ipvs->dropentry, 1);
132                 } else {
133                         atomic_set(&ipvs->dropentry, 0);
134                         ipvs->sysctl_drop_entry = 1;
135                 };
136                 break;
137         case 3:
138                 atomic_set(&ipvs->dropentry, 1);
139                 break;
140         }
141         spin_unlock(&ipvs->dropentry_lock);
142
143         /* drop_packet */
144         spin_lock(&ipvs->droppacket_lock);
145         switch (ipvs->sysctl_drop_packet) {
146         case 0:
147                 ipvs->drop_rate = 0;
148                 break;
149         case 1:
150                 if (nomem) {
151                         ipvs->drop_rate = ipvs->drop_counter
152                                 = ipvs->sysctl_amemthresh /
153                                 (ipvs->sysctl_amemthresh-availmem);
154                         ipvs->sysctl_drop_packet = 2;
155                 } else {
156                         ipvs->drop_rate = 0;
157                 }
158                 break;
159         case 2:
160                 if (nomem) {
161                         ipvs->drop_rate = ipvs->drop_counter
162                                 = ipvs->sysctl_amemthresh /
163                                 (ipvs->sysctl_amemthresh-availmem);
164                 } else {
165                         ipvs->drop_rate = 0;
166                         ipvs->sysctl_drop_packet = 1;
167                 }
168                 break;
169         case 3:
170                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
171                 break;
172         }
173         spin_unlock(&ipvs->droppacket_lock);
174
175         /* secure_tcp */
176         spin_lock(&ipvs->securetcp_lock);
177         switch (ipvs->sysctl_secure_tcp) {
178         case 0:
179                 if (old_secure_tcp >= 2)
180                         to_change = 0;
181                 break;
182         case 1:
183                 if (nomem) {
184                         if (old_secure_tcp < 2)
185                                 to_change = 1;
186                         ipvs->sysctl_secure_tcp = 2;
187                 } else {
188                         if (old_secure_tcp >= 2)
189                                 to_change = 0;
190                 }
191                 break;
192         case 2:
193                 if (nomem) {
194                         if (old_secure_tcp < 2)
195                                 to_change = 1;
196                 } else {
197                         if (old_secure_tcp >= 2)
198                                 to_change = 0;
199                         ipvs->sysctl_secure_tcp = 1;
200                 }
201                 break;
202         case 3:
203                 if (old_secure_tcp < 2)
204                         to_change = 1;
205                 break;
206         }
207         old_secure_tcp = ipvs->sysctl_secure_tcp;
208         if (to_change >= 0)
209                 ip_vs_protocol_timeout_change(ipvs,
210                                               ipvs->sysctl_secure_tcp > 1);
211         spin_unlock(&ipvs->securetcp_lock);
212
213         local_bh_enable();
214 }
215
216
217 /*
218  *      Timer for checking the defense
219  */
220 #define DEFENSE_TIMER_PERIOD    1*HZ
221
222 static void defense_work_handler(struct work_struct *work)
223 {
224         struct netns_ipvs *ipvs =
225                 container_of(work, struct netns_ipvs, defense_work.work);
226
227         update_defense_level(ipvs);
228         if (atomic_read(&ipvs->dropentry))
229                 ip_vs_random_dropentry(ipvs->net);
230         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
231 }
232
233 int
234 ip_vs_use_count_inc(void)
235 {
236         return try_module_get(THIS_MODULE);
237 }
238
239 void
240 ip_vs_use_count_dec(void)
241 {
242         module_put(THIS_MODULE);
243 }
244
245
246 /*
247  *      Hash table: for virtual service lookups
248  */
249 #define IP_VS_SVC_TAB_BITS 8
250 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
251 #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
252
253 /* the service table hashed by <protocol, addr, port> */
254 static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
255 /* the service table hashed by fwmark */
256 static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
257
258
259 /*
260  *      Returns hash value for virtual service
261  */
262 static inline unsigned
263 ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
264                   const union nf_inet_addr *addr, __be16 port)
265 {
266         register unsigned porth = ntohs(port);
267         __be32 addr_fold = addr->ip;
268
269 #ifdef CONFIG_IP_VS_IPV6
270         if (af == AF_INET6)
271                 addr_fold = addr->ip6[0]^addr->ip6[1]^
272                             addr->ip6[2]^addr->ip6[3];
273 #endif
274         addr_fold ^= ((size_t)net>>8);
275
276         return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
277                 & IP_VS_SVC_TAB_MASK;
278 }
279
280 /*
281  *      Returns hash value of fwmark for virtual service lookup
282  */
283 static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
284 {
285         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
286 }
287
288 /*
289  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
290  *      or in the ip_vs_svc_fwm_table by fwmark.
291  *      Should be called with locked tables.
292  */
293 static int ip_vs_svc_hash(struct ip_vs_service *svc)
294 {
295         unsigned hash;
296
297         if (svc->flags & IP_VS_SVC_F_HASHED) {
298                 pr_err("%s(): request for already hashed, called from %pF\n",
299                        __func__, __builtin_return_address(0));
300                 return 0;
301         }
302
303         if (svc->fwmark == 0) {
304                 /*
305                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
306                  */
307                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
308                                          &svc->addr, svc->port);
309                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
310         } else {
311                 /*
312                  *  Hash it by fwmark in svc_fwm_table
313                  */
314                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
315                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
316         }
317
318         svc->flags |= IP_VS_SVC_F_HASHED;
319         /* increase its refcnt because it is referenced by the svc table */
320         atomic_inc(&svc->refcnt);
321         return 1;
322 }
323
324
325 /*
326  *      Unhashes a service from svc_table / svc_fwm_table.
327  *      Should be called with locked tables.
328  */
329 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
330 {
331         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
332                 pr_err("%s(): request for unhash flagged, called from %pF\n",
333                        __func__, __builtin_return_address(0));
334                 return 0;
335         }
336
337         if (svc->fwmark == 0) {
338                 /* Remove it from the svc_table table */
339                 list_del(&svc->s_list);
340         } else {
341                 /* Remove it from the svc_fwm_table table */
342                 list_del(&svc->f_list);
343         }
344
345         svc->flags &= ~IP_VS_SVC_F_HASHED;
346         atomic_dec(&svc->refcnt);
347         return 1;
348 }
349
350
351 /*
352  *      Get service by {netns, proto,addr,port} in the service table.
353  */
354 static inline struct ip_vs_service *
355 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
356                      const union nf_inet_addr *vaddr, __be16 vport)
357 {
358         unsigned hash;
359         struct ip_vs_service *svc;
360
361         /* Check for "full" addressed entries */
362         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
363
364         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
365                 if ((svc->af == af)
366                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
367                     && (svc->port == vport)
368                     && (svc->protocol == protocol)
369                     && net_eq(svc->net, net)) {
370                         /* HIT */
371                         return svc;
372                 }
373         }
374
375         return NULL;
376 }
377
378
379 /*
380  *      Get service by {fwmark} in the service table.
381  */
382 static inline struct ip_vs_service *
383 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
384 {
385         unsigned hash;
386         struct ip_vs_service *svc;
387
388         /* Check for fwmark addressed entries */
389         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
390
391         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
392                 if (svc->fwmark == fwmark && svc->af == af
393                     && net_eq(svc->net, net)) {
394                         /* HIT */
395                         return svc;
396                 }
397         }
398
399         return NULL;
400 }
401
402 struct ip_vs_service *
403 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
404                   const union nf_inet_addr *vaddr, __be16 vport)
405 {
406         struct ip_vs_service *svc;
407         struct netns_ipvs *ipvs = net_ipvs(net);
408
409         read_lock(&__ip_vs_svc_lock);
410
411         /*
412          *      Check the table hashed by fwmark first
413          */
414         svc = __ip_vs_svc_fwm_find(net, af, fwmark);
415         if (fwmark && svc)
416                 goto out;
417
418         /*
419          *      Check the table hashed by <protocol,addr,port>
420          *      for "full" addressed entries
421          */
422         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
423
424         if (svc == NULL
425             && protocol == IPPROTO_TCP
426             && atomic_read(&ipvs->ftpsvc_counter)
427             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
428                 /*
429                  * Check if ftp service entry exists, the packet
430                  * might belong to FTP data connections.
431                  */
432                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
433         }
434
435         if (svc == NULL
436             && atomic_read(&ipvs->nullsvc_counter)) {
437                 /*
438                  * Check if the catch-all port (port zero) exists
439                  */
440                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
441         }
442
443   out:
444         if (svc)
445                 atomic_inc(&svc->usecnt);
446         read_unlock(&__ip_vs_svc_lock);
447
448         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
449                       fwmark, ip_vs_proto_name(protocol),
450                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
451                       svc ? "hit" : "not hit");
452
453         return svc;
454 }
455
456
457 static inline void
458 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
459 {
460         atomic_inc(&svc->refcnt);
461         dest->svc = svc;
462 }
463
464 static void
465 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
466 {
467         struct ip_vs_service *svc = dest->svc;
468
469         dest->svc = NULL;
470         if (atomic_dec_and_test(&svc->refcnt)) {
471                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
472                               svc->fwmark,
473                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
474                               ntohs(svc->port), atomic_read(&svc->usecnt));
475                 free_percpu(svc->stats.cpustats);
476                 kfree(svc);
477         }
478 }
479
480
481 /*
482  *      Returns hash value for real service
483  */
484 static inline unsigned ip_vs_rs_hashkey(int af,
485                                             const union nf_inet_addr *addr,
486                                             __be16 port)
487 {
488         register unsigned porth = ntohs(port);
489         __be32 addr_fold = addr->ip;
490
491 #ifdef CONFIG_IP_VS_IPV6
492         if (af == AF_INET6)
493                 addr_fold = addr->ip6[0]^addr->ip6[1]^
494                             addr->ip6[2]^addr->ip6[3];
495 #endif
496
497         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
498                 & IP_VS_RTAB_MASK;
499 }
500
501 /*
502  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
503  *      should be called with locked tables.
504  */
505 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
506 {
507         unsigned hash;
508
509         if (!list_empty(&dest->d_list)) {
510                 return 0;
511         }
512
513         /*
514          *      Hash by proto,addr,port,
515          *      which are the parameters of the real service.
516          */
517         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
518
519         list_add(&dest->d_list, &ipvs->rs_table[hash]);
520
521         return 1;
522 }
523
524 /*
525  *      UNhashes ip_vs_dest from rs_table.
526  *      should be called with locked tables.
527  */
528 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
529 {
530         /*
531          * Remove it from the rs_table table.
532          */
533         if (!list_empty(&dest->d_list)) {
534                 list_del(&dest->d_list);
535                 INIT_LIST_HEAD(&dest->d_list);
536         }
537
538         return 1;
539 }
540
541 /*
542  *      Lookup real service by <proto,addr,port> in the real service table.
543  */
544 struct ip_vs_dest *
545 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
546                           const union nf_inet_addr *daddr,
547                           __be16 dport)
548 {
549         struct netns_ipvs *ipvs = net_ipvs(net);
550         unsigned hash;
551         struct ip_vs_dest *dest;
552
553         /*
554          *      Check for "full" addressed entries
555          *      Return the first found entry
556          */
557         hash = ip_vs_rs_hashkey(af, daddr, dport);
558
559         read_lock(&ipvs->rs_lock);
560         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
561                 if ((dest->af == af)
562                     && ip_vs_addr_equal(af, &dest->addr, daddr)
563                     && (dest->port == dport)
564                     && ((dest->protocol == protocol) ||
565                         dest->vfwmark)) {
566                         /* HIT */
567                         read_unlock(&ipvs->rs_lock);
568                         return dest;
569                 }
570         }
571         read_unlock(&ipvs->rs_lock);
572
573         return NULL;
574 }
575
576 /*
577  *      Lookup destination by {addr,port} in the given service
578  */
579 static struct ip_vs_dest *
580 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
581                   __be16 dport)
582 {
583         struct ip_vs_dest *dest;
584
585         /*
586          * Find the destination for the given service
587          */
588         list_for_each_entry(dest, &svc->destinations, n_list) {
589                 if ((dest->af == svc->af)
590                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
591                     && (dest->port == dport)) {
592                         /* HIT */
593                         return dest;
594                 }
595         }
596
597         return NULL;
598 }
599
600 /*
601  * Find destination by {daddr,dport,vaddr,protocol}
602  * Cretaed to be used in ip_vs_process_message() in
603  * the backup synchronization daemon. It finds the
604  * destination to be bound to the received connection
605  * on the backup.
606  *
607  * ip_vs_lookup_real_service() looked promissing, but
608  * seems not working as expected.
609  */
610 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
611                                    const union nf_inet_addr *daddr,
612                                    __be16 dport,
613                                    const union nf_inet_addr *vaddr,
614                                    __be16 vport, __u16 protocol, __u32 fwmark)
615 {
616         struct ip_vs_dest *dest;
617         struct ip_vs_service *svc;
618
619         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
620         if (!svc)
621                 return NULL;
622         dest = ip_vs_lookup_dest(svc, daddr, dport);
623         if (dest)
624                 atomic_inc(&dest->refcnt);
625         ip_vs_service_put(svc);
626         return dest;
627 }
628
629 /*
630  *  Lookup dest by {svc,addr,port} in the destination trash.
631  *  The destination trash is used to hold the destinations that are removed
632  *  from the service table but are still referenced by some conn entries.
633  *  The reason to add the destination trash is when the dest is temporary
634  *  down (either by administrator or by monitor program), the dest can be
635  *  picked back from the trash, the remaining connections to the dest can
636  *  continue, and the counting information of the dest is also useful for
637  *  scheduling.
638  */
639 static struct ip_vs_dest *
640 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
641                      __be16 dport)
642 {
643         struct ip_vs_dest *dest, *nxt;
644         struct netns_ipvs *ipvs = net_ipvs(svc->net);
645
646         /*
647          * Find the destination in trash
648          */
649         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
650                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
651                               "dest->refcnt=%d\n",
652                               dest->vfwmark,
653                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
654                               ntohs(dest->port),
655                               atomic_read(&dest->refcnt));
656                 if (dest->af == svc->af &&
657                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
658                     dest->port == dport &&
659                     dest->vfwmark == svc->fwmark &&
660                     dest->protocol == svc->protocol &&
661                     (svc->fwmark ||
662                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
663                       dest->vport == svc->port))) {
664                         /* HIT */
665                         return dest;
666                 }
667
668                 /*
669                  * Try to purge the destination from trash if not referenced
670                  */
671                 if (atomic_read(&dest->refcnt) == 1) {
672                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
673                                       "from trash\n",
674                                       dest->vfwmark,
675                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
676                                       ntohs(dest->port));
677                         list_del(&dest->n_list);
678                         ip_vs_dst_reset(dest);
679                         __ip_vs_unbind_svc(dest);
680                         free_percpu(dest->stats.cpustats);
681                         kfree(dest);
682                 }
683         }
684
685         return NULL;
686 }
687
688
689 /*
690  *  Clean up all the destinations in the trash
691  *  Called by the ip_vs_control_cleanup()
692  *
693  *  When the ip_vs_control_clearup is activated by ipvs module exit,
694  *  the service tables must have been flushed and all the connections
695  *  are expired, and the refcnt of each destination in the trash must
696  *  be 1, so we simply release them here.
697  */
698 static void ip_vs_trash_cleanup(struct net *net)
699 {
700         struct ip_vs_dest *dest, *nxt;
701         struct netns_ipvs *ipvs = net_ipvs(net);
702
703         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
704                 list_del(&dest->n_list);
705                 ip_vs_dst_reset(dest);
706                 __ip_vs_unbind_svc(dest);
707                 free_percpu(dest->stats.cpustats);
708                 kfree(dest);
709         }
710 }
711
712
713 static void
714 ip_vs_zero_stats(struct ip_vs_stats *stats)
715 {
716         spin_lock_bh(&stats->lock);
717
718         memset(&stats->ustats, 0, sizeof(stats->ustats));
719         ip_vs_zero_estimator(stats);
720
721         spin_unlock_bh(&stats->lock);
722 }
723
724 /*
725  *      Update a destination in the given service
726  */
727 static void
728 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
729                     struct ip_vs_dest_user_kern *udest, int add)
730 {
731         struct netns_ipvs *ipvs = net_ipvs(svc->net);
732         int conn_flags;
733
734         /* set the weight and the flags */
735         atomic_set(&dest->weight, udest->weight);
736         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
737         conn_flags |= IP_VS_CONN_F_INACTIVE;
738
739         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
740         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
741                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
742         } else {
743                 /*
744                  *    Put the real service in rs_table if not present.
745                  *    For now only for NAT!
746                  */
747                 write_lock_bh(&ipvs->rs_lock);
748                 ip_vs_rs_hash(ipvs, dest);
749                 write_unlock_bh(&ipvs->rs_lock);
750         }
751         atomic_set(&dest->conn_flags, conn_flags);
752
753         /* bind the service */
754         if (!dest->svc) {
755                 __ip_vs_bind_svc(dest, svc);
756         } else {
757                 if (dest->svc != svc) {
758                         __ip_vs_unbind_svc(dest);
759                         ip_vs_zero_stats(&dest->stats);
760                         __ip_vs_bind_svc(dest, svc);
761                 }
762         }
763
764         /* set the dest status flags */
765         dest->flags |= IP_VS_DEST_F_AVAILABLE;
766
767         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
768                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
769         dest->u_threshold = udest->u_threshold;
770         dest->l_threshold = udest->l_threshold;
771
772         spin_lock(&dest->dst_lock);
773         ip_vs_dst_reset(dest);
774         spin_unlock(&dest->dst_lock);
775
776         if (add)
777                 ip_vs_new_estimator(svc->net, &dest->stats);
778
779         write_lock_bh(&__ip_vs_svc_lock);
780
781         /* Wait until all other svc users go away */
782         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
783
784         if (add) {
785                 list_add(&dest->n_list, &svc->destinations);
786                 svc->num_dests++;
787         }
788
789         /* call the update_service, because server weight may be changed */
790         if (svc->scheduler->update_service)
791                 svc->scheduler->update_service(svc);
792
793         write_unlock_bh(&__ip_vs_svc_lock);
794 }
795
796
797 /*
798  *      Create a destination for the given service
799  */
800 static int
801 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
802                struct ip_vs_dest **dest_p)
803 {
804         struct ip_vs_dest *dest;
805         unsigned atype;
806
807         EnterFunction(2);
808
809 #ifdef CONFIG_IP_VS_IPV6
810         if (svc->af == AF_INET6) {
811                 atype = ipv6_addr_type(&udest->addr.in6);
812                 if ((!(atype & IPV6_ADDR_UNICAST) ||
813                         atype & IPV6_ADDR_LINKLOCAL) &&
814                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
815                         return -EINVAL;
816         } else
817 #endif
818         {
819                 atype = inet_addr_type(svc->net, udest->addr.ip);
820                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
821                         return -EINVAL;
822         }
823
824         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
825         if (dest == NULL) {
826                 pr_err("%s(): no memory.\n", __func__);
827                 return -ENOMEM;
828         }
829         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
830         if (!dest->stats.cpustats) {
831                 pr_err("%s() alloc_percpu failed\n", __func__);
832                 goto err_alloc;
833         }
834
835         dest->af = svc->af;
836         dest->protocol = svc->protocol;
837         dest->vaddr = svc->addr;
838         dest->vport = svc->port;
839         dest->vfwmark = svc->fwmark;
840         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
841         dest->port = udest->port;
842
843         atomic_set(&dest->activeconns, 0);
844         atomic_set(&dest->inactconns, 0);
845         atomic_set(&dest->persistconns, 0);
846         atomic_set(&dest->refcnt, 1);
847
848         INIT_LIST_HEAD(&dest->d_list);
849         spin_lock_init(&dest->dst_lock);
850         spin_lock_init(&dest->stats.lock);
851         __ip_vs_update_dest(svc, dest, udest, 1);
852
853         *dest_p = dest;
854
855         LeaveFunction(2);
856         return 0;
857
858 err_alloc:
859         kfree(dest);
860         return -ENOMEM;
861 }
862
863
864 /*
865  *      Add a destination into an existing service
866  */
867 static int
868 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
869 {
870         struct ip_vs_dest *dest;
871         union nf_inet_addr daddr;
872         __be16 dport = udest->port;
873         int ret;
874
875         EnterFunction(2);
876
877         if (udest->weight < 0) {
878                 pr_err("%s(): server weight less than zero\n", __func__);
879                 return -ERANGE;
880         }
881
882         if (udest->l_threshold > udest->u_threshold) {
883                 pr_err("%s(): lower threshold is higher than upper threshold\n",
884                         __func__);
885                 return -ERANGE;
886         }
887
888         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
889
890         /*
891          * Check if the dest already exists in the list
892          */
893         dest = ip_vs_lookup_dest(svc, &daddr, dport);
894
895         if (dest != NULL) {
896                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
897                 return -EEXIST;
898         }
899
900         /*
901          * Check if the dest already exists in the trash and
902          * is from the same service
903          */
904         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
905
906         if (dest != NULL) {
907                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
908                               "dest->refcnt=%d, service %u/%s:%u\n",
909                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
910                               atomic_read(&dest->refcnt),
911                               dest->vfwmark,
912                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
913                               ntohs(dest->vport));
914
915                 /*
916                  * Get the destination from the trash
917                  */
918                 list_del(&dest->n_list);
919
920                 __ip_vs_update_dest(svc, dest, udest, 1);
921                 ret = 0;
922         } else {
923                 /*
924                  * Allocate and initialize the dest structure
925                  */
926                 ret = ip_vs_new_dest(svc, udest, &dest);
927         }
928         LeaveFunction(2);
929
930         return ret;
931 }
932
933
934 /*
935  *      Edit a destination in the given service
936  */
937 static int
938 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
939 {
940         struct ip_vs_dest *dest;
941         union nf_inet_addr daddr;
942         __be16 dport = udest->port;
943
944         EnterFunction(2);
945
946         if (udest->weight < 0) {
947                 pr_err("%s(): server weight less than zero\n", __func__);
948                 return -ERANGE;
949         }
950
951         if (udest->l_threshold > udest->u_threshold) {
952                 pr_err("%s(): lower threshold is higher than upper threshold\n",
953                         __func__);
954                 return -ERANGE;
955         }
956
957         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
958
959         /*
960          *  Lookup the destination list
961          */
962         dest = ip_vs_lookup_dest(svc, &daddr, dport);
963
964         if (dest == NULL) {
965                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
966                 return -ENOENT;
967         }
968
969         __ip_vs_update_dest(svc, dest, udest, 0);
970         LeaveFunction(2);
971
972         return 0;
973 }
974
975
976 /*
977  *      Delete a destination (must be already unlinked from the service)
978  */
979 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
980 {
981         struct netns_ipvs *ipvs = net_ipvs(net);
982
983         ip_vs_kill_estimator(net, &dest->stats);
984
985         /*
986          *  Remove it from the d-linked list with the real services.
987          */
988         write_lock_bh(&ipvs->rs_lock);
989         ip_vs_rs_unhash(dest);
990         write_unlock_bh(&ipvs->rs_lock);
991
992         /*
993          *  Decrease the refcnt of the dest, and free the dest
994          *  if nobody refers to it (refcnt=0). Otherwise, throw
995          *  the destination into the trash.
996          */
997         if (atomic_dec_and_test(&dest->refcnt)) {
998                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
999                               dest->vfwmark,
1000                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1001                               ntohs(dest->port));
1002                 ip_vs_dst_reset(dest);
1003                 /* simply decrease svc->refcnt here, let the caller check
1004                    and release the service if nobody refers to it.
1005                    Only user context can release destination and service,
1006                    and only one user context can update virtual service at a
1007                    time, so the operation here is OK */
1008                 atomic_dec(&dest->svc->refcnt);
1009                 free_percpu(dest->stats.cpustats);
1010                 kfree(dest);
1011         } else {
1012                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1013                               "dest->refcnt=%d\n",
1014                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1015                               ntohs(dest->port),
1016                               atomic_read(&dest->refcnt));
1017                 list_add(&dest->n_list, &ipvs->dest_trash);
1018                 atomic_inc(&dest->refcnt);
1019         }
1020 }
1021
1022
1023 /*
1024  *      Unlink a destination from the given service
1025  */
1026 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1027                                 struct ip_vs_dest *dest,
1028                                 int svcupd)
1029 {
1030         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1031
1032         /*
1033          *  Remove it from the d-linked destination list.
1034          */
1035         list_del(&dest->n_list);
1036         svc->num_dests--;
1037
1038         /*
1039          *  Call the update_service function of its scheduler
1040          */
1041         if (svcupd && svc->scheduler->update_service)
1042                         svc->scheduler->update_service(svc);
1043 }
1044
1045
1046 /*
1047  *      Delete a destination server in the given service
1048  */
1049 static int
1050 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1051 {
1052         struct ip_vs_dest *dest;
1053         __be16 dport = udest->port;
1054
1055         EnterFunction(2);
1056
1057         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1058
1059         if (dest == NULL) {
1060                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1061                 return -ENOENT;
1062         }
1063
1064         write_lock_bh(&__ip_vs_svc_lock);
1065
1066         /*
1067          *      Wait until all other svc users go away.
1068          */
1069         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1070
1071         /*
1072          *      Unlink dest from the service
1073          */
1074         __ip_vs_unlink_dest(svc, dest, 1);
1075
1076         write_unlock_bh(&__ip_vs_svc_lock);
1077
1078         /*
1079          *      Delete the destination
1080          */
1081         __ip_vs_del_dest(svc->net, dest);
1082
1083         LeaveFunction(2);
1084
1085         return 0;
1086 }
1087
1088
1089 /*
1090  *      Add a service into the service hash table
1091  */
1092 static int
1093 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1094                   struct ip_vs_service **svc_p)
1095 {
1096         int ret = 0;
1097         struct ip_vs_scheduler *sched = NULL;
1098         struct ip_vs_pe *pe = NULL;
1099         struct ip_vs_service *svc = NULL;
1100         struct netns_ipvs *ipvs = net_ipvs(net);
1101
1102         /* increase the module use count */
1103         ip_vs_use_count_inc();
1104
1105         /* Lookup the scheduler by 'u->sched_name' */
1106         sched = ip_vs_scheduler_get(u->sched_name);
1107         if (sched == NULL) {
1108                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1109                 ret = -ENOENT;
1110                 goto out_err;
1111         }
1112
1113         if (u->pe_name && *u->pe_name) {
1114                 pe = ip_vs_pe_getbyname(u->pe_name);
1115                 if (pe == NULL) {
1116                         pr_info("persistence engine module ip_vs_pe_%s "
1117                                 "not found\n", u->pe_name);
1118                         ret = -ENOENT;
1119                         goto out_err;
1120                 }
1121         }
1122
1123 #ifdef CONFIG_IP_VS_IPV6
1124         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1125                 ret = -EINVAL;
1126                 goto out_err;
1127         }
1128 #endif
1129
1130         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1131         if (svc == NULL) {
1132                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1133                 ret = -ENOMEM;
1134                 goto out_err;
1135         }
1136         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1137         if (!svc->stats.cpustats) {
1138                 pr_err("%s() alloc_percpu failed\n", __func__);
1139                 goto out_err;
1140         }
1141
1142         /* I'm the first user of the service */
1143         atomic_set(&svc->usecnt, 0);
1144         atomic_set(&svc->refcnt, 0);
1145
1146         svc->af = u->af;
1147         svc->protocol = u->protocol;
1148         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1149         svc->port = u->port;
1150         svc->fwmark = u->fwmark;
1151         svc->flags = u->flags;
1152         svc->timeout = u->timeout * HZ;
1153         svc->netmask = u->netmask;
1154         svc->net = net;
1155
1156         INIT_LIST_HEAD(&svc->destinations);
1157         rwlock_init(&svc->sched_lock);
1158         spin_lock_init(&svc->stats.lock);
1159
1160         /* Bind the scheduler */
1161         ret = ip_vs_bind_scheduler(svc, sched);
1162         if (ret)
1163                 goto out_err;
1164         sched = NULL;
1165
1166         /* Bind the ct retriever */
1167         ip_vs_bind_pe(svc, pe);
1168         pe = NULL;
1169
1170         /* Update the virtual service counters */
1171         if (svc->port == FTPPORT)
1172                 atomic_inc(&ipvs->ftpsvc_counter);
1173         else if (svc->port == 0)
1174                 atomic_inc(&ipvs->nullsvc_counter);
1175
1176         ip_vs_new_estimator(net, &svc->stats);
1177
1178         /* Count only IPv4 services for old get/setsockopt interface */
1179         if (svc->af == AF_INET)
1180                 ipvs->num_services++;
1181
1182         /* Hash the service into the service table */
1183         write_lock_bh(&__ip_vs_svc_lock);
1184         ip_vs_svc_hash(svc);
1185         write_unlock_bh(&__ip_vs_svc_lock);
1186
1187         *svc_p = svc;
1188         return 0;
1189
1190
1191  out_err:
1192         if (svc != NULL) {
1193                 ip_vs_unbind_scheduler(svc);
1194                 if (svc->inc) {
1195                         local_bh_disable();
1196                         ip_vs_app_inc_put(svc->inc);
1197                         local_bh_enable();
1198                 }
1199                 if (svc->stats.cpustats)
1200                         free_percpu(svc->stats.cpustats);
1201                 kfree(svc);
1202         }
1203         ip_vs_scheduler_put(sched);
1204         ip_vs_pe_put(pe);
1205
1206         /* decrease the module use count */
1207         ip_vs_use_count_dec();
1208
1209         return ret;
1210 }
1211
1212
1213 /*
1214  *      Edit a service and bind it with a new scheduler
1215  */
1216 static int
1217 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1218 {
1219         struct ip_vs_scheduler *sched, *old_sched;
1220         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1221         int ret = 0;
1222
1223         /*
1224          * Lookup the scheduler, by 'u->sched_name'
1225          */
1226         sched = ip_vs_scheduler_get(u->sched_name);
1227         if (sched == NULL) {
1228                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1229                 return -ENOENT;
1230         }
1231         old_sched = sched;
1232
1233         if (u->pe_name && *u->pe_name) {
1234                 pe = ip_vs_pe_getbyname(u->pe_name);
1235                 if (pe == NULL) {
1236                         pr_info("persistence engine module ip_vs_pe_%s "
1237                                 "not found\n", u->pe_name);
1238                         ret = -ENOENT;
1239                         goto out;
1240                 }
1241                 old_pe = pe;
1242         }
1243
1244 #ifdef CONFIG_IP_VS_IPV6
1245         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1246                 ret = -EINVAL;
1247                 goto out;
1248         }
1249 #endif
1250
1251         write_lock_bh(&__ip_vs_svc_lock);
1252
1253         /*
1254          * Wait until all other svc users go away.
1255          */
1256         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1257
1258         /*
1259          * Set the flags and timeout value
1260          */
1261         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1262         svc->timeout = u->timeout * HZ;
1263         svc->netmask = u->netmask;
1264
1265         old_sched = svc->scheduler;
1266         if (sched != old_sched) {
1267                 /*
1268                  * Unbind the old scheduler
1269                  */
1270                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1271                         old_sched = sched;
1272                         goto out_unlock;
1273                 }
1274
1275                 /*
1276                  * Bind the new scheduler
1277                  */
1278                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1279                         /*
1280                          * If ip_vs_bind_scheduler fails, restore the old
1281                          * scheduler.
1282                          * The main reason of failure is out of memory.
1283                          *
1284                          * The question is if the old scheduler can be
1285                          * restored all the time. TODO: if it cannot be
1286                          * restored some time, we must delete the service,
1287                          * otherwise the system may crash.
1288                          */
1289                         ip_vs_bind_scheduler(svc, old_sched);
1290                         old_sched = sched;
1291                         goto out_unlock;
1292                 }
1293         }
1294
1295         old_pe = svc->pe;
1296         if (pe != old_pe) {
1297                 ip_vs_unbind_pe(svc);
1298                 ip_vs_bind_pe(svc, pe);
1299         }
1300
1301   out_unlock:
1302         write_unlock_bh(&__ip_vs_svc_lock);
1303   out:
1304         ip_vs_scheduler_put(old_sched);
1305         ip_vs_pe_put(old_pe);
1306         return ret;
1307 }
1308
1309
1310 /*
1311  *      Delete a service from the service list
1312  *      - The service must be unlinked, unlocked and not referenced!
1313  *      - We are called under _bh lock
1314  */
1315 static void __ip_vs_del_service(struct ip_vs_service *svc)
1316 {
1317         struct ip_vs_dest *dest, *nxt;
1318         struct ip_vs_scheduler *old_sched;
1319         struct ip_vs_pe *old_pe;
1320         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1321
1322         pr_info("%s: enter\n", __func__);
1323
1324         /* Count only IPv4 services for old get/setsockopt interface */
1325         if (svc->af == AF_INET)
1326                 ipvs->num_services--;
1327
1328         ip_vs_kill_estimator(svc->net, &svc->stats);
1329
1330         /* Unbind scheduler */
1331         old_sched = svc->scheduler;
1332         ip_vs_unbind_scheduler(svc);
1333         ip_vs_scheduler_put(old_sched);
1334
1335         /* Unbind persistence engine */
1336         old_pe = svc->pe;
1337         ip_vs_unbind_pe(svc);
1338         ip_vs_pe_put(old_pe);
1339
1340         /* Unbind app inc */
1341         if (svc->inc) {
1342                 ip_vs_app_inc_put(svc->inc);
1343                 svc->inc = NULL;
1344         }
1345
1346         /*
1347          *    Unlink the whole destination list
1348          */
1349         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1350                 __ip_vs_unlink_dest(svc, dest, 0);
1351                 __ip_vs_del_dest(svc->net, dest);
1352         }
1353
1354         /*
1355          *    Update the virtual service counters
1356          */
1357         if (svc->port == FTPPORT)
1358                 atomic_dec(&ipvs->ftpsvc_counter);
1359         else if (svc->port == 0)
1360                 atomic_dec(&ipvs->nullsvc_counter);
1361
1362         /*
1363          *    Free the service if nobody refers to it
1364          */
1365         if (atomic_read(&svc->refcnt) == 0) {
1366                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1367                               svc->fwmark,
1368                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1369                               ntohs(svc->port), atomic_read(&svc->usecnt));
1370                 free_percpu(svc->stats.cpustats);
1371                 kfree(svc);
1372         }
1373
1374         /* decrease the module use count */
1375         ip_vs_use_count_dec();
1376 }
1377
1378 /*
1379  * Unlink a service from list and try to delete it if its refcnt reached 0
1380  */
1381 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1382 {
1383         /*
1384          * Unhash it from the service table
1385          */
1386         write_lock_bh(&__ip_vs_svc_lock);
1387
1388         ip_vs_svc_unhash(svc);
1389
1390         /*
1391          * Wait until all the svc users go away.
1392          */
1393         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1394
1395         __ip_vs_del_service(svc);
1396
1397         write_unlock_bh(&__ip_vs_svc_lock);
1398 }
1399
1400 /*
1401  *      Delete a service from the service list
1402  */
1403 static int ip_vs_del_service(struct ip_vs_service *svc)
1404 {
1405         if (svc == NULL)
1406                 return -EEXIST;
1407         ip_vs_unlink_service(svc);
1408
1409         return 0;
1410 }
1411
1412
1413 /*
1414  *      Flush all the virtual services
1415  */
1416 static int ip_vs_flush(struct net *net)
1417 {
1418         int idx;
1419         struct ip_vs_service *svc, *nxt;
1420
1421         /*
1422          * Flush the service table hashed by <netns,protocol,addr,port>
1423          */
1424         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1425                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1426                                          s_list) {
1427                         if (net_eq(svc->net, net))
1428                                 ip_vs_unlink_service(svc);
1429                 }
1430         }
1431
1432         /*
1433          * Flush the service table hashed by fwmark
1434          */
1435         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1436                 list_for_each_entry_safe(svc, nxt,
1437                                          &ip_vs_svc_fwm_table[idx], f_list) {
1438                         if (net_eq(svc->net, net))
1439                                 ip_vs_unlink_service(svc);
1440                 }
1441         }
1442
1443         return 0;
1444 }
1445
1446
1447 /*
1448  *      Zero counters in a service or all services
1449  */
1450 static int ip_vs_zero_service(struct ip_vs_service *svc)
1451 {
1452         struct ip_vs_dest *dest;
1453
1454         write_lock_bh(&__ip_vs_svc_lock);
1455         list_for_each_entry(dest, &svc->destinations, n_list) {
1456                 ip_vs_zero_stats(&dest->stats);
1457         }
1458         ip_vs_zero_stats(&svc->stats);
1459         write_unlock_bh(&__ip_vs_svc_lock);
1460         return 0;
1461 }
1462
1463 static int ip_vs_zero_all(struct net *net)
1464 {
1465         int idx;
1466         struct ip_vs_service *svc;
1467
1468         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1469                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1470                         if (net_eq(svc->net, net))
1471                                 ip_vs_zero_service(svc);
1472                 }
1473         }
1474
1475         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1476                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1477                         if (net_eq(svc->net, net))
1478                                 ip_vs_zero_service(svc);
1479                 }
1480         }
1481
1482         ip_vs_zero_stats(net_ipvs(net)->tot_stats);
1483         return 0;
1484 }
1485
1486
1487 static int
1488 proc_do_defense_mode(ctl_table *table, int write,
1489                      void __user *buffer, size_t *lenp, loff_t *ppos)
1490 {
1491         struct net *net = current->nsproxy->net_ns;
1492         int *valp = table->data;
1493         int val = *valp;
1494         int rc;
1495
1496         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1497         if (write && (*valp != val)) {
1498                 if ((*valp < 0) || (*valp > 3)) {
1499                         /* Restore the correct value */
1500                         *valp = val;
1501                 } else {
1502                         update_defense_level(net_ipvs(net));
1503                 }
1504         }
1505         return rc;
1506 }
1507
1508
1509 static int
1510 proc_do_sync_threshold(ctl_table *table, int write,
1511                        void __user *buffer, size_t *lenp, loff_t *ppos)
1512 {
1513         int *valp = table->data;
1514         int val[2];
1515         int rc;
1516
1517         /* backup the value first */
1518         memcpy(val, valp, sizeof(val));
1519
1520         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1521         if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
1522                 /* Restore the correct value */
1523                 memcpy(valp, val, sizeof(val));
1524         }
1525         return rc;
1526 }
1527
1528 static int
1529 proc_do_sync_mode(ctl_table *table, int write,
1530                      void __user *buffer, size_t *lenp, loff_t *ppos)
1531 {
1532         int *valp = table->data;
1533         int val = *valp;
1534         int rc;
1535
1536         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1537         if (write && (*valp != val)) {
1538                 if ((*valp < 0) || (*valp > 1)) {
1539                         /* Restore the correct value */
1540                         *valp = val;
1541                 } else {
1542                         struct net *net = current->nsproxy->net_ns;
1543                         ip_vs_sync_switch_mode(net, val);
1544                 }
1545         }
1546         return rc;
1547 }
1548
1549 /*
1550  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1551  *      Do not change order or insert new entries without
1552  *      align with netns init in __ip_vs_control_init()
1553  */
1554
1555 static struct ctl_table vs_vars[] = {
1556         {
1557                 .procname       = "amemthresh",
1558                 .maxlen         = sizeof(int),
1559                 .mode           = 0644,
1560                 .proc_handler   = proc_dointvec,
1561         },
1562         {
1563                 .procname       = "am_droprate",
1564                 .maxlen         = sizeof(int),
1565                 .mode           = 0644,
1566                 .proc_handler   = proc_dointvec,
1567         },
1568         {
1569                 .procname       = "drop_entry",
1570                 .maxlen         = sizeof(int),
1571                 .mode           = 0644,
1572                 .proc_handler   = proc_do_defense_mode,
1573         },
1574         {
1575                 .procname       = "drop_packet",
1576                 .maxlen         = sizeof(int),
1577                 .mode           = 0644,
1578                 .proc_handler   = proc_do_defense_mode,
1579         },
1580 #ifdef CONFIG_IP_VS_NFCT
1581         {
1582                 .procname       = "conntrack",
1583                 .maxlen         = sizeof(int),
1584                 .mode           = 0644,
1585                 .proc_handler   = &proc_dointvec,
1586         },
1587 #endif
1588         {
1589                 .procname       = "secure_tcp",
1590                 .maxlen         = sizeof(int),
1591                 .mode           = 0644,
1592                 .proc_handler   = proc_do_defense_mode,
1593         },
1594         {
1595                 .procname       = "snat_reroute",
1596                 .maxlen         = sizeof(int),
1597                 .mode           = 0644,
1598                 .proc_handler   = &proc_dointvec,
1599         },
1600         {
1601                 .procname       = "sync_version",
1602                 .maxlen         = sizeof(int),
1603                 .mode           = 0644,
1604                 .proc_handler   = &proc_do_sync_mode,
1605         },
1606         {
1607                 .procname       = "cache_bypass",
1608                 .maxlen         = sizeof(int),
1609                 .mode           = 0644,
1610                 .proc_handler   = proc_dointvec,
1611         },
1612         {
1613                 .procname       = "expire_nodest_conn",
1614                 .maxlen         = sizeof(int),
1615                 .mode           = 0644,
1616                 .proc_handler   = proc_dointvec,
1617         },
1618         {
1619                 .procname       = "expire_quiescent_template",
1620                 .maxlen         = sizeof(int),
1621                 .mode           = 0644,
1622                 .proc_handler   = proc_dointvec,
1623         },
1624         {
1625                 .procname       = "sync_threshold",
1626                 .maxlen         =
1627                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1628                 .mode           = 0644,
1629                 .proc_handler   = proc_do_sync_threshold,
1630         },
1631         {
1632                 .procname       = "nat_icmp_send",
1633                 .maxlen         = sizeof(int),
1634                 .mode           = 0644,
1635                 .proc_handler   = proc_dointvec,
1636         },
1637 #ifdef CONFIG_IP_VS_DEBUG
1638         {
1639                 .procname       = "debug_level",
1640                 .data           = &sysctl_ip_vs_debug_level,
1641                 .maxlen         = sizeof(int),
1642                 .mode           = 0644,
1643                 .proc_handler   = proc_dointvec,
1644         },
1645 #endif
1646 #if 0
1647         {
1648                 .procname       = "timeout_established",
1649                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1650                 .maxlen         = sizeof(int),
1651                 .mode           = 0644,
1652                 .proc_handler   = proc_dointvec_jiffies,
1653         },
1654         {
1655                 .procname       = "timeout_synsent",
1656                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1657                 .maxlen         = sizeof(int),
1658                 .mode           = 0644,
1659                 .proc_handler   = proc_dointvec_jiffies,
1660         },
1661         {
1662                 .procname       = "timeout_synrecv",
1663                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1664                 .maxlen         = sizeof(int),
1665                 .mode           = 0644,
1666                 .proc_handler   = proc_dointvec_jiffies,
1667         },
1668         {
1669                 .procname       = "timeout_finwait",
1670                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1671                 .maxlen         = sizeof(int),
1672                 .mode           = 0644,
1673                 .proc_handler   = proc_dointvec_jiffies,
1674         },
1675         {
1676                 .procname       = "timeout_timewait",
1677                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1678                 .maxlen         = sizeof(int),
1679                 .mode           = 0644,
1680                 .proc_handler   = proc_dointvec_jiffies,
1681         },
1682         {
1683                 .procname       = "timeout_close",
1684                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1685                 .maxlen         = sizeof(int),
1686                 .mode           = 0644,
1687                 .proc_handler   = proc_dointvec_jiffies,
1688         },
1689         {
1690                 .procname       = "timeout_closewait",
1691                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1692                 .maxlen         = sizeof(int),
1693                 .mode           = 0644,
1694                 .proc_handler   = proc_dointvec_jiffies,
1695         },
1696         {
1697                 .procname       = "timeout_lastack",
1698                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1699                 .maxlen         = sizeof(int),
1700                 .mode           = 0644,
1701                 .proc_handler   = proc_dointvec_jiffies,
1702         },
1703         {
1704                 .procname       = "timeout_listen",
1705                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1706                 .maxlen         = sizeof(int),
1707                 .mode           = 0644,
1708                 .proc_handler   = proc_dointvec_jiffies,
1709         },
1710         {
1711                 .procname       = "timeout_synack",
1712                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1713                 .maxlen         = sizeof(int),
1714                 .mode           = 0644,
1715                 .proc_handler   = proc_dointvec_jiffies,
1716         },
1717         {
1718                 .procname       = "timeout_udp",
1719                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1720                 .maxlen         = sizeof(int),
1721                 .mode           = 0644,
1722                 .proc_handler   = proc_dointvec_jiffies,
1723         },
1724         {
1725                 .procname       = "timeout_icmp",
1726                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1727                 .maxlen         = sizeof(int),
1728                 .mode           = 0644,
1729                 .proc_handler   = proc_dointvec_jiffies,
1730         },
1731 #endif
1732         { }
1733 };
1734
1735 const struct ctl_path net_vs_ctl_path[] = {
1736         { .procname = "net", },
1737         { .procname = "ipv4", },
1738         { .procname = "vs", },
1739         { }
1740 };
1741 EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1742
1743 #ifdef CONFIG_PROC_FS
1744
1745 struct ip_vs_iter {
1746         struct seq_net_private p;  /* Do not move this, netns depends upon it*/
1747         struct list_head *table;
1748         int bucket;
1749 };
1750
1751 /*
1752  *      Write the contents of the VS rule table to a PROCfs file.
1753  *      (It is kept just for backward compatibility)
1754  */
1755 static inline const char *ip_vs_fwd_name(unsigned flags)
1756 {
1757         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1758         case IP_VS_CONN_F_LOCALNODE:
1759                 return "Local";
1760         case IP_VS_CONN_F_TUNNEL:
1761                 return "Tunnel";
1762         case IP_VS_CONN_F_DROUTE:
1763                 return "Route";
1764         default:
1765                 return "Masq";
1766         }
1767 }
1768
1769
1770 /* Get the Nth entry in the two lists */
1771 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1772 {
1773         struct net *net = seq_file_net(seq);
1774         struct ip_vs_iter *iter = seq->private;
1775         int idx;
1776         struct ip_vs_service *svc;
1777
1778         /* look in hash by protocol */
1779         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1780                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1781                         if (net_eq(svc->net, net) && pos-- == 0) {
1782                                 iter->table = ip_vs_svc_table;
1783                                 iter->bucket = idx;
1784                                 return svc;
1785                         }
1786                 }
1787         }
1788
1789         /* keep looking in fwmark */
1790         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1791                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1792                         if (net_eq(svc->net, net) && pos-- == 0) {
1793                                 iter->table = ip_vs_svc_fwm_table;
1794                                 iter->bucket = idx;
1795                                 return svc;
1796                         }
1797                 }
1798         }
1799
1800         return NULL;
1801 }
1802
1803 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1804 __acquires(__ip_vs_svc_lock)
1805 {
1806
1807         read_lock_bh(&__ip_vs_svc_lock);
1808         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1809 }
1810
1811
1812 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1813 {
1814         struct list_head *e;
1815         struct ip_vs_iter *iter;
1816         struct ip_vs_service *svc;
1817
1818         ++*pos;
1819         if (v == SEQ_START_TOKEN)
1820                 return ip_vs_info_array(seq,0);
1821
1822         svc = v;
1823         iter = seq->private;
1824
1825         if (iter->table == ip_vs_svc_table) {
1826                 /* next service in table hashed by protocol */
1827                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1828                         return list_entry(e, struct ip_vs_service, s_list);
1829
1830
1831                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1832                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
1833                                             s_list) {
1834                                 return svc;
1835                         }
1836                 }
1837
1838                 iter->table = ip_vs_svc_fwm_table;
1839                 iter->bucket = -1;
1840                 goto scan_fwmark;
1841         }
1842
1843         /* next service in hashed by fwmark */
1844         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
1845                 return list_entry(e, struct ip_vs_service, f_list);
1846
1847  scan_fwmark:
1848         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
1849                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
1850                                     f_list)
1851                         return svc;
1852         }
1853
1854         return NULL;
1855 }
1856
1857 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
1858 __releases(__ip_vs_svc_lock)
1859 {
1860         read_unlock_bh(&__ip_vs_svc_lock);
1861 }
1862
1863
1864 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
1865 {
1866         if (v == SEQ_START_TOKEN) {
1867                 seq_printf(seq,
1868                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
1869                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
1870                 seq_puts(seq,
1871                          "Prot LocalAddress:Port Scheduler Flags\n");
1872                 seq_puts(seq,
1873                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
1874         } else {
1875                 const struct ip_vs_service *svc = v;
1876                 const struct ip_vs_iter *iter = seq->private;
1877                 const struct ip_vs_dest *dest;
1878
1879                 if (iter->table == ip_vs_svc_table) {
1880 #ifdef CONFIG_IP_VS_IPV6
1881                         if (svc->af == AF_INET6)
1882                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
1883                                            ip_vs_proto_name(svc->protocol),
1884                                            &svc->addr.in6,
1885                                            ntohs(svc->port),
1886                                            svc->scheduler->name);
1887                         else
1888 #endif
1889                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
1890                                            ip_vs_proto_name(svc->protocol),
1891                                            ntohl(svc->addr.ip),
1892                                            ntohs(svc->port),
1893                                            svc->scheduler->name,
1894                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1895                 } else {
1896                         seq_printf(seq, "FWM  %08X %s %s",
1897                                    svc->fwmark, svc->scheduler->name,
1898                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
1899                 }
1900
1901                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
1902                         seq_printf(seq, "persistent %d %08X\n",
1903                                 svc->timeout,
1904                                 ntohl(svc->netmask));
1905                 else
1906                         seq_putc(seq, '\n');
1907
1908                 list_for_each_entry(dest, &svc->destinations, n_list) {
1909 #ifdef CONFIG_IP_VS_IPV6
1910                         if (dest->af == AF_INET6)
1911                                 seq_printf(seq,
1912                                            "  -> [%pI6]:%04X"
1913                                            "      %-7s %-6d %-10d %-10d\n",
1914                                            &dest->addr.in6,
1915                                            ntohs(dest->port),
1916                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1917                                            atomic_read(&dest->weight),
1918                                            atomic_read(&dest->activeconns),
1919                                            atomic_read(&dest->inactconns));
1920                         else
1921 #endif
1922                                 seq_printf(seq,
1923                                            "  -> %08X:%04X      "
1924                                            "%-7s %-6d %-10d %-10d\n",
1925                                            ntohl(dest->addr.ip),
1926                                            ntohs(dest->port),
1927                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
1928                                            atomic_read(&dest->weight),
1929                                            atomic_read(&dest->activeconns),
1930                                            atomic_read(&dest->inactconns));
1931
1932                 }
1933         }
1934         return 0;
1935 }
1936
1937 static const struct seq_operations ip_vs_info_seq_ops = {
1938         .start = ip_vs_info_seq_start,
1939         .next  = ip_vs_info_seq_next,
1940         .stop  = ip_vs_info_seq_stop,
1941         .show  = ip_vs_info_seq_show,
1942 };
1943
1944 static int ip_vs_info_open(struct inode *inode, struct file *file)
1945 {
1946         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
1947                         sizeof(struct ip_vs_iter));
1948 }
1949
1950 static const struct file_operations ip_vs_info_fops = {
1951         .owner   = THIS_MODULE,
1952         .open    = ip_vs_info_open,
1953         .read    = seq_read,
1954         .llseek  = seq_lseek,
1955         .release = seq_release_private,
1956 };
1957
1958 #endif
1959
1960 #ifdef CONFIG_PROC_FS
1961 static int ip_vs_stats_show(struct seq_file *seq, void *v)
1962 {
1963         struct net *net = seq_file_single_net(seq);
1964         struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
1965
1966 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
1967         seq_puts(seq,
1968                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
1969         seq_printf(seq,
1970                    "   Conns  Packets  Packets            Bytes            Bytes\n");
1971
1972         spin_lock_bh(&tot_stats->lock);
1973         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns,
1974                    tot_stats->ustats.inpkts, tot_stats->ustats.outpkts,
1975                    (unsigned long long) tot_stats->ustats.inbytes,
1976                    (unsigned long long) tot_stats->ustats.outbytes);
1977
1978 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1979         seq_puts(seq,
1980                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
1981         seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1982                         tot_stats->ustats.cps,
1983                         tot_stats->ustats.inpps,
1984                         tot_stats->ustats.outpps,
1985                         tot_stats->ustats.inbps,
1986                         tot_stats->ustats.outbps);
1987         spin_unlock_bh(&tot_stats->lock);
1988
1989         return 0;
1990 }
1991
1992 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1993 {
1994         return single_open_net(inode, file, ip_vs_stats_show);
1995 }
1996
1997 static const struct file_operations ip_vs_stats_fops = {
1998         .owner = THIS_MODULE,
1999         .open = ip_vs_stats_seq_open,
2000         .read = seq_read,
2001         .llseek = seq_lseek,
2002         .release = single_release,
2003 };
2004
2005 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2006 {
2007         struct net *net = seq_file_single_net(seq);
2008         struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
2009         int i;
2010
2011 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2012         seq_puts(seq,
2013                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2014         seq_printf(seq,
2015                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2016
2017         for_each_possible_cpu(i) {
2018                 struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i);
2019                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2020                             i, u->ustats.conns, u->ustats.inpkts,
2021                             u->ustats.outpkts, (__u64)u->ustats.inbytes,
2022                             (__u64)u->ustats.outbytes);
2023         }
2024
2025         spin_lock_bh(&tot_stats->lock);
2026         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2027                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2028                    tot_stats->ustats.outpkts,
2029                    (unsigned long long) tot_stats->ustats.inbytes,
2030                    (unsigned long long) tot_stats->ustats.outbytes);
2031
2032 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2033         seq_puts(seq,
2034                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2035         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2036                         tot_stats->ustats.cps,
2037                         tot_stats->ustats.inpps,
2038                         tot_stats->ustats.outpps,
2039                         tot_stats->ustats.inbps,
2040                         tot_stats->ustats.outbps);
2041         spin_unlock_bh(&tot_stats->lock);
2042
2043         return 0;
2044 }
2045
2046 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2047 {
2048         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2049 }
2050
2051 static const struct file_operations ip_vs_stats_percpu_fops = {
2052         .owner = THIS_MODULE,
2053         .open = ip_vs_stats_percpu_seq_open,
2054         .read = seq_read,
2055         .llseek = seq_lseek,
2056         .release = single_release,
2057 };
2058 #endif
2059
2060 /*
2061  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2062  */
2063 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2064 {
2065         struct ip_vs_proto_data *pd;
2066
2067         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2068                   u->tcp_timeout,
2069                   u->tcp_fin_timeout,
2070                   u->udp_timeout);
2071
2072 #ifdef CONFIG_IP_VS_PROTO_TCP
2073         if (u->tcp_timeout) {
2074                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2075                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2076                         = u->tcp_timeout * HZ;
2077         }
2078
2079         if (u->tcp_fin_timeout) {
2080                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2081                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2082                         = u->tcp_fin_timeout * HZ;
2083         }
2084 #endif
2085
2086 #ifdef CONFIG_IP_VS_PROTO_UDP
2087         if (u->udp_timeout) {
2088                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2089                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2090                         = u->udp_timeout * HZ;
2091         }
2092 #endif
2093         return 0;
2094 }
2095
2096
2097 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2098 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2099 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2100                                  sizeof(struct ip_vs_dest_user))
2101 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2102 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2103 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2104
2105 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2106         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2107         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2108         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2109         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2110         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2111         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2112         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2113         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2114         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2115         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2116         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2117 };
2118
2119 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2120                                   struct ip_vs_service_user *usvc_compat)
2121 {
2122         memset(usvc, 0, sizeof(*usvc));
2123
2124         usvc->af                = AF_INET;
2125         usvc->protocol          = usvc_compat->protocol;
2126         usvc->addr.ip           = usvc_compat->addr;
2127         usvc->port              = usvc_compat->port;
2128         usvc->fwmark            = usvc_compat->fwmark;
2129
2130         /* Deep copy of sched_name is not needed here */
2131         usvc->sched_name        = usvc_compat->sched_name;
2132
2133         usvc->flags             = usvc_compat->flags;
2134         usvc->timeout           = usvc_compat->timeout;
2135         usvc->netmask           = usvc_compat->netmask;
2136 }
2137
2138 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2139                                    struct ip_vs_dest_user *udest_compat)
2140 {
2141         memset(udest, 0, sizeof(*udest));
2142
2143         udest->addr.ip          = udest_compat->addr;
2144         udest->port             = udest_compat->port;
2145         udest->conn_flags       = udest_compat->conn_flags;
2146         udest->weight           = udest_compat->weight;
2147         udest->u_threshold      = udest_compat->u_threshold;
2148         udest->l_threshold      = udest_compat->l_threshold;
2149 }
2150
2151 static int
2152 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2153 {
2154         struct net *net = sock_net(sk);
2155         int ret;
2156         unsigned char arg[MAX_ARG_LEN];
2157         struct ip_vs_service_user *usvc_compat;
2158         struct ip_vs_service_user_kern usvc;
2159         struct ip_vs_service *svc;
2160         struct ip_vs_dest_user *udest_compat;
2161         struct ip_vs_dest_user_kern udest;
2162
2163         if (!capable(CAP_NET_ADMIN))
2164                 return -EPERM;
2165
2166         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2167                 return -EINVAL;
2168         if (len < 0 || len >  MAX_ARG_LEN)
2169                 return -EINVAL;
2170         if (len != set_arglen[SET_CMDID(cmd)]) {
2171                 pr_err("set_ctl: len %u != %u\n",
2172                        len, set_arglen[SET_CMDID(cmd)]);
2173                 return -EINVAL;
2174         }
2175
2176         if (copy_from_user(arg, user, len) != 0)
2177                 return -EFAULT;
2178
2179         /* increase the module use count */
2180         ip_vs_use_count_inc();
2181
2182         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2183                 ret = -ERESTARTSYS;
2184                 goto out_dec;
2185         }
2186
2187         if (cmd == IP_VS_SO_SET_FLUSH) {
2188                 /* Flush the virtual service */
2189                 ret = ip_vs_flush(net);
2190                 goto out_unlock;
2191         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2192                 /* Set timeout values for (tcp tcpfin udp) */
2193                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2194                 goto out_unlock;
2195         } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2196                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2197                 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2198                                         dm->syncid);
2199                 goto out_unlock;
2200         } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2201                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2202                 ret = stop_sync_thread(net, dm->state);
2203                 goto out_unlock;
2204         }
2205
2206         usvc_compat = (struct ip_vs_service_user *)arg;
2207         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2208
2209         /* We only use the new structs internally, so copy userspace compat
2210          * structs to extended internal versions */
2211         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2212         ip_vs_copy_udest_compat(&udest, udest_compat);
2213
2214         if (cmd == IP_VS_SO_SET_ZERO) {
2215                 /* if no service address is set, zero counters in all */
2216                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2217                         ret = ip_vs_zero_all(net);
2218                         goto out_unlock;
2219                 }
2220         }
2221
2222         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2223         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2224             usvc.protocol != IPPROTO_SCTP) {
2225                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2226                        usvc.protocol, &usvc.addr.ip,
2227                        ntohs(usvc.port), usvc.sched_name);
2228                 ret = -EFAULT;
2229                 goto out_unlock;
2230         }
2231
2232         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2233         if (usvc.fwmark == 0)
2234                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2235                                            &usvc.addr, usvc.port);
2236         else
2237                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2238
2239         if (cmd != IP_VS_SO_SET_ADD
2240             && (svc == NULL || svc->protocol != usvc.protocol)) {
2241                 ret = -ESRCH;
2242                 goto out_unlock;
2243         }
2244
2245         switch (cmd) {
2246         case IP_VS_SO_SET_ADD:
2247                 if (svc != NULL)
2248                         ret = -EEXIST;
2249                 else
2250                         ret = ip_vs_add_service(net, &usvc, &svc);
2251                 break;
2252         case IP_VS_SO_SET_EDIT:
2253                 ret = ip_vs_edit_service(svc, &usvc);
2254                 break;
2255         case IP_VS_SO_SET_DEL:
2256                 ret = ip_vs_del_service(svc);
2257                 if (!ret)
2258                         goto out_unlock;
2259                 break;
2260         case IP_VS_SO_SET_ZERO:
2261                 ret = ip_vs_zero_service(svc);
2262                 break;
2263         case IP_VS_SO_SET_ADDDEST:
2264                 ret = ip_vs_add_dest(svc, &udest);
2265                 break;
2266         case IP_VS_SO_SET_EDITDEST:
2267                 ret = ip_vs_edit_dest(svc, &udest);
2268                 break;
2269         case IP_VS_SO_SET_DELDEST:
2270                 ret = ip_vs_del_dest(svc, &udest);
2271                 break;
2272         default:
2273                 ret = -EINVAL;
2274         }
2275
2276   out_unlock:
2277         mutex_unlock(&__ip_vs_mutex);
2278   out_dec:
2279         /* decrease the module use count */
2280         ip_vs_use_count_dec();
2281
2282         return ret;
2283 }
2284
2285
2286 static void
2287 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
2288 {
2289         spin_lock_bh(&src->lock);
2290         memcpy(dst, &src->ustats, sizeof(*dst));
2291         spin_unlock_bh(&src->lock);
2292 }
2293
2294 static void
2295 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2296 {
2297         dst->protocol = src->protocol;
2298         dst->addr = src->addr.ip;
2299         dst->port = src->port;
2300         dst->fwmark = src->fwmark;
2301         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2302         dst->flags = src->flags;
2303         dst->timeout = src->timeout / HZ;
2304         dst->netmask = src->netmask;
2305         dst->num_dests = src->num_dests;
2306         ip_vs_copy_stats(&dst->stats, &src->stats);
2307 }
2308
2309 static inline int
2310 __ip_vs_get_service_entries(struct net *net,
2311                             const struct ip_vs_get_services *get,
2312                             struct ip_vs_get_services __user *uptr)
2313 {
2314         int idx, count=0;
2315         struct ip_vs_service *svc;
2316         struct ip_vs_service_entry entry;
2317         int ret = 0;
2318
2319         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2320                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2321                         /* Only expose IPv4 entries to old interface */
2322                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2323                                 continue;
2324
2325                         if (count >= get->num_services)
2326                                 goto out;
2327                         memset(&entry, 0, sizeof(entry));
2328                         ip_vs_copy_service(&entry, svc);
2329                         if (copy_to_user(&uptr->entrytable[count],
2330                                          &entry, sizeof(entry))) {
2331                                 ret = -EFAULT;
2332                                 goto out;
2333                         }
2334                         count++;
2335                 }
2336         }
2337
2338         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2339                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2340                         /* Only expose IPv4 entries to old interface */
2341                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2342                                 continue;
2343
2344                         if (count >= get->num_services)
2345                                 goto out;
2346                         memset(&entry, 0, sizeof(entry));
2347                         ip_vs_copy_service(&entry, svc);
2348                         if (copy_to_user(&uptr->entrytable[count],
2349                                          &entry, sizeof(entry))) {
2350                                 ret = -EFAULT;
2351                                 goto out;
2352                         }
2353                         count++;
2354                 }
2355         }
2356   out:
2357         return ret;
2358 }
2359
2360 static inline int
2361 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2362                          struct ip_vs_get_dests __user *uptr)
2363 {
2364         struct ip_vs_service *svc;
2365         union nf_inet_addr addr = { .ip = get->addr };
2366         int ret = 0;
2367
2368         if (get->fwmark)
2369                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2370         else
2371                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2372                                            get->port);
2373
2374         if (svc) {
2375                 int count = 0;
2376                 struct ip_vs_dest *dest;
2377                 struct ip_vs_dest_entry entry;
2378
2379                 list_for_each_entry(dest, &svc->destinations, n_list) {
2380                         if (count >= get->num_dests)
2381                                 break;
2382
2383                         entry.addr = dest->addr.ip;
2384                         entry.port = dest->port;
2385                         entry.conn_flags = atomic_read(&dest->conn_flags);
2386                         entry.weight = atomic_read(&dest->weight);
2387                         entry.u_threshold = dest->u_threshold;
2388                         entry.l_threshold = dest->l_threshold;
2389                         entry.activeconns = atomic_read(&dest->activeconns);
2390                         entry.inactconns = atomic_read(&dest->inactconns);
2391                         entry.persistconns = atomic_read(&dest->persistconns);
2392                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2393                         if (copy_to_user(&uptr->entrytable[count],
2394                                          &entry, sizeof(entry))) {
2395                                 ret = -EFAULT;
2396                                 break;
2397                         }
2398                         count++;
2399                 }
2400         } else
2401                 ret = -ESRCH;
2402         return ret;
2403 }
2404
2405 static inline void
2406 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2407 {
2408         struct ip_vs_proto_data *pd;
2409
2410 #ifdef CONFIG_IP_VS_PROTO_TCP
2411         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2412         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2413         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2414 #endif
2415 #ifdef CONFIG_IP_VS_PROTO_UDP
2416         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2417         u->udp_timeout =
2418                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2419 #endif
2420 }
2421
2422
2423 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2424 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2425 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2426 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2427 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2428 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2429 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2430
2431 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2432         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2433         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2434         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2435         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2436         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2437         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2438         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2439 };
2440
2441 static int
2442 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2443 {
2444         unsigned char arg[128];
2445         int ret = 0;
2446         unsigned int copylen;
2447         struct net *net = sock_net(sk);
2448         struct netns_ipvs *ipvs = net_ipvs(net);
2449
2450         BUG_ON(!net);
2451         if (!capable(CAP_NET_ADMIN))
2452                 return -EPERM;
2453
2454         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2455                 return -EINVAL;
2456
2457         if (*len < get_arglen[GET_CMDID(cmd)]) {
2458                 pr_err("get_ctl: len %u < %u\n",
2459                        *len, get_arglen[GET_CMDID(cmd)]);
2460                 return -EINVAL;
2461         }
2462
2463         copylen = get_arglen[GET_CMDID(cmd)];
2464         if (copylen > 128)
2465                 return -EINVAL;
2466
2467         if (copy_from_user(arg, user, copylen) != 0)
2468                 return -EFAULT;
2469
2470         if (mutex_lock_interruptible(&__ip_vs_mutex))
2471                 return -ERESTARTSYS;
2472
2473         switch (cmd) {
2474         case IP_VS_SO_GET_VERSION:
2475         {
2476                 char buf[64];
2477
2478                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2479                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2480                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2481                         ret = -EFAULT;
2482                         goto out;
2483                 }
2484                 *len = strlen(buf)+1;
2485         }
2486         break;
2487
2488         case IP_VS_SO_GET_INFO:
2489         {
2490                 struct ip_vs_getinfo info;
2491                 info.version = IP_VS_VERSION_CODE;
2492                 info.size = ip_vs_conn_tab_size;
2493                 info.num_services = ipvs->num_services;
2494                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2495                         ret = -EFAULT;
2496         }
2497         break;
2498
2499         case IP_VS_SO_GET_SERVICES:
2500         {
2501                 struct ip_vs_get_services *get;
2502                 int size;
2503
2504                 get = (struct ip_vs_get_services *)arg;
2505                 size = sizeof(*get) +
2506                         sizeof(struct ip_vs_service_entry) * get->num_services;
2507                 if (*len != size) {
2508                         pr_err("length: %u != %u\n", *len, size);
2509                         ret = -EINVAL;
2510                         goto out;
2511                 }
2512                 ret = __ip_vs_get_service_entries(net, get, user);
2513         }
2514         break;
2515
2516         case IP_VS_SO_GET_SERVICE:
2517         {
2518                 struct ip_vs_service_entry *entry;
2519                 struct ip_vs_service *svc;
2520                 union nf_inet_addr addr;
2521
2522                 entry = (struct ip_vs_service_entry *)arg;
2523                 addr.ip = entry->addr;
2524                 if (entry->fwmark)
2525                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2526                 else
2527                         svc = __ip_vs_service_find(net, AF_INET,
2528                                                    entry->protocol, &addr,
2529                                                    entry->port);
2530                 if (svc) {
2531                         ip_vs_copy_service(entry, svc);
2532                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2533                                 ret = -EFAULT;
2534                 } else
2535                         ret = -ESRCH;
2536         }
2537         break;
2538
2539         case IP_VS_SO_GET_DESTS:
2540         {
2541                 struct ip_vs_get_dests *get;
2542                 int size;
2543
2544                 get = (struct ip_vs_get_dests *)arg;
2545                 size = sizeof(*get) +
2546                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2547                 if (*len != size) {
2548                         pr_err("length: %u != %u\n", *len, size);
2549                         ret = -EINVAL;
2550                         goto out;
2551                 }
2552                 ret = __ip_vs_get_dest_entries(net, get, user);
2553         }
2554         break;
2555
2556         case IP_VS_SO_GET_TIMEOUT:
2557         {
2558                 struct ip_vs_timeout_user t;
2559
2560                 __ip_vs_get_timeouts(net, &t);
2561                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2562                         ret = -EFAULT;
2563         }
2564         break;
2565
2566         case IP_VS_SO_GET_DAEMON:
2567         {
2568                 struct ip_vs_daemon_user d[2];
2569
2570                 memset(&d, 0, sizeof(d));
2571                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2572                         d[0].state = IP_VS_STATE_MASTER;
2573                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2574                                 sizeof(d[0].mcast_ifn));
2575                         d[0].syncid = ipvs->master_syncid;
2576                 }
2577                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2578                         d[1].state = IP_VS_STATE_BACKUP;
2579                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2580                                 sizeof(d[1].mcast_ifn));
2581                         d[1].syncid = ipvs->backup_syncid;
2582                 }
2583                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2584                         ret = -EFAULT;
2585         }
2586         break;
2587
2588         default:
2589                 ret = -EINVAL;
2590         }
2591
2592   out:
2593         mutex_unlock(&__ip_vs_mutex);
2594         return ret;
2595 }
2596
2597
2598 static struct nf_sockopt_ops ip_vs_sockopts = {
2599         .pf             = PF_INET,
2600         .set_optmin     = IP_VS_BASE_CTL,
2601         .set_optmax     = IP_VS_SO_SET_MAX+1,
2602         .set            = do_ip_vs_set_ctl,
2603         .get_optmin     = IP_VS_BASE_CTL,
2604         .get_optmax     = IP_VS_SO_GET_MAX+1,
2605         .get            = do_ip_vs_get_ctl,
2606         .owner          = THIS_MODULE,
2607 };
2608
2609 /*
2610  * Generic Netlink interface
2611  */
2612
2613 /* IPVS genetlink family */
2614 static struct genl_family ip_vs_genl_family = {
2615         .id             = GENL_ID_GENERATE,
2616         .hdrsize        = 0,
2617         .name           = IPVS_GENL_NAME,
2618         .version        = IPVS_GENL_VERSION,
2619         .maxattr        = IPVS_CMD_MAX,
2620         .netnsok        = true,         /* Make ipvsadm to work on netns */
2621 };
2622
2623 /* Policy used for first-level command attributes */
2624 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2625         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2626         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2627         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2628         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2629         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2630         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2631 };
2632
2633 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2634 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2635         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2636         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2637                                             .len = IP_VS_IFNAME_MAXLEN },
2638         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2639 };
2640
2641 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2642 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2643         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2644         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2645         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2646                                             .len = sizeof(union nf_inet_addr) },
2647         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2648         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2649         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2650                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2651         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2652                                             .len = IP_VS_PENAME_MAXLEN },
2653         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2654                                             .len = sizeof(struct ip_vs_flags) },
2655         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2656         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2657         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2658 };
2659
2660 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2661 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2662         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2663                                             .len = sizeof(union nf_inet_addr) },
2664         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2665         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2666         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2667         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2668         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2669         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2670         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2671         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2672         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2673 };
2674
2675 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2676                                  struct ip_vs_stats *stats)
2677 {
2678         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2679         if (!nl_stats)
2680                 return -EMSGSIZE;
2681
2682         spin_lock_bh(&stats->lock);
2683
2684         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns);
2685         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts);
2686         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts);
2687         NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes);
2688         NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes);
2689         NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps);
2690         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps);
2691         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps);
2692         NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps);
2693         NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps);
2694
2695         spin_unlock_bh(&stats->lock);
2696
2697         nla_nest_end(skb, nl_stats);
2698
2699         return 0;
2700
2701 nla_put_failure:
2702         spin_unlock_bh(&stats->lock);
2703         nla_nest_cancel(skb, nl_stats);
2704         return -EMSGSIZE;
2705 }
2706
2707 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2708                                    struct ip_vs_service *svc)
2709 {
2710         struct nlattr *nl_service;
2711         struct ip_vs_flags flags = { .flags = svc->flags,
2712                                      .mask = ~0 };
2713
2714         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2715         if (!nl_service)
2716                 return -EMSGSIZE;
2717
2718         NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af);
2719
2720         if (svc->fwmark) {
2721                 NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark);
2722         } else {
2723                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol);
2724                 NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr);
2725                 NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port);
2726         }
2727
2728         NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name);
2729         if (svc->pe)
2730                 NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name);
2731         NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags);
2732         NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ);
2733         NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask);
2734
2735         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2736                 goto nla_put_failure;
2737
2738         nla_nest_end(skb, nl_service);
2739
2740         return 0;
2741
2742 nla_put_failure:
2743         nla_nest_cancel(skb, nl_service);
2744         return -EMSGSIZE;
2745 }
2746
2747 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2748                                    struct ip_vs_service *svc,
2749                                    struct netlink_callback *cb)
2750 {
2751         void *hdr;
2752
2753         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2754                           &ip_vs_genl_family, NLM_F_MULTI,
2755                           IPVS_CMD_NEW_SERVICE);
2756         if (!hdr)
2757                 return -EMSGSIZE;
2758
2759         if (ip_vs_genl_fill_service(skb, svc) < 0)
2760                 goto nla_put_failure;
2761
2762         return genlmsg_end(skb, hdr);
2763
2764 nla_put_failure:
2765         genlmsg_cancel(skb, hdr);
2766         return -EMSGSIZE;
2767 }
2768
2769 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2770                                     struct netlink_callback *cb)
2771 {
2772         int idx = 0, i;
2773         int start = cb->args[0];
2774         struct ip_vs_service *svc;
2775         struct net *net = skb_sknet(skb);
2776
2777         mutex_lock(&__ip_vs_mutex);
2778         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2779                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2780                         if (++idx <= start || !net_eq(svc->net, net))
2781                                 continue;
2782                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2783                                 idx--;
2784                                 goto nla_put_failure;
2785                         }
2786                 }
2787         }
2788
2789         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2790                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2791                         if (++idx <= start || !net_eq(svc->net, net))
2792                                 continue;
2793                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2794                                 idx--;
2795                                 goto nla_put_failure;
2796                         }
2797                 }
2798         }
2799
2800 nla_put_failure:
2801         mutex_unlock(&__ip_vs_mutex);
2802         cb->args[0] = idx;
2803
2804         return skb->len;
2805 }
2806
2807 static int ip_vs_genl_parse_service(struct net *net,
2808                                     struct ip_vs_service_user_kern *usvc,
2809                                     struct nlattr *nla, int full_entry,
2810                                     struct ip_vs_service **ret_svc)
2811 {
2812         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
2813         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
2814         struct ip_vs_service *svc;
2815
2816         /* Parse mandatory identifying service fields first */
2817         if (nla == NULL ||
2818             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
2819                 return -EINVAL;
2820
2821         nla_af          = attrs[IPVS_SVC_ATTR_AF];
2822         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
2823         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
2824         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
2825         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
2826
2827         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
2828                 return -EINVAL;
2829
2830         memset(usvc, 0, sizeof(*usvc));
2831
2832         usvc->af = nla_get_u16(nla_af);
2833 #ifdef CONFIG_IP_VS_IPV6
2834         if (usvc->af != AF_INET && usvc->af != AF_INET6)
2835 #else
2836         if (usvc->af != AF_INET)
2837 #endif
2838                 return -EAFNOSUPPORT;
2839
2840         if (nla_fwmark) {
2841                 usvc->protocol = IPPROTO_TCP;
2842                 usvc->fwmark = nla_get_u32(nla_fwmark);
2843         } else {
2844                 usvc->protocol = nla_get_u16(nla_protocol);
2845                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
2846                 usvc->port = nla_get_u16(nla_port);
2847                 usvc->fwmark = 0;
2848         }
2849
2850         if (usvc->fwmark)
2851                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
2852         else
2853                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
2854                                            &usvc->addr, usvc->port);
2855         *ret_svc = svc;
2856
2857         /* If a full entry was requested, check for the additional fields */
2858         if (full_entry) {
2859                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
2860                               *nla_netmask;
2861                 struct ip_vs_flags flags;
2862
2863                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
2864                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
2865                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
2866                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
2867                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
2868
2869                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
2870                         return -EINVAL;
2871
2872                 nla_memcpy(&flags, nla_flags, sizeof(flags));
2873
2874                 /* prefill flags from service if it already exists */
2875                 if (svc)
2876                         usvc->flags = svc->flags;
2877
2878                 /* set new flags from userland */
2879                 usvc->flags = (usvc->flags & ~flags.mask) |
2880                               (flags.flags & flags.mask);
2881                 usvc->sched_name = nla_data(nla_sched);
2882                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
2883                 usvc->timeout = nla_get_u32(nla_timeout);
2884                 usvc->netmask = nla_get_u32(nla_netmask);
2885         }
2886
2887         return 0;
2888 }
2889
2890 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
2891                                                      struct nlattr *nla)
2892 {
2893         struct ip_vs_service_user_kern usvc;
2894         struct ip_vs_service *svc;
2895         int ret;
2896
2897         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
2898         return ret ? ERR_PTR(ret) : svc;
2899 }
2900
2901 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
2902 {
2903         struct nlattr *nl_dest;
2904
2905         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
2906         if (!nl_dest)
2907                 return -EMSGSIZE;
2908
2909         NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr);
2910         NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port);
2911
2912         NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD,
2913                     atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK);
2914         NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight));
2915         NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold);
2916         NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold);
2917         NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
2918                     atomic_read(&dest->activeconns));
2919         NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS,
2920                     atomic_read(&dest->inactconns));
2921         NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
2922                     atomic_read(&dest->persistconns));
2923
2924         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
2925                 goto nla_put_failure;
2926
2927         nla_nest_end(skb, nl_dest);
2928
2929         return 0;
2930
2931 nla_put_failure:
2932         nla_nest_cancel(skb, nl_dest);
2933         return -EMSGSIZE;
2934 }
2935
2936 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
2937                                 struct netlink_callback *cb)
2938 {
2939         void *hdr;
2940
2941         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
2942                           &ip_vs_genl_family, NLM_F_MULTI,
2943                           IPVS_CMD_NEW_DEST);
2944         if (!hdr)
2945                 return -EMSGSIZE;
2946
2947         if (ip_vs_genl_fill_dest(skb, dest) < 0)
2948                 goto nla_put_failure;
2949
2950         return genlmsg_end(skb, hdr);
2951
2952 nla_put_failure:
2953         genlmsg_cancel(skb, hdr);
2954         return -EMSGSIZE;
2955 }
2956
2957 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2958                                  struct netlink_callback *cb)
2959 {
2960         int idx = 0;
2961         int start = cb->args[0];
2962         struct ip_vs_service *svc;
2963         struct ip_vs_dest *dest;
2964         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2965         struct net *net = skb_sknet(skb);
2966
2967         mutex_lock(&__ip_vs_mutex);
2968
2969         /* Try to find the service for which to dump destinations */
2970         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
2971                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
2972                 goto out_err;
2973
2974
2975         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
2976         if (IS_ERR(svc) || svc == NULL)
2977                 goto out_err;
2978
2979         /* Dump the destinations */
2980         list_for_each_entry(dest, &svc->destinations, n_list) {
2981                 if (++idx <= start)
2982                         continue;
2983                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
2984                         idx--;
2985                         goto nla_put_failure;
2986                 }
2987         }
2988
2989 nla_put_failure:
2990         cb->args[0] = idx;
2991
2992 out_err:
2993         mutex_unlock(&__ip_vs_mutex);
2994
2995         return skb->len;
2996 }
2997
2998 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
2999                                  struct nlattr *nla, int full_entry)
3000 {
3001         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3002         struct nlattr *nla_addr, *nla_port;
3003
3004         /* Parse mandatory identifying destination fields first */
3005         if (nla == NULL ||
3006             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3007                 return -EINVAL;
3008
3009         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3010         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3011
3012         if (!(nla_addr && nla_port))
3013                 return -EINVAL;
3014
3015         memset(udest, 0, sizeof(*udest));
3016
3017         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3018         udest->port = nla_get_u16(nla_port);
3019
3020         /* If a full entry was requested, check for the additional fields */
3021         if (full_entry) {
3022                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3023                               *nla_l_thresh;
3024
3025                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3026                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3027                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3028                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3029
3030                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3031                         return -EINVAL;
3032
3033                 udest->conn_flags = nla_get_u32(nla_fwd)
3034                                     & IP_VS_CONN_F_FWD_MASK;
3035                 udest->weight = nla_get_u32(nla_weight);
3036                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3037                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3038         }
3039
3040         return 0;
3041 }
3042
3043 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3044                                   const char *mcast_ifn, __be32 syncid)
3045 {
3046         struct nlattr *nl_daemon;
3047
3048         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3049         if (!nl_daemon)
3050                 return -EMSGSIZE;
3051
3052         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state);
3053         NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn);
3054         NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid);
3055
3056         nla_nest_end(skb, nl_daemon);
3057
3058         return 0;
3059
3060 nla_put_failure:
3061         nla_nest_cancel(skb, nl_daemon);
3062         return -EMSGSIZE;
3063 }
3064
3065 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3066                                   const char *mcast_ifn, __be32 syncid,
3067                                   struct netlink_callback *cb)
3068 {
3069         void *hdr;
3070         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq,
3071                           &ip_vs_genl_family, NLM_F_MULTI,
3072                           IPVS_CMD_NEW_DAEMON);
3073         if (!hdr)
3074                 return -EMSGSIZE;
3075
3076         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3077                 goto nla_put_failure;
3078
3079         return genlmsg_end(skb, hdr);
3080
3081 nla_put_failure:
3082         genlmsg_cancel(skb, hdr);
3083         return -EMSGSIZE;
3084 }
3085
3086 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3087                                    struct netlink_callback *cb)
3088 {
3089         struct net *net = skb_net(skb);
3090         struct netns_ipvs *ipvs = net_ipvs(net);
3091
3092         mutex_lock(&__ip_vs_mutex);
3093         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3094                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3095                                            ipvs->master_mcast_ifn,
3096                                            ipvs->master_syncid, cb) < 0)
3097                         goto nla_put_failure;
3098
3099                 cb->args[0] = 1;
3100         }
3101
3102         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3103                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3104                                            ipvs->backup_mcast_ifn,
3105                                            ipvs->backup_syncid, cb) < 0)
3106                         goto nla_put_failure;
3107
3108                 cb->args[1] = 1;
3109         }
3110
3111 nla_put_failure:
3112         mutex_unlock(&__ip_vs_mutex);
3113
3114         return skb->len;
3115 }
3116
3117 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3118 {
3119         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3120               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3121               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3122                 return -EINVAL;
3123
3124         return start_sync_thread(net,
3125                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3126                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3127                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3128 }
3129
3130 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3131 {
3132         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3133                 return -EINVAL;
3134
3135         return stop_sync_thread(net,
3136                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3137 }
3138
3139 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3140 {
3141         struct ip_vs_timeout_user t;
3142
3143         __ip_vs_get_timeouts(net, &t);
3144
3145         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3146                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3147
3148         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3149                 t.tcp_fin_timeout =
3150                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3151
3152         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3153                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3154
3155         return ip_vs_set_timeout(net, &t);
3156 }
3157
3158 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3159 {
3160         struct ip_vs_service *svc = NULL;
3161         struct ip_vs_service_user_kern usvc;
3162         struct ip_vs_dest_user_kern udest;
3163         int ret = 0, cmd;
3164         int need_full_svc = 0, need_full_dest = 0;
3165         struct net *net;
3166         struct netns_ipvs *ipvs;
3167
3168         net = skb_sknet(skb);
3169         ipvs = net_ipvs(net);
3170         cmd = info->genlhdr->cmd;
3171
3172         mutex_lock(&__ip_vs_mutex);
3173
3174         if (cmd == IPVS_CMD_FLUSH) {
3175                 ret = ip_vs_flush(net);
3176                 goto out;
3177         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3178                 ret = ip_vs_genl_set_config(net, info->attrs);
3179                 goto out;
3180         } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3181                    cmd == IPVS_CMD_DEL_DAEMON) {
3182
3183                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3184
3185                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3186                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3187                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3188                                      ip_vs_daemon_policy)) {
3189                         ret = -EINVAL;
3190                         goto out;
3191                 }
3192
3193                 if (cmd == IPVS_CMD_NEW_DAEMON)
3194                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3195                 else
3196                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3197                 goto out;
3198         } else if (cmd == IPVS_CMD_ZERO &&
3199                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3200                 ret = ip_vs_zero_all(net);
3201                 goto out;
3202         }
3203
3204         /* All following commands require a service argument, so check if we
3205          * received a valid one. We need a full service specification when
3206          * adding / editing a service. Only identifying members otherwise. */
3207         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3208                 need_full_svc = 1;
3209
3210         ret = ip_vs_genl_parse_service(net, &usvc,
3211                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3212                                        need_full_svc, &svc);
3213         if (ret)
3214                 goto out;
3215
3216         /* Unless we're adding a new service, the service must already exist */
3217         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3218                 ret = -ESRCH;
3219                 goto out;
3220         }
3221
3222         /* Destination commands require a valid destination argument. For
3223          * adding / editing a destination, we need a full destination
3224          * specification. */
3225         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3226             cmd == IPVS_CMD_DEL_DEST) {
3227                 if (cmd != IPVS_CMD_DEL_DEST)
3228                         need_full_dest = 1;
3229
3230                 ret = ip_vs_genl_parse_dest(&udest,
3231                                             info->attrs[IPVS_CMD_ATTR_DEST],
3232                                             need_full_dest);
3233                 if (ret)
3234                         goto out;
3235         }
3236
3237         switch (cmd) {
3238         case IPVS_CMD_NEW_SERVICE:
3239                 if (svc == NULL)
3240                         ret = ip_vs_add_service(net, &usvc, &svc);
3241                 else
3242                         ret = -EEXIST;
3243                 break;
3244         case IPVS_CMD_SET_SERVICE:
3245                 ret = ip_vs_edit_service(svc, &usvc);
3246                 break;
3247         case IPVS_CMD_DEL_SERVICE:
3248                 ret = ip_vs_del_service(svc);
3249                 /* do not use svc, it can be freed */
3250                 break;
3251         case IPVS_CMD_NEW_DEST:
3252                 ret = ip_vs_add_dest(svc, &udest);
3253                 break;
3254         case IPVS_CMD_SET_DEST:
3255                 ret = ip_vs_edit_dest(svc, &udest);
3256                 break;
3257         case IPVS_CMD_DEL_DEST:
3258                 ret = ip_vs_del_dest(svc, &udest);
3259                 break;
3260         case IPVS_CMD_ZERO:
3261                 ret = ip_vs_zero_service(svc);
3262                 break;
3263         default:
3264                 ret = -EINVAL;
3265         }
3266
3267 out:
3268         mutex_unlock(&__ip_vs_mutex);
3269
3270         return ret;
3271 }
3272
3273 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3274 {
3275         struct sk_buff *msg;
3276         void *reply;
3277         int ret, cmd, reply_cmd;
3278         struct net *net;
3279         struct netns_ipvs *ipvs;
3280
3281         net = skb_sknet(skb);
3282         ipvs = net_ipvs(net);
3283         cmd = info->genlhdr->cmd;
3284
3285         if (cmd == IPVS_CMD_GET_SERVICE)
3286                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3287         else if (cmd == IPVS_CMD_GET_INFO)
3288                 reply_cmd = IPVS_CMD_SET_INFO;
3289         else if (cmd == IPVS_CMD_GET_CONFIG)
3290                 reply_cmd = IPVS_CMD_SET_CONFIG;
3291         else {
3292                 pr_err("unknown Generic Netlink command\n");
3293                 return -EINVAL;
3294         }
3295
3296         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3297         if (!msg)
3298                 return -ENOMEM;
3299
3300         mutex_lock(&__ip_vs_mutex);
3301
3302         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3303         if (reply == NULL)
3304                 goto nla_put_failure;
3305
3306         switch (cmd) {
3307         case IPVS_CMD_GET_SERVICE:
3308         {
3309                 struct ip_vs_service *svc;
3310
3311                 svc = ip_vs_genl_find_service(net,
3312                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3313                 if (IS_ERR(svc)) {
3314                         ret = PTR_ERR(svc);
3315                         goto out_err;
3316                 } else if (svc) {
3317                         ret = ip_vs_genl_fill_service(msg, svc);
3318                         if (ret)
3319                                 goto nla_put_failure;
3320                 } else {
3321                         ret = -ESRCH;
3322                         goto out_err;
3323                 }
3324
3325                 break;
3326         }
3327
3328         case IPVS_CMD_GET_CONFIG:
3329         {
3330                 struct ip_vs_timeout_user t;
3331
3332                 __ip_vs_get_timeouts(net, &t);
3333 #ifdef CONFIG_IP_VS_PROTO_TCP
3334                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3335                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3336                             t.tcp_fin_timeout);
3337 #endif
3338 #ifdef CONFIG_IP_VS_PROTO_UDP
3339                 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout);
3340 #endif
3341
3342                 break;
3343         }
3344
3345         case IPVS_CMD_GET_INFO:
3346                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE);
3347                 NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3348                             ip_vs_conn_tab_size);
3349                 break;
3350         }
3351
3352         genlmsg_end(msg, reply);
3353         ret = genlmsg_reply(msg, info);
3354         goto out;
3355
3356 nla_put_failure:
3357         pr_err("not enough space in Netlink message\n");
3358         ret = -EMSGSIZE;
3359
3360 out_err:
3361         nlmsg_free(msg);
3362 out:
3363         mutex_unlock(&__ip_vs_mutex);
3364
3365         return ret;
3366 }
3367
3368
3369 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3370         {
3371                 .cmd    = IPVS_CMD_NEW_SERVICE,
3372                 .flags  = GENL_ADMIN_PERM,
3373                 .policy = ip_vs_cmd_policy,
3374                 .doit   = ip_vs_genl_set_cmd,
3375         },
3376         {
3377                 .cmd    = IPVS_CMD_SET_SERVICE,
3378                 .flags  = GENL_ADMIN_PERM,
3379                 .policy = ip_vs_cmd_policy,
3380                 .doit   = ip_vs_genl_set_cmd,
3381         },
3382         {
3383                 .cmd    = IPVS_CMD_DEL_SERVICE,
3384                 .flags  = GENL_ADMIN_PERM,
3385                 .policy = ip_vs_cmd_policy,
3386                 .doit   = ip_vs_genl_set_cmd,
3387         },
3388         {
3389                 .cmd    = IPVS_CMD_GET_SERVICE,
3390                 .flags  = GENL_ADMIN_PERM,
3391                 .doit   = ip_vs_genl_get_cmd,
3392                 .dumpit = ip_vs_genl_dump_services,
3393                 .policy = ip_vs_cmd_policy,
3394         },
3395         {
3396                 .cmd    = IPVS_CMD_NEW_DEST,
3397                 .flags  = GENL_ADMIN_PERM,
3398                 .policy = ip_vs_cmd_policy,
3399                 .doit   = ip_vs_genl_set_cmd,
3400         },
3401         {
3402                 .cmd    = IPVS_CMD_SET_DEST,
3403                 .flags  = GENL_ADMIN_PERM,
3404                 .policy = ip_vs_cmd_policy,
3405                 .doit   = ip_vs_genl_set_cmd,
3406         },
3407         {
3408                 .cmd    = IPVS_CMD_DEL_DEST,
3409                 .flags  = GENL_ADMIN_PERM,
3410                 .policy = ip_vs_cmd_policy,
3411                 .doit   = ip_vs_genl_set_cmd,
3412         },
3413         {
3414                 .cmd    = IPVS_CMD_GET_DEST,
3415                 .flags  = GENL_ADMIN_PERM,
3416                 .policy = ip_vs_cmd_policy,
3417                 .dumpit = ip_vs_genl_dump_dests,
3418         },
3419         {
3420                 .cmd    = IPVS_CMD_NEW_DAEMON,
3421                 .flags  = GENL_ADMIN_PERM,
3422                 .policy = ip_vs_cmd_policy,
3423                 .doit   = ip_vs_genl_set_cmd,
3424         },
3425         {
3426                 .cmd    = IPVS_CMD_DEL_DAEMON,
3427                 .flags  = GENL_ADMIN_PERM,
3428                 .policy = ip_vs_cmd_policy,
3429                 .doit   = ip_vs_genl_set_cmd,
3430         },
3431         {
3432                 .cmd    = IPVS_CMD_GET_DAEMON,
3433                 .flags  = GENL_ADMIN_PERM,
3434                 .dumpit = ip_vs_genl_dump_daemons,
3435         },
3436         {
3437                 .cmd    = IPVS_CMD_SET_CONFIG,
3438                 .flags  = GENL_ADMIN_PERM,
3439                 .policy = ip_vs_cmd_policy,
3440                 .doit   = ip_vs_genl_set_cmd,
3441         },
3442         {
3443                 .cmd    = IPVS_CMD_GET_CONFIG,
3444                 .flags  = GENL_ADMIN_PERM,
3445                 .doit   = ip_vs_genl_get_cmd,
3446         },
3447         {
3448                 .cmd    = IPVS_CMD_GET_INFO,
3449                 .flags  = GENL_ADMIN_PERM,
3450                 .doit   = ip_vs_genl_get_cmd,
3451         },
3452         {
3453                 .cmd    = IPVS_CMD_ZERO,
3454                 .flags  = GENL_ADMIN_PERM,
3455                 .policy = ip_vs_cmd_policy,
3456                 .doit   = ip_vs_genl_set_cmd,
3457         },
3458         {
3459                 .cmd    = IPVS_CMD_FLUSH,
3460                 .flags  = GENL_ADMIN_PERM,
3461                 .doit   = ip_vs_genl_set_cmd,
3462         },
3463 };
3464
3465 static int __init ip_vs_genl_register(void)
3466 {
3467         return genl_register_family_with_ops(&ip_vs_genl_family,
3468                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3469 }
3470
3471 static void ip_vs_genl_unregister(void)
3472 {
3473         genl_unregister_family(&ip_vs_genl_family);
3474 }
3475
3476 /* End of Generic Netlink interface definitions */
3477
3478 /*
3479  * per netns intit/exit func.
3480  */
3481 int __net_init __ip_vs_control_init(struct net *net)
3482 {
3483         int idx;
3484         struct netns_ipvs *ipvs = net_ipvs(net);
3485         struct ctl_table *tbl;
3486
3487         atomic_set(&ipvs->dropentry, 0);
3488         spin_lock_init(&ipvs->dropentry_lock);
3489         spin_lock_init(&ipvs->droppacket_lock);
3490         spin_lock_init(&ipvs->securetcp_lock);
3491         ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
3492
3493         /* Initialize rs_table */
3494         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3495                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3496
3497         INIT_LIST_HEAD(&ipvs->dest_trash);
3498         atomic_set(&ipvs->ftpsvc_counter, 0);
3499         atomic_set(&ipvs->nullsvc_counter, 0);
3500
3501         /* procfs stats */
3502         ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
3503         if (ipvs->tot_stats == NULL) {
3504                 pr_err("%s(): no memory.\n", __func__);
3505                 return -ENOMEM;
3506         }
3507         ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3508         if (!ipvs->cpustats) {
3509                 pr_err("%s() alloc_percpu failed\n", __func__);
3510                 goto err_alloc;
3511         }
3512         spin_lock_init(&ipvs->tot_stats->lock);
3513
3514         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3515                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3516
3517         proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
3518         proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
3519         proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
3520                              &ip_vs_stats_percpu_fops);
3521
3522         if (!net_eq(net, &init_net)) {
3523                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3524                 if (tbl == NULL)
3525                         goto err_dup;
3526         } else
3527                 tbl = vs_vars;
3528         /* Initialize sysctl defaults */
3529         idx = 0;
3530         ipvs->sysctl_amemthresh = 1024;
3531         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3532         ipvs->sysctl_am_droprate = 10;
3533         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3534         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3535         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3536 #ifdef CONFIG_IP_VS_NFCT
3537         tbl[idx++].data = &ipvs->sysctl_conntrack;
3538 #endif
3539         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3540         ipvs->sysctl_snat_reroute = 1;
3541         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3542         ipvs->sysctl_sync_ver = 1;
3543         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3544         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3545         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3546         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3547         ipvs->sysctl_sync_threshold[0] = 3;
3548         ipvs->sysctl_sync_threshold[1] = 50;
3549         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3550         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3551         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3552
3553
3554         ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
3555                                                   vs_vars);
3556         if (ipvs->sysctl_hdr == NULL)
3557                 goto err_reg;
3558         ip_vs_new_estimator(net, ipvs->tot_stats);
3559         ipvs->sysctl_tbl = tbl;
3560         /* Schedule defense work */
3561         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3562         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3563         return 0;
3564
3565 err_reg:
3566         if (!net_eq(net, &init_net))
3567                 kfree(tbl);
3568 err_dup:
3569         free_percpu(ipvs->cpustats);
3570 err_alloc:
3571         kfree(ipvs->tot_stats);
3572         return -ENOMEM;
3573 }
3574
3575 static void __net_exit __ip_vs_control_cleanup(struct net *net)
3576 {
3577         struct netns_ipvs *ipvs = net_ipvs(net);
3578
3579         ip_vs_trash_cleanup(net);
3580         ip_vs_kill_estimator(net, ipvs->tot_stats);
3581         cancel_delayed_work_sync(&ipvs->defense_work);
3582         cancel_work_sync(&ipvs->defense_work.work);
3583         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3584         proc_net_remove(net, "ip_vs_stats_percpu");
3585         proc_net_remove(net, "ip_vs_stats");
3586         proc_net_remove(net, "ip_vs");
3587         free_percpu(ipvs->cpustats);
3588         kfree(ipvs->tot_stats);
3589 }
3590
3591 static struct pernet_operations ipvs_control_ops = {
3592         .init = __ip_vs_control_init,
3593         .exit = __ip_vs_control_cleanup,
3594 };
3595
3596 int __init ip_vs_control_init(void)
3597 {
3598         int idx;
3599         int ret;
3600
3601         EnterFunction(2);
3602
3603         /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3604         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
3605                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3606                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3607         }
3608
3609         ret = register_pernet_subsys(&ipvs_control_ops);
3610         if (ret) {
3611                 pr_err("cannot register namespace.\n");
3612                 goto err;
3613         }
3614
3615         smp_wmb();      /* Do we really need it now ? */
3616
3617         ret = nf_register_sockopt(&ip_vs_sockopts);
3618         if (ret) {
3619                 pr_err("cannot register sockopt.\n");
3620                 goto err_net;
3621         }
3622
3623         ret = ip_vs_genl_register();
3624         if (ret) {
3625                 pr_err("cannot register Generic Netlink interface.\n");
3626                 nf_unregister_sockopt(&ip_vs_sockopts);
3627                 goto err_net;
3628         }
3629
3630         LeaveFunction(2);
3631         return 0;
3632
3633 err_net:
3634         unregister_pernet_subsys(&ipvs_control_ops);
3635 err:
3636         return ret;
3637 }
3638
3639
3640 void ip_vs_control_cleanup(void)
3641 {
3642         EnterFunction(2);
3643         unregister_pernet_subsys(&ipvs_control_ops);
3644         ip_vs_genl_unregister();
3645         nf_unregister_sockopt(&ip_vs_sockopts);
3646         LeaveFunction(2);
3647 }