ipvs: reorganize tot_stats
[linux-2.6.git] / net / netfilter / ipvs / ip_vs_core.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *      Paul `Rusty' Russell            properly handle non-linear skbs
23  *      Harald Welte                    don't use nfcache
24  *
25  */
26
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
30 #include <linux/module.h>
31 #include <linux/kernel.h>
32 #include <linux/ip.h>
33 #include <linux/tcp.h>
34 #include <linux/sctp.h>
35 #include <linux/icmp.h>
36 #include <linux/slab.h>
37
38 #include <net/ip.h>
39 #include <net/tcp.h>
40 #include <net/udp.h>
41 #include <net/icmp.h>                   /* for icmp_send */
42 #include <net/route.h>
43 #include <net/ip6_checksum.h>
44 #include <net/netns/generic.h>          /* net_generic() */
45
46 #include <linux/netfilter.h>
47 #include <linux/netfilter_ipv4.h>
48
49 #ifdef CONFIG_IP_VS_IPV6
50 #include <net/ipv6.h>
51 #include <linux/netfilter_ipv6.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 #include <net/ip_vs.h>
56
57
58 EXPORT_SYMBOL(register_ip_vs_scheduler);
59 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
60 EXPORT_SYMBOL(ip_vs_proto_name);
61 EXPORT_SYMBOL(ip_vs_conn_new);
62 EXPORT_SYMBOL(ip_vs_conn_in_get);
63 EXPORT_SYMBOL(ip_vs_conn_out_get);
64 #ifdef CONFIG_IP_VS_PROTO_TCP
65 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
66 #endif
67 EXPORT_SYMBOL(ip_vs_conn_put);
68 #ifdef CONFIG_IP_VS_DEBUG
69 EXPORT_SYMBOL(ip_vs_get_debug_level);
70 #endif
71
72 int ip_vs_net_id __read_mostly;
73 #ifdef IP_VS_GENERIC_NETNS
74 EXPORT_SYMBOL(ip_vs_net_id);
75 #endif
76 /* netns cnt used for uniqueness */
77 static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
78
/*
 * ID used in ICMP lookups.
 * Both macros take a pointer to the ICMP/ICMPv6 header.  The argument is
 * parenthesized so that pointer expressions such as "hdr + 1" expand
 * correctly.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        (((icmph)->icmp6_dataun).u_echo.identifier)
82
/*
 * Return a printable name for an IP protocol number.
 *
 * Known protocols map to string literals.  Any other value is formatted
 * as "IP_<number>" into a single static buffer, so that result is
 * overwritten by the next call that takes the default branch and the
 * function is not reentrant for unknown protocols.
 */
const char *ip_vs_proto_name(unsigned proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_SCTP:
		return "SCTP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* proto is unsigned: use %u and bound the write to buf */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
107
108 void ip_vs_init_hash_table(struct list_head *table, int rows)
109 {
110         while (--rows >= 0)
111                 INIT_LIST_HEAD(&table[rows]);
112 }
113
114 static inline void
115 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
116 {
117         struct ip_vs_dest *dest = cp->dest;
118         struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
119
120         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
121                 struct ip_vs_cpu_stats *s;
122
123                 s = this_cpu_ptr(dest->stats.cpustats);
124                 s->ustats.inpkts++;
125                 u64_stats_update_begin(&s->syncp);
126                 s->ustats.inbytes += skb->len;
127                 u64_stats_update_end(&s->syncp);
128
129                 s = this_cpu_ptr(dest->svc->stats.cpustats);
130                 s->ustats.inpkts++;
131                 u64_stats_update_begin(&s->syncp);
132                 s->ustats.inbytes += skb->len;
133                 u64_stats_update_end(&s->syncp);
134
135                 s = this_cpu_ptr(ipvs->tot_stats.cpustats);
136                 s->ustats.inpkts++;
137                 u64_stats_update_begin(&s->syncp);
138                 s->ustats.inbytes += skb->len;
139                 u64_stats_update_end(&s->syncp);
140         }
141 }
142
143
144 static inline void
145 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
146 {
147         struct ip_vs_dest *dest = cp->dest;
148         struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
149
150         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
151                 struct ip_vs_cpu_stats *s;
152
153                 s = this_cpu_ptr(dest->stats.cpustats);
154                 s->ustats.outpkts++;
155                 u64_stats_update_begin(&s->syncp);
156                 s->ustats.outbytes += skb->len;
157                 u64_stats_update_end(&s->syncp);
158
159                 s = this_cpu_ptr(dest->svc->stats.cpustats);
160                 s->ustats.outpkts++;
161                 u64_stats_update_begin(&s->syncp);
162                 s->ustats.outbytes += skb->len;
163                 u64_stats_update_end(&s->syncp);
164
165                 s = this_cpu_ptr(ipvs->tot_stats.cpustats);
166                 s->ustats.outpkts++;
167                 u64_stats_update_begin(&s->syncp);
168                 s->ustats.outbytes += skb->len;
169                 u64_stats_update_end(&s->syncp);
170         }
171 }
172
173
174 static inline void
175 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
176 {
177         struct netns_ipvs *ipvs = net_ipvs(svc->net);
178         struct ip_vs_cpu_stats *s;
179
180         s = this_cpu_ptr(cp->dest->stats.cpustats);
181         s->ustats.conns++;
182
183         s = this_cpu_ptr(svc->stats.cpustats);
184         s->ustats.conns++;
185
186         s = this_cpu_ptr(ipvs->tot_stats.cpustats);
187         s->ustats.conns++;
188 }
189
190
191 static inline int
192 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
193                 const struct sk_buff *skb,
194                 struct ip_vs_proto_data *pd)
195 {
196         if (unlikely(!pd->pp->state_transition))
197                 return 0;
198         return pd->pp->state_transition(cp, direction, skb, pd);
199 }
200
201 static inline int
202 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
203                               struct sk_buff *skb, int protocol,
204                               const union nf_inet_addr *caddr, __be16 cport,
205                               const union nf_inet_addr *vaddr, __be16 vport,
206                               struct ip_vs_conn_param *p)
207 {
208         ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
209                               vport, p);
210         p->pe = svc->pe;
211         if (p->pe && p->pe->fill_param)
212                 return p->pe->fill_param(p, skb);
213
214         return 0;
215 }
216
217 /*
218  *  IPVS persistent scheduling function
219  *  It creates a connection entry according to its template if exists,
220  *  or selects a server and creates a connection entry plus a template.
221  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
222  *  Protocols supported: TCP, UDP
223  */
224 static struct ip_vs_conn *
225 ip_vs_sched_persist(struct ip_vs_service *svc,
226                     struct sk_buff *skb,
227                     __be16 src_port, __be16 dst_port, int *ignored)
228 {
229         struct ip_vs_conn *cp = NULL;
230         struct ip_vs_iphdr iph;
231         struct ip_vs_dest *dest;
232         struct ip_vs_conn *ct;
233         __be16 dport = 0;               /* destination port to forward */
234         unsigned int flags;
235         struct ip_vs_conn_param param;
236         union nf_inet_addr snet;        /* source network of the client,
237                                            after masking */
238
239         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
240
241         /* Mask saddr with the netmask to adjust template granularity */
242 #ifdef CONFIG_IP_VS_IPV6
243         if (svc->af == AF_INET6)
244                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
245         else
246 #endif
247                 snet.ip = iph.saddr.ip & svc->netmask;
248
249         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
250                       "mnet %s\n",
251                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
252                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
253                       IP_VS_DBG_ADDR(svc->af, &snet));
254
255         /*
256          * As far as we know, FTP is a very complicated network protocol, and
257          * it uses control connection and data connections. For active FTP,
258          * FTP server initialize data connection to the client, its source port
259          * is often 20. For passive FTP, FTP server tells the clients the port
260          * that it passively listens to,  and the client issues the data
261          * connection. In the tunneling or direct routing mode, the load
262          * balancer is on the client-to-server half of connection, the port
263          * number is unknown to the load balancer. So, a conn template like
264          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
265          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
266          * is created for other persistent services.
267          */
268         {
269                 int protocol = iph.protocol;
270                 const union nf_inet_addr *vaddr = &iph.daddr;
271                 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
272                 __be16 vport = 0;
273
274                 if (dst_port == svc->port) {
275                         /* non-FTP template:
276                          * <protocol, caddr, 0, vaddr, vport, daddr, dport>
277                          * FTP template:
278                          * <protocol, caddr, 0, vaddr, 0, daddr, 0>
279                          */
280                         if (svc->port != FTPPORT)
281                                 vport = dst_port;
282                 } else {
283                         /* Note: persistent fwmark-based services and
284                          * persistent port zero service are handled here.
285                          * fwmark template:
286                          * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
287                          * port zero template:
288                          * <protocol,caddr,0,vaddr,0,daddr,0>
289                          */
290                         if (svc->fwmark) {
291                                 protocol = IPPROTO_IP;
292                                 vaddr = &fwmark;
293                         }
294                 }
295                 /* return *ignored = -1 so NF_DROP can be used */
296                 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
297                                                   vaddr, vport, &param) < 0) {
298                         *ignored = -1;
299                         return NULL;
300                 }
301         }
302
303         /* Check if a template already exists */
304         ct = ip_vs_ct_in_get(&param);
305         if (!ct || !ip_vs_check_template(ct)) {
306                 /*
307                  * No template found or the dest of the connection
308                  * template is not available.
309                  * return *ignored=0 i.e. ICMP and NF_DROP
310                  */
311                 dest = svc->scheduler->schedule(svc, skb);
312                 if (!dest) {
313                         IP_VS_DBG(1, "p-schedule: no dest found.\n");
314                         kfree(param.pe_data);
315                         *ignored = 0;
316                         return NULL;
317                 }
318
319                 if (dst_port == svc->port && svc->port != FTPPORT)
320                         dport = dest->port;
321
322                 /* Create a template
323                  * This adds param.pe_data to the template,
324                  * and thus param.pe_data will be destroyed
325                  * when the template expires */
326                 ct = ip_vs_conn_new(&param, &dest->addr, dport,
327                                     IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
328                 if (ct == NULL) {
329                         kfree(param.pe_data);
330                         *ignored = -1;
331                         return NULL;
332                 }
333
334                 ct->timeout = svc->timeout;
335         } else {
336                 /* set destination with the found template */
337                 dest = ct->dest;
338                 kfree(param.pe_data);
339         }
340
341         dport = dst_port;
342         if (dport == svc->port && dest->port)
343                 dport = dest->port;
344
345         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
346                  && iph.protocol == IPPROTO_UDP)?
347                 IP_VS_CONN_F_ONE_PACKET : 0;
348
349         /*
350          *    Create a new connection according to the template
351          */
352         ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
353                               src_port, &iph.daddr, dst_port, &param);
354
355         cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
356         if (cp == NULL) {
357                 ip_vs_conn_put(ct);
358                 *ignored = -1;
359                 return NULL;
360         }
361
362         /*
363          *    Add its control
364          */
365         ip_vs_control_add(cp, ct);
366         ip_vs_conn_put(ct);
367
368         ip_vs_conn_stats(cp, svc);
369         return cp;
370 }
371
372
373 /*
374  *  IPVS main scheduling function
375  *  It selects a server according to the virtual service, and
376  *  creates a connection entry.
377  *  Protocols supported: TCP, UDP
378  *
379  *  Usage of *ignored
380  *
381  * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
382  *       svc/scheduler decides that this packet should be accepted with
383  *       NF_ACCEPT because it must not be scheduled.
384  *
385  * 0 :   scheduler can not find destination, so try bypass or
386  *       return ICMP and then NF_DROP (ip_vs_leave).
387  *
388  * -1 :  scheduler tried to schedule but fatal error occurred, eg.
389  *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
390  *       failure such as missing Call-ID, ENOMEM on skb_linearize
391  *       or pe_data. In this case we should return NF_DROP without
392  *       any attempts to send ICMP with ip_vs_leave.
393  */
394 struct ip_vs_conn *
395 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
396                struct ip_vs_proto_data *pd, int *ignored)
397 {
398         struct ip_vs_protocol *pp = pd->pp;
399         struct ip_vs_conn *cp = NULL;
400         struct ip_vs_iphdr iph;
401         struct ip_vs_dest *dest;
402         __be16 _ports[2], *pptr;
403         unsigned int flags;
404
405         *ignored = 1;
406         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
407         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
408         if (pptr == NULL)
409                 return NULL;
410
411         /*
412          * FTPDATA needs this check when using local real server.
413          * Never schedule Active FTPDATA connections from real server.
414          * For LVS-NAT they must be already created. For other methods
415          * with persistence the connection is created on SYN+ACK.
416          */
417         if (pptr[0] == FTPDATA) {
418                 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
419                               "Not scheduling FTPDATA");
420                 return NULL;
421         }
422
423         /*
424          *    Do not schedule replies from local real server.
425          */
426         if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
427             (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
428                 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
429                               "Not scheduling reply for existing connection");
430                 __ip_vs_conn_put(cp);
431                 return NULL;
432         }
433
434         /*
435          *    Persistent service
436          */
437         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
438                 return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
439
440         *ignored = 0;
441
442         /*
443          *    Non-persistent service
444          */
445         if (!svc->fwmark && pptr[1] != svc->port) {
446                 if (!svc->port)
447                         pr_err("Schedule: port zero only supported "
448                                "in persistent services, "
449                                "check your ipvs configuration\n");
450                 return NULL;
451         }
452
453         dest = svc->scheduler->schedule(svc, skb);
454         if (dest == NULL) {
455                 IP_VS_DBG(1, "Schedule: no dest found.\n");
456                 return NULL;
457         }
458
459         flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
460                  && iph.protocol == IPPROTO_UDP)?
461                 IP_VS_CONN_F_ONE_PACKET : 0;
462
463         /*
464          *    Create a connection entry.
465          */
466         {
467                 struct ip_vs_conn_param p;
468
469                 ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
470                                       &iph.saddr, pptr[0], &iph.daddr, pptr[1],
471                                       &p);
472                 cp = ip_vs_conn_new(&p, &dest->addr,
473                                     dest->port ? dest->port : pptr[1],
474                                     flags, dest, skb->mark);
475                 if (!cp) {
476                         *ignored = -1;
477                         return NULL;
478                 }
479         }
480
481         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
482                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
483                       ip_vs_fwd_tag(cp),
484                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
485                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
486                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
487                       cp->flags, atomic_read(&cp->refcnt));
488
489         ip_vs_conn_stats(cp, svc);
490         return cp;
491 }
492
493
494 /*
495  *  Pass or drop the packet.
496  *  Called by ip_vs_in, when the virtual service is available but
497  *  no destination is available for a new connection.
498  */
499 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
500                 struct ip_vs_proto_data *pd)
501 {
502         struct net *net;
503         struct netns_ipvs *ipvs;
504         __be16 _ports[2], *pptr;
505         struct ip_vs_iphdr iph;
506         int unicast;
507
508         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
509
510         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
511         if (pptr == NULL) {
512                 ip_vs_service_put(svc);
513                 return NF_DROP;
514         }
515         net = skb_net(skb);
516
517 #ifdef CONFIG_IP_VS_IPV6
518         if (svc->af == AF_INET6)
519                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
520         else
521 #endif
522                 unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
523
524         /* if it is fwmark-based service, the cache_bypass sysctl is up
525            and the destination is a non-local unicast, then create
526            a cache_bypass connection entry */
527         ipvs = net_ipvs(net);
528         if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
529                 int ret, cs;
530                 struct ip_vs_conn *cp;
531                 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
532                                       iph.protocol == IPPROTO_UDP)?
533                                       IP_VS_CONN_F_ONE_PACKET : 0;
534                 union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
535
536                 ip_vs_service_put(svc);
537
538                 /* create a new connection entry */
539                 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
540                 {
541                         struct ip_vs_conn_param p;
542                         ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
543                                               &iph.saddr, pptr[0],
544                                               &iph.daddr, pptr[1], &p);
545                         cp = ip_vs_conn_new(&p, &daddr, 0,
546                                             IP_VS_CONN_F_BYPASS | flags,
547                                             NULL, skb->mark);
548                         if (!cp)
549                                 return NF_DROP;
550                 }
551
552                 /* statistics */
553                 ip_vs_in_stats(cp, skb);
554
555                 /* set state */
556                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
557
558                 /* transmit the first SYN packet */
559                 ret = cp->packet_xmit(skb, cp, pd->pp);
560                 /* do not touch skb anymore */
561
562                 atomic_inc(&cp->in_pkts);
563                 ip_vs_conn_put(cp);
564                 return ret;
565         }
566
567         /*
568          * When the virtual ftp service is presented, packets destined
569          * for other services on the VIP may get here (except services
570          * listed in the ipvs table), pass the packets, because it is
571          * not ipvs job to decide to drop the packets.
572          */
573         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
574                 ip_vs_service_put(svc);
575                 return NF_ACCEPT;
576         }
577
578         ip_vs_service_put(svc);
579
580         /*
581          * Notify the client that the destination is unreachable, and
582          * release the socket buffer.
583          * Since it is in IP layer, the TCP socket is not actually
584          * created, the TCP RST packet cannot be sent, instead that
585          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
586          */
587 #ifdef CONFIG_IP_VS_IPV6
588         if (svc->af == AF_INET6) {
589                 if (!skb->dev) {
590                         struct net *net = dev_net(skb_dst(skb)->dev);
591
592                         skb->dev = net->loopback_dev;
593                 }
594                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
595         } else
596 #endif
597                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
598
599         return NF_DROP;
600 }
601
602 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
603 {
604         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
605 }
606
607 static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
608 {
609         if (NF_INET_LOCAL_IN == hooknum)
610                 return IP_DEFRAG_VS_IN;
611         if (NF_INET_FORWARD == hooknum)
612                 return IP_DEFRAG_VS_FWD;
613         return IP_DEFRAG_VS_OUT;
614 }
615
616 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
617 {
618         int err = ip_defrag(skb, user);
619
620         if (!err)
621                 ip_send_check(ip_hdr(skb));
622
623         return err;
624 }
625
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_gather_frags().  Currently a stub: the
 * arguments are ignored and 0 is always returned.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
	/* TODO IPv6: Find out what to do here for IPv6 */
	return 0;
}
#endif
633
634 /*
635  * Packet has been made sufficiently writable in caller
636  * - inout: 1=in->out, 0=out->in
637  */
638 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
639                     struct ip_vs_conn *cp, int inout)
640 {
641         struct iphdr *iph        = ip_hdr(skb);
642         unsigned int icmp_offset = iph->ihl*4;
643         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
644                                                       icmp_offset);
645         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
646
647         if (inout) {
648                 iph->saddr = cp->vaddr.ip;
649                 ip_send_check(iph);
650                 ciph->daddr = cp->vaddr.ip;
651                 ip_send_check(ciph);
652         } else {
653                 iph->daddr = cp->daddr.ip;
654                 ip_send_check(iph);
655                 ciph->saddr = cp->daddr.ip;
656                 ip_send_check(ciph);
657         }
658
659         /* the TCP/UDP/SCTP port */
660         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
661             IPPROTO_SCTP == ciph->protocol) {
662                 __be16 *ports = (void *)ciph + ciph->ihl*4;
663
664                 if (inout)
665                         ports[1] = cp->vport;
666                 else
667                         ports[0] = cp->dport;
668         }
669
670         /* And finally the ICMP checksum */
671         icmph->checksum = 0;
672         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
673         skb->ip_summed = CHECKSUM_UNNECESSARY;
674
675         if (inout)
676                 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
677                         "Forwarding altered outgoing ICMP");
678         else
679                 IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
680                         "Forwarding altered incoming ICMP");
681 }
682
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 variant of ip_vs_nat_icmp(): rewrite the addresses (and, for
 * TCP/UDP/SCTP, the port) of an ICMPv6 message and of the IPv6 header
 * embedded in it.
 *
 * - inout: non-zero rewrites towards the virtual address/port, zero
 *   towards the real server address/port.
 *
 * The ICMPv6 checksum is not completed here: the pseudo-header sum is
 * stored and the skb is marked CHECKSUM_PARTIAL with csum_start and
 * csum_offset pointing at the ICMPv6 checksum field.
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);
	unsigned int icmp_offset = sizeof(struct ipv6hdr);
	struct icmp6hdr *icmph;
	struct ipv6hdr *ciph;

	icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
	/* the embedded IPv6 header follows the ICMPv6 header directly */
	ciph = (struct ipv6hdr *)(icmph + 1);

	if (inout) {
		iph->saddr = cp->vaddr.in6;
		ciph->daddr = cp->vaddr.in6;
	} else {
		iph->daddr = cp->daddr.in6;
		ciph->saddr = cp->daddr.in6;
	}

	/* the TCP/UDP/SCTP port */
	switch (ciph->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP: {
		__be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);

		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
		break;
	}
	default:
		break;
	}

	/* And finally the ICMP checksum */
	icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
					      skb->len - icmp_offset,
					      IPPROTO_ICMPV6, 0);
	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (inout)
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)ciph - (void *)iph,
			      "Forwarding altered outgoing ICMPv6");
	else
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)ciph - (void *)iph,
			      "Forwarding altered incoming ICMPv6");
}
#endif
730
731 /* Handle relevant response ICMP messages - forward to the right
732  * destination host.
733  */
734 static int handle_response_icmp(int af, struct sk_buff *skb,
735                                 union nf_inet_addr *snet,
736                                 __u8 protocol, struct ip_vs_conn *cp,
737                                 struct ip_vs_protocol *pp,
738                                 unsigned int offset, unsigned int ihl)
739 {
740         struct netns_ipvs *ipvs;
741         unsigned int verdict = NF_DROP;
742
743         if (IP_VS_FWD_METHOD(cp) != 0) {
744                 pr_err("shouldn't reach here, because the box is on the "
745                        "half connection in the tun/dr module.\n");
746         }
747
748         /* Ensure the checksum is correct */
749         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
750                 /* Failed checksum! */
751                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
752                               IP_VS_DBG_ADDR(af, snet));
753                 goto out;
754         }
755
756         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
757             IPPROTO_SCTP == protocol)
758                 offset += 2 * sizeof(__u16);
759         if (!skb_make_writable(skb, offset))
760                 goto out;
761
762         ipvs = net_ipvs(skb_net(skb));
763
764 #ifdef CONFIG_IP_VS_IPV6
765         if (af == AF_INET6)
766                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
767         else
768 #endif
769                 ip_vs_nat_icmp(skb, pp, cp, 1);
770
771 #ifdef CONFIG_IP_VS_IPV6
772         if (af == AF_INET6) {
773                 if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
774                         goto out;
775         } else
776 #endif
777                 if ((ipvs->sysctl_snat_reroute ||
778                      skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
779                     ip_route_me_harder(skb, RTN_LOCAL) != 0)
780                         goto out;
781
782         /* do the statistics and put it back */
783         ip_vs_out_stats(cp, skb);
784
785         skb->ipvs_property = 1;
786         if (!(cp->flags & IP_VS_CONN_F_NFCT))
787                 ip_vs_notrack(skb);
788         else
789                 ip_vs_update_conntrack(skb, cp, 0);
790         verdict = NF_ACCEPT;
791
792 out:
793         __ip_vs_conn_put(cp);
794
795         return verdict;
796 }
797
798 /*
799  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
800  *      Find any that might be relevant, check against existing connections.
801  *      Currently handles error types - unreachable, quench, ttl exceeded.
802  */
803 static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
804                           unsigned int hooknum)
805 {
806         struct iphdr *iph;
807         struct icmphdr  _icmph, *ic;
808         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
809         struct ip_vs_iphdr ciph;
810         struct ip_vs_conn *cp;
811         struct ip_vs_protocol *pp;
812         unsigned int offset, ihl;
813         union nf_inet_addr snet;
814
815         *related = 1;
816
817         /* reassemble IP fragments */
818         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
819                 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
820                         return NF_STOLEN;
821         }
822
823         iph = ip_hdr(skb);
824         offset = ihl = iph->ihl * 4;
825         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
826         if (ic == NULL)
827                 return NF_DROP;
828
829         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
830                   ic->type, ntohs(icmp_id(ic)),
831                   &iph->saddr, &iph->daddr);
832
833         /*
834          * Work through seeing if this is for us.
835          * These checks are supposed to be in an order that means easy
836          * things are checked first to speed up processing.... however
837          * this means that some packets will manage to get a long way
838          * down this stack and then be rejected, but that's life.
839          */
840         if ((ic->type != ICMP_DEST_UNREACH) &&
841             (ic->type != ICMP_SOURCE_QUENCH) &&
842             (ic->type != ICMP_TIME_EXCEEDED)) {
843                 *related = 0;
844                 return NF_ACCEPT;
845         }
846
847         /* Now find the contained IP header */
848         offset += sizeof(_icmph);
849         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
850         if (cih == NULL)
851                 return NF_ACCEPT; /* The packet looks wrong, ignore */
852
853         pp = ip_vs_proto_get(cih->protocol);
854         if (!pp)
855                 return NF_ACCEPT;
856
857         /* Is the embedded protocol header present? */
858         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
859                      pp->dont_defrag))
860                 return NF_ACCEPT;
861
862         IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
863                       "Checking outgoing ICMP for");
864
865         offset += cih->ihl * 4;
866
867         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
868         /* The embedded headers contain source and dest in reverse order */
869         cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
870         if (!cp)
871                 return NF_ACCEPT;
872
873         snet.ip = iph->saddr;
874         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
875                                     pp, offset, ihl);
876 }
877
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_out_icmp().
 *
 *	Only destination unreachable, packet too big and time exceeded
 *	are examined; any other ICMPv6 type clears *related and is accepted.
 *	The embedded IPv6 header is looked up with the protocol's
 *	conn_out_get(); on a hit the packet goes to handle_response_icmp().
 */
static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
			     unsigned int hooknum)
{
	struct ipv6hdr *outer;
	struct icmp6hdr icmp_buf, *icmph;
	struct ipv6hdr inner_buf, *inner;	/* IPv6 header carried inside
						   the ICMPv6 message */
	struct ip_vs_iphdr inner_info;
	struct ip_vs_conn *conn;
	struct ip_vs_protocol *proto;
	unsigned int off;
	union nf_inet_addr src;

	*related = 1;

	/* reassemble IP fragments */
	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT &&
	    ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
		return NF_STOLEN;

	outer = ipv6_hdr(skb);
	off = sizeof(struct ipv6hdr);
	icmph = skb_header_pointer(skb, off, sizeof(icmp_buf), &icmp_buf);
	if (!icmph)
		return NF_DROP;

	IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
		  icmph->icmp6_type, ntohs(icmpv6_id(icmph)),
		  &outer->saddr, &outer->daddr);

	/*
	 * Cheap tests come first.  Some packets will still travel a long
	 * way down this function before being let go, but that's life.
	 */
	switch (icmph->icmp6_type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
		break;
	default:
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	off += sizeof(icmp_buf);
	inner = skb_header_pointer(skb, off, sizeof(inner_buf), &inner_buf);
	if (!inner)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	proto = ip_vs_proto_get(inner->nexthdr);
	if (!proto)
		return NF_ACCEPT;

	/* Is the embedded protocol header present? */
	/* TODO: we don't support fragmentation at the moment anyways */
	if (unlikely(inner->nexthdr == IPPROTO_FRAGMENT && proto->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET6, proto, skb, off,
		      "Checking outgoing ICMPv6 for");

	off += sizeof(struct ipv6hdr);

	ip_vs_fill_iphdr(AF_INET6, inner, &inner_info);
	/* The embedded headers contain source and dest in reverse order */
	conn = proto->conn_out_get(AF_INET6, skb, &inner_info, off, 1);
	if (!conn)
		return NF_ACCEPT;

	ipv6_addr_copy(&src.in6, &outer->saddr);
	return handle_response_icmp(AF_INET6, skb, &src, inner->nexthdr, conn,
				    proto, off, sizeof(struct ipv6hdr));
}
#endif
955
956 /*
957  * Check if sctp chunc is ABORT chunk
958  */
959 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
960 {
961         sctp_chunkhdr_t *sch, schunk;
962         sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
963                         sizeof(schunk), &schunk);
964         if (sch == NULL)
965                 return 0;
966         if (sch->type == SCTP_CID_ABORT)
967                 return 1;
968         return 0;
969 }
970
971 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
972 {
973         struct tcphdr _tcph, *th;
974
975         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
976         if (th == NULL)
977                 return 0;
978         return th->rst;
979 }
980
981 /* Handle response packets: rewrite addresses and send away...
982  */
983 static unsigned int
984 handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
985                 struct ip_vs_conn *cp, int ihl)
986 {
987         struct ip_vs_protocol *pp = pd->pp;
988         struct netns_ipvs *ipvs;
989
990         IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
991
992         if (!skb_make_writable(skb, ihl))
993                 goto drop;
994
995         /* mangle the packet */
996         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
997                 goto drop;
998
999 #ifdef CONFIG_IP_VS_IPV6
1000         if (af == AF_INET6)
1001                 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
1002         else
1003 #endif
1004         {
1005                 ip_hdr(skb)->saddr = cp->vaddr.ip;
1006                 ip_send_check(ip_hdr(skb));
1007         }
1008
1009         /*
1010          * nf_iterate does not expect change in the skb->dst->dev.
1011          * It looks like it is not fatal to enable this code for hooks
1012          * where our handlers are at the end of the chain list and
1013          * when all next handlers use skb->dst->dev and not outdev.
1014          * It will definitely route properly the inout NAT traffic
1015          * when multiple paths are used.
1016          */
1017
1018         /* For policy routing, packets originating from this
1019          * machine itself may be routed differently to packets
1020          * passing through.  We want this packet to be routed as
1021          * if it came from this machine itself.  So re-compute
1022          * the routing information.
1023          */
1024         ipvs = net_ipvs(skb_net(skb));
1025
1026 #ifdef CONFIG_IP_VS_IPV6
1027         if (af == AF_INET6) {
1028                 if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
1029                         goto drop;
1030         } else
1031 #endif
1032                 if ((ipvs->sysctl_snat_reroute ||
1033                      skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
1034                     ip_route_me_harder(skb, RTN_LOCAL) != 0)
1035                         goto drop;
1036
1037         IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
1038
1039         ip_vs_out_stats(cp, skb);
1040         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
1041         skb->ipvs_property = 1;
1042         if (!(cp->flags & IP_VS_CONN_F_NFCT))
1043                 ip_vs_notrack(skb);
1044         else
1045                 ip_vs_update_conntrack(skb, cp, 0);
1046         ip_vs_conn_put(cp);
1047
1048         LeaveFunction(11);
1049         return NF_ACCEPT;
1050
1051 drop:
1052         ip_vs_conn_put(cp);
1053         kfree_skb(skb);
1054         LeaveFunction(11);
1055         return NF_STOLEN;
1056 }
1057
1058 /*
1059  *      Check if outgoing packet belongs to the established ip_vs_conn.
1060  */
1061 static unsigned int
1062 ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1063 {
1064         struct net *net = NULL;
1065         struct ip_vs_iphdr iph;
1066         struct ip_vs_protocol *pp;
1067         struct ip_vs_proto_data *pd;
1068         struct ip_vs_conn *cp;
1069         struct netns_ipvs *ipvs;
1070
1071         EnterFunction(11);
1072
1073         /* Already marked as IPVS request or reply? */
1074         if (skb->ipvs_property)
1075                 return NF_ACCEPT;
1076
1077         /* Bad... Do not break raw sockets */
1078         if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1079                      af == AF_INET)) {
1080                 struct sock *sk = skb->sk;
1081                 struct inet_sock *inet = inet_sk(skb->sk);
1082
1083                 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1084                         return NF_ACCEPT;
1085         }
1086
1087         if (unlikely(!skb_dst(skb)))
1088                 return NF_ACCEPT;
1089
1090         net = skb_net(skb);
1091         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1092 #ifdef CONFIG_IP_VS_IPV6
1093         if (af == AF_INET6) {
1094                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1095                         int related;
1096                         int verdict = ip_vs_out_icmp_v6(skb, &related,
1097                                                         hooknum);
1098
1099                         if (related)
1100                                 return verdict;
1101                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1102                 }
1103         } else
1104 #endif
1105                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1106                         int related;
1107                         int verdict = ip_vs_out_icmp(skb, &related, hooknum);
1108
1109                         if (related)
1110                                 return verdict;
1111                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1112                 }
1113
1114         pd = ip_vs_proto_data_get(net, iph.protocol);
1115         if (unlikely(!pd))
1116                 return NF_ACCEPT;
1117         pp = pd->pp;
1118
1119         /* reassemble IP fragments */
1120 #ifdef CONFIG_IP_VS_IPV6
1121         if (af == AF_INET6) {
1122                 if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1123                         if (ip_vs_gather_frags_v6(skb,
1124                                                   ip_vs_defrag_user(hooknum)))
1125                                 return NF_STOLEN;
1126                 }
1127
1128                 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1129         } else
1130 #endif
1131                 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1132                              !pp->dont_defrag)) {
1133                         if (ip_vs_gather_frags(skb,
1134                                                ip_vs_defrag_user(hooknum)))
1135                                 return NF_STOLEN;
1136
1137                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1138                 }
1139
1140         /*
1141          * Check if the packet belongs to an existing entry
1142          */
1143         cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
1144         ipvs = net_ipvs(net);
1145
1146         if (likely(cp))
1147                 return handle_response(af, skb, pd, cp, iph.len);
1148         if (ipvs->sysctl_nat_icmp_send &&
1149             (pp->protocol == IPPROTO_TCP ||
1150              pp->protocol == IPPROTO_UDP ||
1151              pp->protocol == IPPROTO_SCTP)) {
1152                 __be16 _ports[2], *pptr;
1153
1154                 pptr = skb_header_pointer(skb, iph.len,
1155                                           sizeof(_ports), _ports);
1156                 if (pptr == NULL)
1157                         return NF_ACCEPT;       /* Not for me */
1158                 if (ip_vs_lookup_real_service(net, af, iph.protocol,
1159                                               &iph.saddr,
1160                                               pptr[0])) {
1161                         /*
1162                          * Notify the real server: there is no
1163                          * existing entry if it is not RST
1164                          * packet or not TCP packet.
1165                          */
1166                         if ((iph.protocol != IPPROTO_TCP &&
1167                              iph.protocol != IPPROTO_SCTP)
1168                              || ((iph.protocol == IPPROTO_TCP
1169                                   && !is_tcp_reset(skb, iph.len))
1170                                  || (iph.protocol == IPPROTO_SCTP
1171                                         && !is_sctp_abort(skb,
1172                                                 iph.len)))) {
1173 #ifdef CONFIG_IP_VS_IPV6
1174                                 if (af == AF_INET6) {
1175                                         struct net *net =
1176                                                 dev_net(skb_dst(skb)->dev);
1177
1178                                         if (!skb->dev)
1179                                                 skb->dev = net->loopback_dev;
1180                                         icmpv6_send(skb,
1181                                                     ICMPV6_DEST_UNREACH,
1182                                                     ICMPV6_PORT_UNREACH,
1183                                                     0);
1184                                 } else
1185 #endif
1186                                         icmp_send(skb,
1187                                                   ICMP_DEST_UNREACH,
1188                                                   ICMP_PORT_UNREACH, 0);
1189                                 return NF_DROP;
1190                         }
1191                 }
1192         }
1193         IP_VS_DBG_PKT(12, af, pp, skb, 0,
1194                       "ip_vs_out: packet continues traversal as normal");
1195         return NF_ACCEPT;
1196 }
1197
1198 /*
1199  *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1200  *      used only for VS/NAT.
1201  *      Check if packet is reply for established ip_vs_conn.
1202  */
1203 static unsigned int
1204 ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb,
1205              const struct net_device *in, const struct net_device *out,
1206              int (*okfn)(struct sk_buff *))
1207 {
1208         return ip_vs_out(hooknum, skb, AF_INET);
1209 }
1210
1211 /*
1212  *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1213  *      Check if packet is reply for established ip_vs_conn.
1214  */
1215 static unsigned int
1216 ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
1217                    const struct net_device *in, const struct net_device *out,
1218                    int (*okfn)(struct sk_buff *))
1219 {
1220         unsigned int verdict;
1221
1222         /* Disable BH in LOCAL_OUT until all places are fixed */
1223         local_bh_disable();
1224         verdict = ip_vs_out(hooknum, skb, AF_INET);
1225         local_bh_enable();
1226         return verdict;
1227 }
1228
1229 #ifdef CONFIG_IP_VS_IPV6
1230
1231 /*
1232  *      It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1233  *      used only for VS/NAT.
1234  *      Check if packet is reply for established ip_vs_conn.
1235  */
1236 static unsigned int
1237 ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb,
1238              const struct net_device *in, const struct net_device *out,
1239              int (*okfn)(struct sk_buff *))
1240 {
1241         return ip_vs_out(hooknum, skb, AF_INET6);
1242 }
1243
1244 /*
1245  *      It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1246  *      Check if packet is reply for established ip_vs_conn.
1247  */
1248 static unsigned int
1249 ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1250                    const struct net_device *in, const struct net_device *out,
1251                    int (*okfn)(struct sk_buff *))
1252 {
1253         unsigned int verdict;
1254
1255         /* Disable BH in LOCAL_OUT until all places are fixed */
1256         local_bh_disable();
1257         verdict = ip_vs_out(hooknum, skb, AF_INET6);
1258         local_bh_enable();
1259         return verdict;
1260 }
1261
1262 #endif
1263
1264 /*
1265  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1266  *      Find any that might be relevant, check against existing connections,
1267  *      forward to the right destination host if relevant.
1268  *      Currently handles error types - unreachable, quench, ttl exceeded.
1269  */
1270 static int
1271 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1272 {
1273         struct net *net = NULL;
1274         struct iphdr *iph;
1275         struct icmphdr  _icmph, *ic;
1276         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1277         struct ip_vs_iphdr ciph;
1278         struct ip_vs_conn *cp;
1279         struct ip_vs_protocol *pp;
1280         struct ip_vs_proto_data *pd;
1281         unsigned int offset, ihl, verdict;
1282
1283         *related = 1;
1284
1285         /* reassemble IP fragments */
1286         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1287                 if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
1288                         return NF_STOLEN;
1289         }
1290
1291         iph = ip_hdr(skb);
1292         offset = ihl = iph->ihl * 4;
1293         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1294         if (ic == NULL)
1295                 return NF_DROP;
1296
1297         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1298                   ic->type, ntohs(icmp_id(ic)),
1299                   &iph->saddr, &iph->daddr);
1300
1301         /*
1302          * Work through seeing if this is for us.
1303          * These checks are supposed to be in an order that means easy
1304          * things are checked first to speed up processing.... however
1305          * this means that some packets will manage to get a long way
1306          * down this stack and then be rejected, but that's life.
1307          */
1308         if ((ic->type != ICMP_DEST_UNREACH) &&
1309             (ic->type != ICMP_SOURCE_QUENCH) &&
1310             (ic->type != ICMP_TIME_EXCEEDED)) {
1311                 *related = 0;
1312                 return NF_ACCEPT;
1313         }
1314
1315         /* Now find the contained IP header */
1316         offset += sizeof(_icmph);
1317         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1318         if (cih == NULL)
1319                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1320
1321         net = skb_net(skb);
1322         pd = ip_vs_proto_data_get(net, cih->protocol);
1323         if (!pd)
1324                 return NF_ACCEPT;
1325         pp = pd->pp;
1326
1327         /* Is the embedded protocol header present? */
1328         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1329                      pp->dont_defrag))
1330                 return NF_ACCEPT;
1331
1332         IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1333                       "Checking incoming ICMP for");
1334
1335         offset += cih->ihl * 4;
1336
1337         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1338         /* The embedded headers contain source and dest in reverse order */
1339         cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
1340         if (!cp)
1341                 return NF_ACCEPT;
1342
1343         verdict = NF_DROP;
1344
1345         /* Ensure the checksum is correct */
1346         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1347                 /* Failed checksum! */
1348                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1349                           &iph->saddr);
1350                 goto out;
1351         }
1352
1353         /* do the statistics and put it back */
1354         ip_vs_in_stats(cp, skb);
1355         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1356                 offset += 2 * sizeof(__u16);
1357         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1358         /* LOCALNODE from FORWARD hook is not supported */
1359         if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD &&
1360             skb_rtable(skb)->rt_flags & RTCF_LOCAL) {
1361                 IP_VS_DBG(1, "%s(): "
1362                           "local delivery to %pI4 but in FORWARD\n",
1363                           __func__, &skb_rtable(skb)->rt_dst);
1364                 verdict = NF_DROP;
1365         }
1366
1367   out:
1368         __ip_vs_conn_put(cp);
1369
1370         return verdict;
1371 }
1372
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_in_icmp(): incoming ICMPv6 errors.
 *
 *	Only destination unreachable, packet too big and time exceeded
 *	are examined; any other type clears *related and is accepted.
 *	When the embedded header matches a connection, the message is
 *	sent on with ip_vs_icmp_xmit_v6() and the connection released.
 */
static int
ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
{
	struct net *net;
	struct ipv6hdr *outer;
	struct icmp6hdr icmp_buf, *icmph;
	struct ipv6hdr inner_buf, *inner;	/* IPv6 header carried inside
						   the ICMPv6 message */
	struct ip_vs_iphdr inner_info;
	struct ip_vs_conn *conn;
	struct ip_vs_protocol *proto;
	struct ip_vs_proto_data *pd;
	unsigned int off, verdict;
	struct rt6_info *rt;

	*related = 1;

	/* reassemble IP fragments */
	if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT &&
	    ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
		return NF_STOLEN;

	outer = ipv6_hdr(skb);
	off = sizeof(struct ipv6hdr);
	icmph = skb_header_pointer(skb, off, sizeof(icmp_buf), &icmp_buf);
	if (!icmph)
		return NF_DROP;

	IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
		  icmph->icmp6_type, ntohs(icmpv6_id(icmph)),
		  &outer->saddr, &outer->daddr);

	/*
	 * Cheap tests come first.  Some packets will still travel a long
	 * way down this function before being let go, but that's life.
	 */
	switch (icmph->icmp6_type) {
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_PKT_TOOBIG:
	case ICMPV6_TIME_EXCEED:
		break;
	default:
		*related = 0;
		return NF_ACCEPT;
	}

	/* Now find the contained IP header */
	off += sizeof(icmp_buf);
	inner = skb_header_pointer(skb, off, sizeof(inner_buf), &inner_buf);
	if (!inner)
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	net = skb_net(skb);
	pd = ip_vs_proto_data_get(net, inner->nexthdr);
	if (!pd)
		return NF_ACCEPT;
	proto = pd->pp;

	/* Is the embedded protocol header present? */
	/* TODO: we don't support fragmentation at the moment anyways */
	if (unlikely(inner->nexthdr == IPPROTO_FRAGMENT && proto->dont_defrag))
		return NF_ACCEPT;

	IP_VS_DBG_PKT(11, AF_INET6, proto, skb, off,
		      "Checking incoming ICMPv6 for");

	off += sizeof(struct ipv6hdr);

	ip_vs_fill_iphdr(AF_INET6, inner, &inner_info);
	/* The embedded headers contain source and dest in reverse order */
	conn = proto->conn_in_get(AF_INET6, skb, &inner_info, off, 1);
	if (!conn)
		return NF_ACCEPT;

	/* do the statistics and put it back */
	ip_vs_in_stats(conn, skb);

	/* Extend the offset over the two 16-bit ports of the embedded header */
	switch (inner->nexthdr) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
		off += 2 * sizeof(__u16);
		break;
	default:
		break;
	}

	verdict = ip_vs_icmp_xmit_v6(skb, conn, proto, off);

	/* LOCALNODE from FORWARD hook is not supported */
	if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD) {
		rt = (struct rt6_info *) skb_dst(skb);
		if (rt && rt->rt6i_dev &&
		    (rt->rt6i_dev->flags & IFF_LOOPBACK)) {
			IP_VS_DBG(1, "%s(): "
				  "local delivery to %pI6 but in FORWARD\n",
				  __func__, &rt->rt6i_dst);
			verdict = NF_DROP;
		}
	}

	__ip_vs_conn_put(conn);

	return verdict;
}
#endif
1472
1473
1474 /*
1475  *      Check if it's for virtual services, look it up,
1476  *      and send it on its way...
1477  */
1478 static unsigned int
1479 ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1480 {
1481         struct net *net;
1482         struct ip_vs_iphdr iph;
1483         struct ip_vs_protocol *pp;
1484         struct ip_vs_proto_data *pd;
1485         struct ip_vs_conn *cp;
1486         int ret, restart, pkts;
1487         struct netns_ipvs *ipvs;
1488
1489         /* Already marked as IPVS request or reply? */
1490         if (skb->ipvs_property)
1491                 return NF_ACCEPT;
1492
1493         /*
1494          *      Big tappo:
1495          *      - remote client: only PACKET_HOST
1496          *      - route: used for struct net when skb->dev is unset
1497          */
1498         if (unlikely((skb->pkt_type != PACKET_HOST &&
1499                       hooknum != NF_INET_LOCAL_OUT) ||
1500                      !skb_dst(skb))) {
1501                 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1502                 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
1503                               " ignored in hook %u\n",
1504                               skb->pkt_type, iph.protocol,
1505                               IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1506                 return NF_ACCEPT;
1507         }
1508         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1509
1510         /* Bad... Do not break raw sockets */
1511         if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
1512                      af == AF_INET)) {
1513                 struct sock *sk = skb->sk;
1514                 struct inet_sock *inet = inet_sk(skb->sk);
1515
1516                 if (inet && sk->sk_family == PF_INET && inet->nodefrag)
1517                         return NF_ACCEPT;
1518         }
1519
1520 #ifdef CONFIG_IP_VS_IPV6
1521         if (af == AF_INET6) {
1522                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1523                         int related;
1524                         int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1525
1526                         if (related)
1527                                 return verdict;
1528                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1529                 }
1530         } else
1531 #endif
1532                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1533                         int related;
1534                         int verdict = ip_vs_in_icmp(skb, &related, hooknum);
1535
1536                         if (related)
1537                                 return verdict;
1538                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1539                 }
1540
1541         net = skb_net(skb);
1542         /* Protocol supported? */
1543         pd = ip_vs_proto_data_get(net, iph.protocol);
1544         if (unlikely(!pd))
1545                 return NF_ACCEPT;
1546         pp = pd->pp;
1547         /*
1548          * Check if the packet belongs to an existing connection entry
1549          */
1550         cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
1551
1552         if (unlikely(!cp)) {
1553                 int v;
1554
1555                 if (!pp->conn_schedule(af, skb, pd, &v, &cp))
1556                         return v;
1557         }
1558
1559         if (unlikely(!cp)) {
1560                 /* sorry, all this trouble for a no-hit :) */
1561                 IP_VS_DBG_PKT(12, af, pp, skb, 0,
1562                               "ip_vs_in: packet continues traversal as normal");
1563                 return NF_ACCEPT;
1564         }
1565
1566         IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1567         net = skb_net(skb);
1568         ipvs = net_ipvs(net);
1569         /* Check the server status */
1570         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1571                 /* the destination server is not available */
1572
1573                 if (ipvs->sysctl_expire_nodest_conn) {
1574                         /* try to expire the connection immediately */
1575                         ip_vs_conn_expire_now(cp);
1576                 }
1577                 /* don't restart its timer, and silently
1578                    drop the packet. */
1579                 __ip_vs_conn_put(cp);
1580                 return NF_DROP;
1581         }
1582
1583         ip_vs_in_stats(cp, skb);
1584         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
1585         if (cp->packet_xmit)
1586                 ret = cp->packet_xmit(skb, cp, pp);
1587                 /* do not touch skb anymore */
1588         else {
1589                 IP_VS_DBG_RL("warning: packet_xmit is null");
1590                 ret = NF_ACCEPT;
1591         }
1592
1593         /* Increase its packet counter and check if it is needed
1594          * to be synchronized
1595          *
1596          * Sync connection if it is about to close to
1597          * encorage the standby servers to update the connections timeout
1598          *
1599          * For ONE_PKT let ip_vs_sync_conn() do the filter work.
1600          */
1601
1602         if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
1603                 pkts = ipvs->sysctl_sync_threshold[0];
1604         else
1605                 pkts = atomic_add_return(1, &cp->in_pkts);
1606
1607         if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
1608             cp->protocol == IPPROTO_SCTP) {
1609                 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1610                         (pkts % ipvs->sysctl_sync_threshold[1]
1611                          == ipvs->sysctl_sync_threshold[0])) ||
1612                                 (cp->old_state != cp->state &&
1613                                  ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1614                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1615                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1616                         ip_vs_sync_conn(net, cp);
1617                         goto out;
1618                 }
1619         }
1620
1621         /* Keep this block last: TCP and others with pp->num_states <= 1 */
1622         else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
1623             (((cp->protocol != IPPROTO_TCP ||
1624                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1625               (pkts % ipvs->sysctl_sync_threshold[1]
1626                == ipvs->sysctl_sync_threshold[0])) ||
1627              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1628               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1629                (cp->state == IP_VS_TCP_S_CLOSE) ||
1630                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1631                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1632                 ip_vs_sync_conn(net, cp);
1633 out:
1634         cp->old_state = cp->state;
1635
1636         ip_vs_conn_put(cp);
1637         return ret;
1638 }
1639
1640 /*
1641  *      AF_INET handler in NF_INET_LOCAL_IN chain
1642  *      Schedule and forward packets from remote clients
1643  */
1644 static unsigned int
1645 ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb,
1646                       const struct net_device *in,
1647                       const struct net_device *out,
1648                       int (*okfn)(struct sk_buff *))
1649 {
1650         return ip_vs_in(hooknum, skb, AF_INET);
1651 }
1652
1653 /*
1654  *      AF_INET handler in NF_INET_LOCAL_OUT chain
1655  *      Schedule and forward packets from local clients
1656  */
1657 static unsigned int
1658 ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
1659                      const struct net_device *in, const struct net_device *out,
1660                      int (*okfn)(struct sk_buff *))
1661 {
1662         unsigned int verdict;
1663
1664         /* Disable BH in LOCAL_OUT until all places are fixed */
1665         local_bh_disable();
1666         verdict = ip_vs_in(hooknum, skb, AF_INET);
1667         local_bh_enable();
1668         return verdict;
1669 }
1670
1671 #ifdef CONFIG_IP_VS_IPV6
1672
1673 /*
1674  *      AF_INET6 handler in NF_INET_LOCAL_IN chain
1675  *      Schedule and forward packets from remote clients
1676  */
1677 static unsigned int
1678 ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb,
1679                       const struct net_device *in,
1680                       const struct net_device *out,
1681                       int (*okfn)(struct sk_buff *))
1682 {
1683         return ip_vs_in(hooknum, skb, AF_INET6);
1684 }
1685
1686 /*
1687  *      AF_INET6 handler in NF_INET_LOCAL_OUT chain
1688  *      Schedule and forward packets from local clients
1689  */
1690 static unsigned int
1691 ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
1692                      const struct net_device *in, const struct net_device *out,
1693                      int (*okfn)(struct sk_buff *))
1694 {
1695         unsigned int verdict;
1696
1697         /* Disable BH in LOCAL_OUT until all places are fixed */
1698         local_bh_disable();
1699         verdict = ip_vs_in(hooknum, skb, AF_INET6);
1700         local_bh_enable();
1701         return verdict;
1702 }
1703
1704 #endif
1705
1706
1707 /*
1708  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1709  *      related packets destined for 0.0.0.0/0.
1710  *      When fwmark-based virtual service is used, such as transparent
1711  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1712  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1713  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1714  *      and send them to ip_vs_in_icmp.
1715  */
1716 static unsigned int
1717 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1718                    const struct net_device *in, const struct net_device *out,
1719                    int (*okfn)(struct sk_buff *))
1720 {
1721         int r;
1722
1723         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1724                 return NF_ACCEPT;
1725
1726         return ip_vs_in_icmp(skb, &r, hooknum);
1727 }
1728
1729 #ifdef CONFIG_IP_VS_IPV6
1730 static unsigned int
1731 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1732                       const struct net_device *in, const struct net_device *out,
1733                       int (*okfn)(struct sk_buff *))
1734 {
1735         int r;
1736
1737         if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1738                 return NF_ACCEPT;
1739
1740         return ip_vs_in_icmp_v6(skb, &r, hooknum);
1741 }
1742 #endif
1743
1744
1745 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1746         /* After packet filtering, change source only for VS/NAT */
1747         {
1748                 .hook           = ip_vs_reply4,
1749                 .owner          = THIS_MODULE,
1750                 .pf             = PF_INET,
1751                 .hooknum        = NF_INET_LOCAL_IN,
1752                 .priority       = 99,
1753         },
1754         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1755          * or VS/NAT(change destination), so that filtering rules can be
1756          * applied to IPVS. */
1757         {
1758                 .hook           = ip_vs_remote_request4,
1759                 .owner          = THIS_MODULE,
1760                 .pf             = PF_INET,
1761                 .hooknum        = NF_INET_LOCAL_IN,
1762                 .priority       = 101,
1763         },
1764         /* Before ip_vs_in, change source only for VS/NAT */
1765         {
1766                 .hook           = ip_vs_local_reply4,
1767                 .owner          = THIS_MODULE,
1768                 .pf             = PF_INET,
1769                 .hooknum        = NF_INET_LOCAL_OUT,
1770                 .priority       = -99,
1771         },
1772         /* After mangle, schedule and forward local requests */
1773         {
1774                 .hook           = ip_vs_local_request4,
1775                 .owner          = THIS_MODULE,
1776                 .pf             = PF_INET,
1777                 .hooknum        = NF_INET_LOCAL_OUT,
1778                 .priority       = -98,
1779         },
1780         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1781          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1782         {
1783                 .hook           = ip_vs_forward_icmp,
1784                 .owner          = THIS_MODULE,
1785                 .pf             = PF_INET,
1786                 .hooknum        = NF_INET_FORWARD,
1787                 .priority       = 99,
1788         },
1789         /* After packet filtering, change source only for VS/NAT */
1790         {
1791                 .hook           = ip_vs_reply4,
1792                 .owner          = THIS_MODULE,
1793                 .pf             = PF_INET,
1794                 .hooknum        = NF_INET_FORWARD,
1795                 .priority       = 100,
1796         },
1797 #ifdef CONFIG_IP_VS_IPV6
1798         /* After packet filtering, change source only for VS/NAT */
1799         {
1800                 .hook           = ip_vs_reply6,
1801                 .owner          = THIS_MODULE,
1802                 .pf             = PF_INET6,
1803                 .hooknum        = NF_INET_LOCAL_IN,
1804                 .priority       = 99,
1805         },
1806         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1807          * or VS/NAT(change destination), so that filtering rules can be
1808          * applied to IPVS. */
1809         {
1810                 .hook           = ip_vs_remote_request6,
1811                 .owner          = THIS_MODULE,
1812                 .pf             = PF_INET6,
1813                 .hooknum        = NF_INET_LOCAL_IN,
1814                 .priority       = 101,
1815         },
1816         /* Before ip_vs_in, change source only for VS/NAT */
1817         {
1818                 .hook           = ip_vs_local_reply6,
1819                 .owner          = THIS_MODULE,
1820                 .pf             = PF_INET,
1821                 .hooknum        = NF_INET_LOCAL_OUT,
1822                 .priority       = -99,
1823         },
1824         /* After mangle, schedule and forward local requests */
1825         {
1826                 .hook           = ip_vs_local_request6,
1827                 .owner          = THIS_MODULE,
1828                 .pf             = PF_INET6,
1829                 .hooknum        = NF_INET_LOCAL_OUT,
1830                 .priority       = -98,
1831         },
1832         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1833          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1834         {
1835                 .hook           = ip_vs_forward_icmp_v6,
1836                 .owner          = THIS_MODULE,
1837                 .pf             = PF_INET6,
1838                 .hooknum        = NF_INET_FORWARD,
1839                 .priority       = 99,
1840         },
1841         /* After packet filtering, change source only for VS/NAT */
1842         {
1843                 .hook           = ip_vs_reply6,
1844                 .owner          = THIS_MODULE,
1845                 .pf             = PF_INET6,
1846                 .hooknum        = NF_INET_FORWARD,
1847                 .priority       = 100,
1848         },
1849 #endif
1850 };
1851 /*
1852  *      Initialize IP Virtual Server netns mem.
1853  */
1854 static int __net_init __ip_vs_init(struct net *net)
1855 {
1856         struct netns_ipvs *ipvs;
1857
1858         ipvs = net_generic(net, ip_vs_net_id);
1859         if (ipvs == NULL) {
1860                 pr_err("%s(): no memory.\n", __func__);
1861                 return -ENOMEM;
1862         }
1863         ipvs->net = net;
1864         /* Counters used for creating unique names */
1865         ipvs->gen = atomic_read(&ipvs_netns_cnt);
1866         atomic_inc(&ipvs_netns_cnt);
1867         net->ipvs = ipvs;
1868         printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
1869                          sizeof(struct netns_ipvs), ipvs->gen);
1870         return 0;
1871 }
1872
1873 static void __net_exit __ip_vs_cleanup(struct net *net)
1874 {
1875         IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen);
1876 }
1877
1878 static struct pernet_operations ipvs_core_ops = {
1879         .init = __ip_vs_init,
1880         .exit = __ip_vs_cleanup,
1881         .id   = &ip_vs_net_id,
1882         .size = sizeof(struct netns_ipvs),
1883 };
1884
1885 /*
1886  *      Initialize IP Virtual Server
1887  */
1888 static int __init ip_vs_init(void)
1889 {
1890         int ret;
1891
1892         ret = register_pernet_subsys(&ipvs_core_ops);   /* Alloc ip_vs struct */
1893         if (ret < 0)
1894                 return ret;
1895
1896         ip_vs_estimator_init();
1897         ret = ip_vs_control_init();
1898         if (ret < 0) {
1899                 pr_err("can't setup control.\n");
1900                 goto cleanup_estimator;
1901         }
1902
1903         ip_vs_protocol_init();
1904
1905         ret = ip_vs_app_init();
1906         if (ret < 0) {
1907                 pr_err("can't setup application helper.\n");
1908                 goto cleanup_protocol;
1909         }
1910
1911         ret = ip_vs_conn_init();
1912         if (ret < 0) {
1913                 pr_err("can't setup connection table.\n");
1914                 goto cleanup_app;
1915         }
1916
1917         ret = ip_vs_sync_init();
1918         if (ret < 0) {
1919                 pr_err("can't setup sync data.\n");
1920                 goto cleanup_conn;
1921         }
1922
1923         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1924         if (ret < 0) {
1925                 pr_err("can't register hooks.\n");
1926                 goto cleanup_sync;
1927         }
1928
1929         pr_info("ipvs loaded.\n");
1930         return ret;
1931
1932 cleanup_sync:
1933         ip_vs_sync_cleanup();
1934   cleanup_conn:
1935         ip_vs_conn_cleanup();
1936   cleanup_app:
1937         ip_vs_app_cleanup();
1938   cleanup_protocol:
1939         ip_vs_protocol_cleanup();
1940         ip_vs_control_cleanup();
1941   cleanup_estimator:
1942         ip_vs_estimator_cleanup();
1943         unregister_pernet_subsys(&ipvs_core_ops);       /* free ip_vs struct */
1944         return ret;
1945 }
1946
1947 static void __exit ip_vs_cleanup(void)
1948 {
1949         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1950         ip_vs_sync_cleanup();
1951         ip_vs_conn_cleanup();
1952         ip_vs_app_cleanup();
1953         ip_vs_protocol_cleanup();
1954         ip_vs_control_cleanup();
1955         ip_vs_estimator_cleanup();
1956         unregister_pernet_subsys(&ipvs_core_ops);       /* free ip_vs struct */
1957         pr_info("ipvs unloaded.\n");
1958 }
1959
1960 module_init(ip_vs_init);
1961 module_exit(ip_vs_cleanup);
1962 MODULE_LICENSE("GPL");