ipvs: More reliable synchronization on connection close
[linux-2.6.git] / net / ipv4 / ipvs / ip_vs_core.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *      Paul `Rusty' Russell            properly handle non-linear skbs
23  *      Harald Welte                    don't use nfcache
24  *
25  */
26
27 #include <linux/module.h>
28 #include <linux/kernel.h>
29 #include <linux/ip.h>
30 #include <linux/tcp.h>
31 #include <linux/icmp.h>
32
33 #include <net/ip.h>
34 #include <net/tcp.h>
35 #include <net/udp.h>
36 #include <net/icmp.h>                   /* for icmp_send */
37 #include <net/route.h>
38
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv4.h>
41
42 #include <net/ip_vs.h>
43
44
/* Scheduler registration interface */
EXPORT_SYMBOL(register_ip_vs_scheduler);
EXPORT_SYMBOL(unregister_ip_vs_scheduler);

/* Connection table interface */
EXPORT_SYMBOL(ip_vs_conn_new);
EXPORT_SYMBOL(ip_vs_conn_in_get);
EXPORT_SYMBOL(ip_vs_conn_out_get);
EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_PROTO_TCP
EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
#endif

/* Miscellaneous helpers */
EXPORT_SYMBOL(ip_vs_skb_replace);
EXPORT_SYMBOL(ip_vs_proto_name);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
59
60
/* Echo identifier of an ICMP header; this is the ID used in ICMP lookups */
#define icmp_id(icmph)          ((icmph)->un.echo.id)
63
/*
 *      Return a printable name for an IP protocol number.
 *
 *      IP, UDP, TCP and ICMP map to constant strings.  Any other value is
 *      formatted as "IP_<number>" into a function-local static buffer: the
 *      returned string is overwritten by the next call that takes this
 *      path, so callers must not keep the pointer around.
 */
const char *ip_vs_proto_name(unsigned proto)
{
        static char buf[20];

        switch (proto) {
        case IPPROTO_IP:
                return "IP";
        case IPPROTO_UDP:
                return "UDP";
        case IPPROTO_TCP:
                return "TCP";
        case IPPROTO_ICMP:
                return "ICMP";
        default:
                /*
                 * proto is unsigned, so it must be printed with %u;
                 * the write is bounded by the size of buf.
                 */
                snprintf(buf, sizeof(buf), "IP_%u", proto);
                return buf;
        }
}
82
83 void ip_vs_init_hash_table(struct list_head *table, int rows)
84 {
85         while (--rows >= 0)
86                 INIT_LIST_HEAD(&table[rows]);
87 }
88
89 static inline void
90 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
91 {
92         struct ip_vs_dest *dest = cp->dest;
93         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
94                 spin_lock(&dest->stats.lock);
95                 dest->stats.inpkts++;
96                 dest->stats.inbytes += skb->len;
97                 spin_unlock(&dest->stats.lock);
98
99                 spin_lock(&dest->svc->stats.lock);
100                 dest->svc->stats.inpkts++;
101                 dest->svc->stats.inbytes += skb->len;
102                 spin_unlock(&dest->svc->stats.lock);
103
104                 spin_lock(&ip_vs_stats.lock);
105                 ip_vs_stats.inpkts++;
106                 ip_vs_stats.inbytes += skb->len;
107                 spin_unlock(&ip_vs_stats.lock);
108         }
109 }
110
111
112 static inline void
113 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
114 {
115         struct ip_vs_dest *dest = cp->dest;
116         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
117                 spin_lock(&dest->stats.lock);
118                 dest->stats.outpkts++;
119                 dest->stats.outbytes += skb->len;
120                 spin_unlock(&dest->stats.lock);
121
122                 spin_lock(&dest->svc->stats.lock);
123                 dest->svc->stats.outpkts++;
124                 dest->svc->stats.outbytes += skb->len;
125                 spin_unlock(&dest->svc->stats.lock);
126
127                 spin_lock(&ip_vs_stats.lock);
128                 ip_vs_stats.outpkts++;
129                 ip_vs_stats.outbytes += skb->len;
130                 spin_unlock(&ip_vs_stats.lock);
131         }
132 }
133
134
135 static inline void
136 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
137 {
138         spin_lock(&cp->dest->stats.lock);
139         cp->dest->stats.conns++;
140         spin_unlock(&cp->dest->stats.lock);
141
142         spin_lock(&svc->stats.lock);
143         svc->stats.conns++;
144         spin_unlock(&svc->stats.lock);
145
146         spin_lock(&ip_vs_stats.lock);
147         ip_vs_stats.conns++;
148         spin_unlock(&ip_vs_stats.lock);
149 }
150
151
152 static inline int
153 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
154                 const struct sk_buff *skb,
155                 struct ip_vs_protocol *pp)
156 {
157         if (unlikely(!pp->state_transition))
158                 return 0;
159         return pp->state_transition(cp, direction, skb, pp);
160 }
161
162
163 /*
164  *  IPVS persistent scheduling function
165  *  It creates a connection entry according to its template if exists,
166  *  or selects a server and creates a connection entry plus a template.
167  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
168  *  Protocols supported: TCP, UDP
169  */
170 static struct ip_vs_conn *
171 ip_vs_sched_persist(struct ip_vs_service *svc,
172                     const struct sk_buff *skb,
173                     __be16 ports[2])
174 {
175         struct ip_vs_conn *cp = NULL;
176         struct iphdr *iph = ip_hdr(skb);
177         struct ip_vs_dest *dest;
178         struct ip_vs_conn *ct;
179         __be16  dport;   /* destination port to forward */
180         __be32  snet;    /* source network of the client, after masking */
181
182         /* Mask saddr with the netmask to adjust template granularity */
183         snet = iph->saddr & svc->netmask;
184
185         IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
186                   "mnet %u.%u.%u.%u\n",
187                   NIPQUAD(iph->saddr), ntohs(ports[0]),
188                   NIPQUAD(iph->daddr), ntohs(ports[1]),
189                   NIPQUAD(snet));
190
191         /*
192          * As far as we know, FTP is a very complicated network protocol, and
193          * it uses control connection and data connections. For active FTP,
194          * FTP server initialize data connection to the client, its source port
195          * is often 20. For passive FTP, FTP server tells the clients the port
196          * that it passively listens to,  and the client issues the data
197          * connection. In the tunneling or direct routing mode, the load
198          * balancer is on the client-to-server half of connection, the port
199          * number is unknown to the load balancer. So, a conn template like
200          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
201          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
202          * is created for other persistent services.
203          */
204         if (ports[1] == svc->port) {
205                 /* Check if a template already exists */
206                 if (svc->port != FTPPORT)
207                         ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
208                                                iph->daddr, ports[1]);
209                 else
210                         ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
211                                                iph->daddr, 0);
212
213                 if (!ct || !ip_vs_check_template(ct)) {
214                         /*
215                          * No template found or the dest of the connection
216                          * template is not available.
217                          */
218                         dest = svc->scheduler->schedule(svc, skb);
219                         if (dest == NULL) {
220                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
221                                 return NULL;
222                         }
223
224                         /*
225                          * Create a template like <protocol,caddr,0,
226                          * vaddr,vport,daddr,dport> for non-ftp service,
227                          * and <protocol,caddr,0,vaddr,0,daddr,0>
228                          * for ftp service.
229                          */
230                         if (svc->port != FTPPORT)
231                                 ct = ip_vs_conn_new(iph->protocol,
232                                                     snet, 0,
233                                                     iph->daddr,
234                                                     ports[1],
235                                                     dest->addr, dest->port,
236                                                     IP_VS_CONN_F_TEMPLATE,
237                                                     dest);
238                         else
239                                 ct = ip_vs_conn_new(iph->protocol,
240                                                     snet, 0,
241                                                     iph->daddr, 0,
242                                                     dest->addr, 0,
243                                                     IP_VS_CONN_F_TEMPLATE,
244                                                     dest);
245                         if (ct == NULL)
246                                 return NULL;
247
248                         ct->timeout = svc->timeout;
249                 } else {
250                         /* set destination with the found template */
251                         dest = ct->dest;
252                 }
253                 dport = dest->port;
254         } else {
255                 /*
256                  * Note: persistent fwmark-based services and persistent
257                  * port zero service are handled here.
258                  * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
259                  * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
260                  */
261                 if (svc->fwmark)
262                         ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0,
263                                                htonl(svc->fwmark), 0);
264                 else
265                         ct = ip_vs_ct_in_get(iph->protocol, snet, 0,
266                                                iph->daddr, 0);
267
268                 if (!ct || !ip_vs_check_template(ct)) {
269                         /*
270                          * If it is not persistent port zero, return NULL,
271                          * otherwise create a connection template.
272                          */
273                         if (svc->port)
274                                 return NULL;
275
276                         dest = svc->scheduler->schedule(svc, skb);
277                         if (dest == NULL) {
278                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
279                                 return NULL;
280                         }
281
282                         /*
283                          * Create a template according to the service
284                          */
285                         if (svc->fwmark)
286                                 ct = ip_vs_conn_new(IPPROTO_IP,
287                                                     snet, 0,
288                                                     htonl(svc->fwmark), 0,
289                                                     dest->addr, 0,
290                                                     IP_VS_CONN_F_TEMPLATE,
291                                                     dest);
292                         else
293                                 ct = ip_vs_conn_new(iph->protocol,
294                                                     snet, 0,
295                                                     iph->daddr, 0,
296                                                     dest->addr, 0,
297                                                     IP_VS_CONN_F_TEMPLATE,
298                                                     dest);
299                         if (ct == NULL)
300                                 return NULL;
301
302                         ct->timeout = svc->timeout;
303                 } else {
304                         /* set destination with the found template */
305                         dest = ct->dest;
306                 }
307                 dport = ports[1];
308         }
309
310         /*
311          *    Create a new connection according to the template
312          */
313         cp = ip_vs_conn_new(iph->protocol,
314                             iph->saddr, ports[0],
315                             iph->daddr, ports[1],
316                             dest->addr, dport,
317                             0,
318                             dest);
319         if (cp == NULL) {
320                 ip_vs_conn_put(ct);
321                 return NULL;
322         }
323
324         /*
325          *    Add its control
326          */
327         ip_vs_control_add(cp, ct);
328         ip_vs_conn_put(ct);
329
330         ip_vs_conn_stats(cp, svc);
331         return cp;
332 }
333
334
335 /*
336  *  IPVS main scheduling function
337  *  It selects a server according to the virtual service, and
338  *  creates a connection entry.
339  *  Protocols supported: TCP, UDP
340  */
341 struct ip_vs_conn *
342 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
343 {
344         struct ip_vs_conn *cp = NULL;
345         struct iphdr *iph = ip_hdr(skb);
346         struct ip_vs_dest *dest;
347         __be16 _ports[2], *pptr;
348
349         pptr = skb_header_pointer(skb, iph->ihl*4,
350                                   sizeof(_ports), _ports);
351         if (pptr == NULL)
352                 return NULL;
353
354         /*
355          *    Persistent service
356          */
357         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
358                 return ip_vs_sched_persist(svc, skb, pptr);
359
360         /*
361          *    Non-persistent service
362          */
363         if (!svc->fwmark && pptr[1] != svc->port) {
364                 if (!svc->port)
365                         IP_VS_ERR("Schedule: port zero only supported "
366                                   "in persistent services, "
367                                   "check your ipvs configuration\n");
368                 return NULL;
369         }
370
371         dest = svc->scheduler->schedule(svc, skb);
372         if (dest == NULL) {
373                 IP_VS_DBG(1, "Schedule: no dest found.\n");
374                 return NULL;
375         }
376
377         /*
378          *    Create a connection entry.
379          */
380         cp = ip_vs_conn_new(iph->protocol,
381                             iph->saddr, pptr[0],
382                             iph->daddr, pptr[1],
383                             dest->addr, dest->port?dest->port:pptr[1],
384                             0,
385                             dest);
386         if (cp == NULL)
387                 return NULL;
388
389         IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
390                   "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
391                   ip_vs_fwd_tag(cp),
392                   NIPQUAD(cp->caddr), ntohs(cp->cport),
393                   NIPQUAD(cp->vaddr), ntohs(cp->vport),
394                   NIPQUAD(cp->daddr), ntohs(cp->dport),
395                   cp->flags, atomic_read(&cp->refcnt));
396
397         ip_vs_conn_stats(cp, svc);
398         return cp;
399 }
400
401
402 /*
403  *  Pass or drop the packet.
404  *  Called by ip_vs_in, when the virtual service is available but
405  *  no destination is available for a new connection.
406  */
407 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
408                 struct ip_vs_protocol *pp)
409 {
410         __be16 _ports[2], *pptr;
411         struct iphdr *iph = ip_hdr(skb);
412
413         pptr = skb_header_pointer(skb, iph->ihl*4,
414                                   sizeof(_ports), _ports);
415         if (pptr == NULL) {
416                 ip_vs_service_put(svc);
417                 return NF_DROP;
418         }
419
420         /* if it is fwmark-based service, the cache_bypass sysctl is up
421            and the destination is RTN_UNICAST (and not local), then create
422            a cache_bypass connection entry */
423         if (sysctl_ip_vs_cache_bypass && svc->fwmark
424             && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) {
425                 int ret, cs;
426                 struct ip_vs_conn *cp;
427
428                 ip_vs_service_put(svc);
429
430                 /* create a new connection entry */
431                 IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
432                 cp = ip_vs_conn_new(iph->protocol,
433                                     iph->saddr, pptr[0],
434                                     iph->daddr, pptr[1],
435                                     0, 0,
436                                     IP_VS_CONN_F_BYPASS,
437                                     NULL);
438                 if (cp == NULL)
439                         return NF_DROP;
440
441                 /* statistics */
442                 ip_vs_in_stats(cp, skb);
443
444                 /* set state */
445                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
446
447                 /* transmit the first SYN packet */
448                 ret = cp->packet_xmit(skb, cp, pp);
449                 /* do not touch skb anymore */
450
451                 atomic_inc(&cp->in_pkts);
452                 ip_vs_conn_put(cp);
453                 return ret;
454         }
455
456         /*
457          * When the virtual ftp service is presented, packets destined
458          * for other services on the VIP may get here (except services
459          * listed in the ipvs table), pass the packets, because it is
460          * not ipvs job to decide to drop the packets.
461          */
462         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
463                 ip_vs_service_put(svc);
464                 return NF_ACCEPT;
465         }
466
467         ip_vs_service_put(svc);
468
469         /*
470          * Notify the client that the destination is unreachable, and
471          * release the socket buffer.
472          * Since it is in IP layer, the TCP socket is not actually
473          * created, the TCP RST packet cannot be sent, instead that
474          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
475          */
476         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
477         return NF_DROP;
478 }
479
480
481 /*
482  *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
483  *      chain, and is used for VS/NAT.
484  *      It detects packets for VS/NAT connections and sends the packets
485  *      immediately. This can avoid that iptable_nat mangles the packets
486  *      for VS/NAT.
487  */
488 static unsigned int ip_vs_post_routing(unsigned int hooknum,
489                                        struct sk_buff *skb,
490                                        const struct net_device *in,
491                                        const struct net_device *out,
492                                        int (*okfn)(struct sk_buff *))
493 {
494         if (!skb->ipvs_property)
495                 return NF_ACCEPT;
496         /* The packet was sent from IPVS, exit this chain */
497         return NF_STOP;
498 }
499
500 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
501 {
502         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
503 }
504
505 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
506 {
507         int err = ip_defrag(skb, user);
508
509         if (!err)
510                 ip_send_check(ip_hdr(skb));
511
512         return err;
513 }
514
515 /*
516  * Packet has been made sufficiently writable in caller
517  * - inout: 1=in->out, 0=out->in
518  */
519 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
520                     struct ip_vs_conn *cp, int inout)
521 {
522         struct iphdr *iph        = ip_hdr(skb);
523         unsigned int icmp_offset = iph->ihl*4;
524         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
525                                                       icmp_offset);
526         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
527
528         if (inout) {
529                 iph->saddr = cp->vaddr;
530                 ip_send_check(iph);
531                 ciph->daddr = cp->vaddr;
532                 ip_send_check(ciph);
533         } else {
534                 iph->daddr = cp->daddr;
535                 ip_send_check(iph);
536                 ciph->saddr = cp->daddr;
537                 ip_send_check(ciph);
538         }
539
540         /* the TCP/UDP port */
541         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
542                 __be16 *ports = (void *)ciph + ciph->ihl*4;
543
544                 if (inout)
545                         ports[1] = cp->vport;
546                 else
547                         ports[0] = cp->dport;
548         }
549
550         /* And finally the ICMP checksum */
551         icmph->checksum = 0;
552         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
553         skb->ip_summed = CHECKSUM_UNNECESSARY;
554
555         if (inout)
556                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
557                         "Forwarding altered outgoing ICMP");
558         else
559                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
560                         "Forwarding altered incoming ICMP");
561 }
562
563 /*
564  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
565  *      Find any that might be relevant, check against existing connections,
566  *      forward to the right destination host if relevant.
567  *      Currently handles error types - unreachable, quench, ttl exceeded.
568  *      (Only used in VS/NAT)
569  */
570 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
571 {
572         struct iphdr *iph;
573         struct icmphdr  _icmph, *ic;
574         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
575         struct ip_vs_conn *cp;
576         struct ip_vs_protocol *pp;
577         unsigned int offset, ihl, verdict;
578
579         *related = 1;
580
581         /* reassemble IP fragments */
582         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
583                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
584                         return NF_STOLEN;
585         }
586
587         iph = ip_hdr(skb);
588         offset = ihl = iph->ihl * 4;
589         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
590         if (ic == NULL)
591                 return NF_DROP;
592
593         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
594                   ic->type, ntohs(icmp_id(ic)),
595                   NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
596
597         /*
598          * Work through seeing if this is for us.
599          * These checks are supposed to be in an order that means easy
600          * things are checked first to speed up processing.... however
601          * this means that some packets will manage to get a long way
602          * down this stack and then be rejected, but that's life.
603          */
604         if ((ic->type != ICMP_DEST_UNREACH) &&
605             (ic->type != ICMP_SOURCE_QUENCH) &&
606             (ic->type != ICMP_TIME_EXCEEDED)) {
607                 *related = 0;
608                 return NF_ACCEPT;
609         }
610
611         /* Now find the contained IP header */
612         offset += sizeof(_icmph);
613         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
614         if (cih == NULL)
615                 return NF_ACCEPT; /* The packet looks wrong, ignore */
616
617         pp = ip_vs_proto_get(cih->protocol);
618         if (!pp)
619                 return NF_ACCEPT;
620
621         /* Is the embedded protocol header present? */
622         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
623                      pp->dont_defrag))
624                 return NF_ACCEPT;
625
626         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
627
628         offset += cih->ihl * 4;
629
630         /* The embedded headers contain source and dest in reverse order */
631         cp = pp->conn_out_get(skb, pp, cih, offset, 1);
632         if (!cp)
633                 return NF_ACCEPT;
634
635         verdict = NF_DROP;
636
637         if (IP_VS_FWD_METHOD(cp) != 0) {
638                 IP_VS_ERR("shouldn't reach here, because the box is on the "
639                           "half connection in the tun/dr module.\n");
640         }
641
642         /* Ensure the checksum is correct */
643         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
644                 /* Failed checksum! */
645                 IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
646                           NIPQUAD(iph->saddr));
647                 goto out;
648         }
649
650         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
651                 offset += 2 * sizeof(__u16);
652         if (!skb_make_writable(skb, offset))
653                 goto out;
654
655         ip_vs_nat_icmp(skb, pp, cp, 1);
656
657         /* do the statistics and put it back */
658         ip_vs_out_stats(cp, skb);
659
660         skb->ipvs_property = 1;
661         verdict = NF_ACCEPT;
662
663   out:
664         __ip_vs_conn_put(cp);
665
666         return verdict;
667 }
668
669 static inline int is_tcp_reset(const struct sk_buff *skb)
670 {
671         struct tcphdr _tcph, *th;
672
673         th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
674         if (th == NULL)
675                 return 0;
676         return th->rst;
677 }
678
679 /*
680  *      It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
681  *      Check if outgoing packet belongs to the established ip_vs_conn,
682  *      rewrite addresses of the packet and send it on its way...
683  */
684 static unsigned int
685 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
686           const struct net_device *in, const struct net_device *out,
687           int (*okfn)(struct sk_buff *))
688 {
689         struct iphdr    *iph;
690         struct ip_vs_protocol *pp;
691         struct ip_vs_conn *cp;
692         int ihl;
693
694         EnterFunction(11);
695
696         if (skb->ipvs_property)
697                 return NF_ACCEPT;
698
699         iph = ip_hdr(skb);
700         if (unlikely(iph->protocol == IPPROTO_ICMP)) {
701                 int related, verdict = ip_vs_out_icmp(skb, &related);
702
703                 if (related)
704                         return verdict;
705                 iph = ip_hdr(skb);
706         }
707
708         pp = ip_vs_proto_get(iph->protocol);
709         if (unlikely(!pp))
710                 return NF_ACCEPT;
711
712         /* reassemble IP fragments */
713         if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) &&
714                      !pp->dont_defrag)) {
715                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
716                         return NF_STOLEN;
717                 iph = ip_hdr(skb);
718         }
719
720         ihl = iph->ihl << 2;
721
722         /*
723          * Check if the packet belongs to an existing entry
724          */
725         cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
726
727         if (unlikely(!cp)) {
728                 if (sysctl_ip_vs_nat_icmp_send &&
729                     (pp->protocol == IPPROTO_TCP ||
730                      pp->protocol == IPPROTO_UDP)) {
731                         __be16 _ports[2], *pptr;
732
733                         pptr = skb_header_pointer(skb, ihl,
734                                                   sizeof(_ports), _ports);
735                         if (pptr == NULL)
736                                 return NF_ACCEPT;       /* Not for me */
737                         if (ip_vs_lookup_real_service(iph->protocol,
738                                                       iph->saddr, pptr[0])) {
739                                 /*
740                                  * Notify the real server: there is no
741                                  * existing entry if it is not RST
742                                  * packet or not TCP packet.
743                                  */
744                                 if (iph->protocol != IPPROTO_TCP
745                                     || !is_tcp_reset(skb)) {
746                                         icmp_send(skb,ICMP_DEST_UNREACH,
747                                                   ICMP_PORT_UNREACH, 0);
748                                         return NF_DROP;
749                                 }
750                         }
751                 }
752                 IP_VS_DBG_PKT(12, pp, skb, 0,
753                               "packet continues traversal as normal");
754                 return NF_ACCEPT;
755         }
756
757         IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
758
759         if (!skb_make_writable(skb, ihl))
760                 goto drop;
761
762         /* mangle the packet */
763         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
764                 goto drop;
765         ip_hdr(skb)->saddr = cp->vaddr;
766         ip_send_check(ip_hdr(skb));
767
768         /* For policy routing, packets originating from this
769          * machine itself may be routed differently to packets
770          * passing through.  We want this packet to be routed as
771          * if it came from this machine itself.  So re-compute
772          * the routing information.
773          */
774         if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
775                 goto drop;
776
777         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
778
779         ip_vs_out_stats(cp, skb);
780         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
781         ip_vs_conn_put(cp);
782
783         skb->ipvs_property = 1;
784
785         LeaveFunction(11);
786         return NF_ACCEPT;
787
788   drop:
789         ip_vs_conn_put(cp);
790         kfree_skb(skb);
791         return NF_STOLEN;
792 }
793
794
795 /*
796  *      Handle ICMP messages in the outside-to-inside direction (incoming).
797  *      Find any that might be relevant, check against existing connections,
798  *      forward to the right destination host if relevant.
799  *      Currently handles error types - unreachable, quench, ttl exceeded.
800  */
801 static int
802 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
803 {
804         struct iphdr *iph;
805         struct icmphdr  _icmph, *ic;
806         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
807         struct ip_vs_conn *cp;
808         struct ip_vs_protocol *pp;
809         unsigned int offset, ihl, verdict;
810
811         *related = 1;
812
813         /* reassemble IP fragments */
814         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
815                 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
816                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
817                         return NF_STOLEN;
818         }
819
820         iph = ip_hdr(skb);
821         offset = ihl = iph->ihl * 4;
822         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
823         if (ic == NULL)
824                 return NF_DROP;
825
826         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
827                   ic->type, ntohs(icmp_id(ic)),
828                   NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
829
830         /*
831          * Work through seeing if this is for us.
832          * These checks are supposed to be in an order that means easy
833          * things are checked first to speed up processing.... however
834          * this means that some packets will manage to get a long way
835          * down this stack and then be rejected, but that's life.
836          */
837         if ((ic->type != ICMP_DEST_UNREACH) &&
838             (ic->type != ICMP_SOURCE_QUENCH) &&
839             (ic->type != ICMP_TIME_EXCEEDED)) {
840                 *related = 0;
841                 return NF_ACCEPT;
842         }
843
844         /* Now find the contained IP header */
845         offset += sizeof(_icmph);
846         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
847         if (cih == NULL)
848                 return NF_ACCEPT; /* The packet looks wrong, ignore */
849
850         pp = ip_vs_proto_get(cih->protocol);
851         if (!pp)
852                 return NF_ACCEPT;
853
854         /* Is the embedded protocol header present? */
855         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
856                      pp->dont_defrag))
857                 return NF_ACCEPT;
858
859         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
860
861         offset += cih->ihl * 4;
862
863         /* The embedded headers contain source and dest in reverse order */
864         cp = pp->conn_in_get(skb, pp, cih, offset, 1);
865         if (!cp)
866                 return NF_ACCEPT;
867
868         verdict = NF_DROP;
869
870         /* Ensure the checksum is correct */
871         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
872                 /* Failed checksum! */
873                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
874                           NIPQUAD(iph->saddr));
875                 goto out;
876         }
877
878         /* do the statistics and put it back */
879         ip_vs_in_stats(cp, skb);
880         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
881                 offset += 2 * sizeof(__u16);
882         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
883         /* do not touch skb anymore */
884
885   out:
886         __ip_vs_conn_put(cp);
887
888         return verdict;
889 }
890
891 /*
892  *      Check if it's for virtual services, look it up,
893  *      and send it on its way...
894  */
895 static unsigned int
896 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
897          const struct net_device *in, const struct net_device *out,
898          int (*okfn)(struct sk_buff *))
899 {
900         struct iphdr    *iph;
901         struct ip_vs_protocol *pp;
902         struct ip_vs_conn *cp;
903         int ret, restart;
904         int ihl;
905
906         /*
907          *      Big tappo: only PACKET_HOST (neither loopback nor mcasts)
908          *      ... don't know why 1st test DOES NOT include 2nd (?)
909          */
910         if (unlikely(skb->pkt_type != PACKET_HOST
911                      || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
912                 IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
913                           skb->pkt_type,
914                           ip_hdr(skb)->protocol,
915                           NIPQUAD(ip_hdr(skb)->daddr));
916                 return NF_ACCEPT;
917         }
918
919         iph = ip_hdr(skb);
920         if (unlikely(iph->protocol == IPPROTO_ICMP)) {
921                 int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
922
923                 if (related)
924                         return verdict;
925                 iph = ip_hdr(skb);
926         }
927
928         /* Protocol supported? */
929         pp = ip_vs_proto_get(iph->protocol);
930         if (unlikely(!pp))
931                 return NF_ACCEPT;
932
933         ihl = iph->ihl << 2;
934
935         /*
936          * Check if the packet belongs to an existing connection entry
937          */
938         cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
939
940         if (unlikely(!cp)) {
941                 int v;
942
943                 if (!pp->conn_schedule(skb, pp, &v, &cp))
944                         return v;
945         }
946
947         if (unlikely(!cp)) {
948                 /* sorry, all this trouble for a no-hit :) */
949                 IP_VS_DBG_PKT(12, pp, skb, 0,
950                               "packet continues traversal as normal");
951                 return NF_ACCEPT;
952         }
953
954         IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
955
956         /* Check the server status */
957         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
958                 /* the destination server is not available */
959
960                 if (sysctl_ip_vs_expire_nodest_conn) {
961                         /* try to expire the connection immediately */
962                         ip_vs_conn_expire_now(cp);
963                 }
964                 /* don't restart its timer, and silently
965                    drop the packet. */
966                 __ip_vs_conn_put(cp);
967                 return NF_DROP;
968         }
969
970         ip_vs_in_stats(cp, skb);
971         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
972         if (cp->packet_xmit)
973                 ret = cp->packet_xmit(skb, cp, pp);
974                 /* do not touch skb anymore */
975         else {
976                 IP_VS_DBG_RL("warning: packet_xmit is null");
977                 ret = NF_ACCEPT;
978         }
979
980         /* Increase its packet counter and check if it is needed
981          * to be synchronized
982          *
983          * Sync connection if it is about to close to
984          * encorage the standby servers to update the connections timeout
985          */
986         atomic_inc(&cp->in_pkts);
987         if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
988             (((cp->protocol != IPPROTO_TCP ||
989                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
990               (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
991                == sysctl_ip_vs_sync_threshold[0])) ||
992              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
993               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
994                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
995                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
996                 ip_vs_sync_conn(cp);
997         cp->old_state = cp->state;
998
999         ip_vs_conn_put(cp);
1000         return ret;
1001 }
1002
1003
1004 /*
1005  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1006  *      related packets destined for 0.0.0.0/0.
1007  *      When fwmark-based virtual service is used, such as transparent
1008  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1009  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1010  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1011  *      and send them to ip_vs_in_icmp.
1012  */
1013 static unsigned int
1014 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1015                    const struct net_device *in, const struct net_device *out,
1016                    int (*okfn)(struct sk_buff *))
1017 {
1018         int r;
1019
1020         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1021                 return NF_ACCEPT;
1022
1023         return ip_vs_in_icmp(skb, &r, hooknum);
1024 }
1025
1026
1027 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1028         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1029          * or VS/NAT(change destination), so that filtering rules can be
1030          * applied to IPVS. */
1031         {
1032                 .hook           = ip_vs_in,
1033                 .owner          = THIS_MODULE,
1034                 .pf             = PF_INET,
1035                 .hooknum        = NF_INET_LOCAL_IN,
1036                 .priority       = 100,
1037         },
1038         /* After packet filtering, change source only for VS/NAT */
1039         {
1040                 .hook           = ip_vs_out,
1041                 .owner          = THIS_MODULE,
1042                 .pf             = PF_INET,
1043                 .hooknum        = NF_INET_FORWARD,
1044                 .priority       = 100,
1045         },
1046         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1047          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1048         {
1049                 .hook           = ip_vs_forward_icmp,
1050                 .owner          = THIS_MODULE,
1051                 .pf             = PF_INET,
1052                 .hooknum        = NF_INET_FORWARD,
1053                 .priority       = 99,
1054         },
1055         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1056         {
1057                 .hook           = ip_vs_post_routing,
1058                 .owner          = THIS_MODULE,
1059                 .pf             = PF_INET,
1060                 .hooknum        = NF_INET_POST_ROUTING,
1061                 .priority       = NF_IP_PRI_NAT_SRC-1,
1062         },
1063 };
1064
1065
1066 /*
1067  *      Initialize IP Virtual Server
1068  */
1069 static int __init ip_vs_init(void)
1070 {
1071         int ret;
1072
1073         ret = ip_vs_control_init();
1074         if (ret < 0) {
1075                 IP_VS_ERR("can't setup control.\n");
1076                 goto cleanup_nothing;
1077         }
1078
1079         ip_vs_protocol_init();
1080
1081         ret = ip_vs_app_init();
1082         if (ret < 0) {
1083                 IP_VS_ERR("can't setup application helper.\n");
1084                 goto cleanup_protocol;
1085         }
1086
1087         ret = ip_vs_conn_init();
1088         if (ret < 0) {
1089                 IP_VS_ERR("can't setup connection table.\n");
1090                 goto cleanup_app;
1091         }
1092
1093         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1094         if (ret < 0) {
1095                 IP_VS_ERR("can't register hooks.\n");
1096                 goto cleanup_conn;
1097         }
1098
1099         IP_VS_INFO("ipvs loaded.\n");
1100         return ret;
1101
1102   cleanup_conn:
1103         ip_vs_conn_cleanup();
1104   cleanup_app:
1105         ip_vs_app_cleanup();
1106   cleanup_protocol:
1107         ip_vs_protocol_cleanup();
1108         ip_vs_control_cleanup();
1109   cleanup_nothing:
1110         return ret;
1111 }
1112
1113 static void __exit ip_vs_cleanup(void)
1114 {
1115         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1116         ip_vs_conn_cleanup();
1117         ip_vs_app_cleanup();
1118         ip_vs_protocol_cleanup();
1119         ip_vs_control_cleanup();
1120         IP_VS_INFO("ipvs unloaded.\n");
1121 }
1122
1123 module_init(ip_vs_init);
1124 module_exit(ip_vs_cleanup);
1125 MODULE_LICENSE("GPL");