38e001076a5f6039e893833e65d32499652b92a1
[linux-3.10.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
/*
 * Tunable read by tcp_twsk_unique(): when non-zero, a TIME-WAIT bucket whose
 * last recorded timestamp is more than one second old may be taken over by a
 * new connection even when the caller passes a non-NULL twp.
 */
82 int sysctl_tcp_tw_reuse;
/* Not referenced in this part of the file; consumed elsewhere. */
83 int sysctl_tcp_low_latency;
84
85 /* Check TCP sequence numbers in ICMP packets. */
/*
 * NOTE(review): tcp_v4_err() open-codes the constant 8 in its length check
 * instead of using this macro; presumably the same quantity - confirm.
 */
86 #define ICMP_MIN_LENGTH 8
87
88 /* Socket used for sending RSTs */
/*
 * Also handed to ip_send_reply() for the ACKs built by tcp_v4_send_ack().
 * Its creation is not visible in this part of the file.
 */
89 static struct socket *tcp_socket;
90
/* Forward declaration; the definition appears further down in this file. */
91 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb);
92
/*
 * Hash-table state shared by the TCP/IPv4 code in this file; it is the
 * object passed to inet_csk_get_port(), inet_hash(), inet_unhash() and
 * inet_lookup() below.  Only the lhash_lock, lhash_users and lhash_wait
 * members are given explicit initial values here; every other member gets
 * the implicit zero initialisation of a static-storage object.
 */
93 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
94         .lhash_lock     = RW_LOCK_UNLOCKED,
95         .lhash_users    = ATOMIC_INIT(0),
96         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
97 };
98
99 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
100 {
101         return inet_csk_get_port(&tcp_hashinfo, sk, snum,
102                                  inet_csk_bind_conflict);
103 }
104
105 static void tcp_v4_hash(struct sock *sk)
106 {
107         inet_hash(&tcp_hashinfo, sk);
108 }
109
110 void tcp_unhash(struct sock *sk)
111 {
112         inet_unhash(&tcp_hashinfo, sk);
113 }
114
115 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
116 {
117         return secure_tcp_sequence_number(skb->nh.iph->daddr,
118                                           skb->nh.iph->saddr,
119                                           skb->h.th->dest,
120                                           skb->h.th->source);
121 }
122
123 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
124 {
125         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
126         struct tcp_sock *tp = tcp_sk(sk);
127
128         /* With PAWS, it is safe from the viewpoint
129            of data integrity. Even without PAWS it is safe provided sequence
130            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
131
132            Actually, the idea is close to VJ's one, only timestamp cache is
133            held not per host, but per port pair and TW bucket is used as state
134            holder.
135
136            If TW bucket has been already destroyed we fall back to VJ's scheme
137            and use initial timestamp retrieved from peer table.
138          */
139         if (tcptw->tw_ts_recent_stamp &&
140             (twp == NULL || (sysctl_tcp_tw_reuse &&
141                              xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) {
142                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
143                 if (tp->write_seq == 0)
144                         tp->write_seq = 1;
145                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
146                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
147                 sock_hold(sktw);
148                 return 1;
149         }
150
151         return 0;
152 }
153
154 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
155
156 /* This will initiate an outgoing connection. */
/*
 * tcp_v4_connect - start an outgoing IPv4 TCP connection on @sk.
 * @sk:       socket to connect
 * @uaddr:    destination address, read as a struct sockaddr_in
 * @addr_len: size of the buffer behind @uaddr, in bytes
 *
 * Returns 0 once tcp_connect() has succeeded.  Otherwise returns
 *   -EINVAL        if @addr_len is smaller than a sockaddr_in, or a source
 *                  route option is set and the destination address is zero;
 *   -EAFNOSUPPORT  if sin_family is not AF_INET;
 *   -ENETUNREACH   if the route found is multicast or broadcast;
 *   or the error returned by ip_route_connect(), inet_hash_connect(),
 *   ip_route_newports() or tcp_connect().
 *
 * Failures after the socket was put into SYN-SENT go through the "failure"
 * label, which moves it back to TCP_CLOSE, drops the route and clears
 * sk_route_caps and inet->dport.
 */
157 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
158 {
159         struct inet_sock *inet = inet_sk(sk);
160         struct tcp_sock *tp = tcp_sk(sk);
161         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
162         struct rtable *rt;
163         u32 daddr, nexthop;
164         int tmp;
165         int err;
166
167         if (addr_len < sizeof(struct sockaddr_in))
168                 return -EINVAL;
169
170         if (usin->sin_family != AF_INET)
171                 return -EAFNOSUPPORT;
172
            /* With a source route option the route lookup targets the
             * option's faddr instead of the final destination.
             */
173         nexthop = daddr = usin->sin_addr.s_addr;
174         if (inet->opt && inet->opt->srr) {
175                 if (!daddr)
176                         return -EINVAL;
177                 nexthop = inet->opt->faddr;
178         }
179
180         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
181                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
182                                IPPROTO_TCP,
183                                inet->sport, usin->sin_port, sk);
184         if (tmp < 0)
185                 return tmp;
186
187         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
188                 ip_rt_put(rt);
189                 return -ENETUNREACH;
190         }
191
            /* No source route: use the destination the route resolved to. */
192         if (!inet->opt || !inet->opt->srr)
193                 daddr = rt->rt_dst;
194
            /* Take the source address from the route if none is set yet. */
195         if (!inet->saddr)
196                 inet->saddr = rt->rt_src;
197         inet->rcv_saddr = inet->saddr;
198
            /* Timestamp state and write_seq left over for a different
             * destination must not be carried into this connection.
             */
199         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
200                 /* Reset inherited state */
201                 tp->rx_opt.ts_recent       = 0;
202                 tp->rx_opt.ts_recent_stamp = 0;
203                 tp->write_seq              = 0;
204         }
205
206         if (tcp_death_row.sysctl_tw_recycle &&
207             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
208                 struct inet_peer *peer = rt_get_peer(rt);
209
210                 /* VJ's idea. We save last timestamp seen from
211                  * the destination in peer table, when entering state TIME-WAIT
212                  * and initialize rx_opt.ts_recent from it, when trying new connection.
213                  */
214
                    /* Only use the cached stamp while it is at most
                     * TCP_PAWS_MSL seconds old.
                     */
215                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
216                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
217                         tp->rx_opt.ts_recent = peer->tcp_ts;
218                 }
219         }
220
221         inet->dport = usin->sin_port;
222         inet->daddr = daddr;
223
            /* Extension header length is the IP options length, if any. */
224         inet_csk(sk)->icsk_ext_hdr_len = 0;
225         if (inet->opt)
226                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
227
            /* NOTE(review): 536 is presumably the default IPv4 MSS
             * (576-byte datagram minus 40 bytes of headers) - confirm.
             */
228         tp->rx_opt.mss_clamp = 536;
229
230         /* Socket identity is still unknown (sport may be zero).
231          * However we set state to SYN-SENT and not releasing socket
232          * lock select source port, enter ourselves into the hash tables and
233          * complete initialization after this.
234          */
235         tcp_set_state(sk, TCP_SYN_SENT);
236         err = inet_hash_connect(&tcp_death_row, sk);
237         if (err)
238                 goto failure;
239
            /* inet->sport may have just been chosen by inet_hash_connect();
             * let the routing code see the final port pair.
             */
240         err = ip_route_newports(&rt, IPPROTO_TCP, inet->sport, inet->dport, sk);
241         if (err)
242                 goto failure;
243
244         /* OK, now commit destination to socket.  */
245         sk->sk_gso_type = SKB_GSO_TCPV4;
246         sk_setup_caps(sk, &rt->u.dst);
247
            /* write_seq is non-zero only if it was inherited earlier (see
             * tcp_twsk_unique()); otherwise generate a fresh one.
             */
248         if (!tp->write_seq)
249                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
250                                                            inet->daddr,
251                                                            inet->sport,
252                                                            usin->sin_port);
253
254         inet->id = tp->write_seq ^ jiffies;
255
256         err = tcp_connect(sk);
            /* rt is cleared so that the failure path passes NULL to
             * ip_rt_put().  NOTE(review): presumably the route reference now
             * belongs to the socket via sk_setup_caps() - confirm.
             */
257         rt = NULL;
258         if (err)
259                 goto failure;
260
261         return 0;
262
263 failure:
264         /* This unhashes the socket and releases the local port, if necessary. */
265         tcp_set_state(sk, TCP_CLOSE);
266         ip_rt_put(rt);
267         sk->sk_route_caps = 0;
268         inet->dport = 0;
269         return err;
270 }
271
272 /*
273  * This routine does path mtu discovery as defined in RFC1191.
274  */
275 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
276 {
277         struct dst_entry *dst;
278         struct inet_sock *inet = inet_sk(sk);
279
280         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
281          * send out by Linux are always <576bytes so they should go through
282          * unfragmented).
283          */
284         if (sk->sk_state == TCP_LISTEN)
285                 return;
286
287         /* We don't check in the destentry if pmtu discovery is forbidden
288          * on this route. We just assume that no packet_to_big packets
289          * are send back when pmtu discovery is not active.
290          * There is a small race when the user changes this flag in the
291          * route, but I think that's acceptable.
292          */
293         if ((dst = __sk_dst_check(sk, 0)) == NULL)
294                 return;
295
296         dst->ops->update_pmtu(dst, mtu);
297
298         /* Something is about to be wrong... Remember soft error
299          * for the case, if this connection will not able to recover.
300          */
301         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
302                 sk->sk_err_soft = EMSGSIZE;
303
304         mtu = dst_mtu(dst);
305
306         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
307             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
308                 tcp_sync_mss(sk, mtu);
309
310                 /* Resend the TCP packet because it's
311                  * clear that the old packet has been
312                  * dropped. This is the new "fast" path mtu
313                  * discovery.
314                  */
315                 tcp_simple_retransmit(sk);
316         } /* else let the usual retransmit timer handle it */
317 }
318
319 /*
320  * This routine is called by the ICMP module when it gets some
321  * sort of error condition.  If err < 0 then the socket should
322  * be closed and the error returned to the user.  If err > 0
323  * it's just the icmp type << 8 | icmp code.  After adjustment
324  * header points to the first 8 bytes of the tcp header.  We need
325  * to find the appropriate port.
326  *
327  * The locking strategy used here is very "optimistic". When
328  * someone else accesses the socket the ICMP is just dropped
329  * and for some paths there is no check at all.
330  * A more general error queue to queue errors for later handling
331  * is probably better.
332  *
333  */
334
/*
 * tcp_v4_err - ICMP error handler for TCP over IPv4.
 * @skb:  ICMP error packet; skb->data is read as the IP header of the
 *        offending packet, followed by the start of its TCP header, while
 *        the ICMP type and code come from skb->h.icmph
 * @info: extra value from the ICMP layer; used here only as the MTU passed
 *        to do_pmtu_discovery() for ICMP_FRAG_NEEDED
 *
 * Looks up the socket the quoted segment belongs to, validates the quoted
 * sequence number and then, depending on ICMP type and socket state, either
 * ignores the error, performs PMTU discovery, drops a pending connection
 * request, aborts a connection attempt, or records a hard/soft error on the
 * socket.  Returns nothing; the socket reference taken by inet_lookup() is
 * dropped before returning.
 */
335 void tcp_v4_err(struct sk_buff *skb, u32 info)
336 {
337         struct iphdr *iph = (struct iphdr *)skb->data;
338         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
339         struct tcp_sock *tp;
340         struct inet_sock *inet;
341         int type = skb->h.icmph->type;
342         int code = skb->h.icmph->code;
343         struct sock *sk;
344         __u32 seq;
345         int err;
346
            /* The quoted IP header plus the first 8 bytes of the TCP header
             * (ports and sequence number) must be present.
             */
347         if (skb->len < (iph->ihl << 2) + 8) {
348                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
349                 return;
350         }
351
352         sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
353                          th->source, inet_iif(skb));
354         if (!sk) {
355                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
356                 return;
357         }
            /* Nothing to report on a TIME-WAIT socket: drop the reference. */
358         if (sk->sk_state == TCP_TIME_WAIT) {
359                 inet_twsk_put((struct inet_timewait_sock *)sk);
360                 return;
361         }
362
363         bh_lock_sock(sk);
364         /* If too many ICMPs get dropped on busy
365          * servers this needs to be solved differently.
366          */
            /* Only counted here; each path below re-checks
             * sock_owned_by_user() before acting on the socket.
             */
367         if (sock_owned_by_user(sk))
368                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
369
370         if (sk->sk_state == TCP_CLOSE)
371                 goto out;
372
            /* The quoted sequence number must lie within [snd_una, snd_nxt];
             * listening sockets are checked against the request below.
             */
373         tp = tcp_sk(sk);
374         seq = ntohl(th->seq);
375         if (sk->sk_state != TCP_LISTEN &&
376             !between(seq, tp->snd_una, tp->snd_nxt)) {
                    /* NOTE(review): this is the only counter in the function
                     * bumped with the non-_BH macro - confirm intended.
                     */
377                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
378                 goto out;
379         }
380
            /* Translate the ICMP type/code into an errno value in err. */
381         switch (type) {
382         case ICMP_SOURCE_QUENCH:
383                 /* Just silently ignore these. */
384                 goto out;
385         case ICMP_PARAMETERPROB:
386                 err = EPROTO;
387                 break;
388         case ICMP_DEST_UNREACH:
389                 if (code > NR_ICMP_UNREACH)
390                         goto out;
391
392                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
393                         if (!sock_owned_by_user(sk))
394                                 do_pmtu_discovery(sk, iph, info);
395                         goto out;
396                 }
397
398                 err = icmp_err_convert[code].errno;
399                 break;
400         case ICMP_TIME_EXCEEDED:
401                 err = EHOSTUNREACH;
402                 break;
403         default:
404                 goto out;
405         }
406
            /* States not listed here fall through to the generic handling
             * after the switch.
             */
407         switch (sk->sk_state) {
408                 struct request_sock *req, **prev;
409         case TCP_LISTEN:
410                 if (sock_owned_by_user(sk))
411                         goto out;
412
413                 req = inet_csk_search_req(sk, &prev, th->dest,
414                                           iph->daddr, iph->saddr);
415                 if (!req)
416                         goto out;
417
418                 /* ICMPs are not backlogged, hence we cannot get
419                    an established socket here.
420                  */
421                 BUG_TRAP(!req->sk);
422
                    /* The error must quote exactly the ISN we sent. */
423                 if (seq != tcp_rsk(req)->snt_isn) {
424                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
425                         goto out;
426                 }
427
428                 /*
429                  * Still in SYN_RECV, just remove it silently.
430                  * There is no good way to pass the error to the newly
431                  * created socket, and POSIX does not want network
432                  * errors returned from accept().
433                  */
434                 inet_csk_reqsk_queue_drop(sk, req, prev);
435                 goto out;
436
437         case TCP_SYN_SENT:
438         case TCP_SYN_RECV:  /* Cannot happen.
439                                It can f.e. if SYNs crossed.
440                              */
                    /* Connection attempt failed: report a hard error and
                     * finish the socket, unless the user currently owns it,
                     * in which case only a soft error is recorded.
                     */
441                 if (!sock_owned_by_user(sk)) {
442                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
443                         sk->sk_err = err;
444
445                         sk->sk_error_report(sk);
446
447                         tcp_done(sk);
448                 } else {
449                         sk->sk_err_soft = err;
450                 }
451                 goto out;
452         }
453
454         /* If we've already connected we will keep trying
455          * until we time out, or the user gives up.
456          *
457          * rfc1122 4.2.3.9 allows to consider as hard errors
458          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
459          * but it is obsoleted by pmtu discovery).
460          *
461          * Note, that in modern internet, where routing is unreliable
462          * and in each dark corner broken firewalls sit, sending random
463          * errors ordered by their masters even this two messages finally lose
464          * their original sense (even Linux sends invalid PORT_UNREACHs)
465          *
466          * Now we are in compliance with RFCs.
467          *                                                      --ANK (980905)
468          */
469
            /* A hard error is raised only if the socket asked for it
             * (inet->recverr) and is not owned by the user right now.
             */
470         inet = inet_sk(sk);
471         if (!sock_owned_by_user(sk) && inet->recverr) {
472                 sk->sk_err = err;
473                 sk->sk_error_report(sk);
474         } else  { /* Only an error on timeout */
475                 sk->sk_err_soft = err;
476         }
477
478 out:
479         bh_unlock_sock(sk);
480         sock_put(sk);
481 }
482
483 /* This routine computes an IPv4 TCP checksum. */
484 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
485 {
486         struct inet_sock *inet = inet_sk(sk);
487         struct tcphdr *th = skb->h.th;
488
489         if (skb->ip_summed == CHECKSUM_HW) {
490                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
491                 skb->csum = offsetof(struct tcphdr, check);
492         } else {
493                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
494                                          csum_partial((char *)th,
495                                                       th->doff << 2,
496                                                       skb->csum));
497         }
498 }
499
500 /*
501  *      This routine will send an RST to the other tcp.
502  *
503  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
504  *                    for reset.
505  *      Answer: if a packet caused RST, it is not for a socket
506  *              existing in our system, if it is matched to a socket,
507  *              it is just duplicate segment or bug in other side's TCP.
508  *              So that we build reply only basing on parameters
509  *              arrived with segment.
510  *      Exception: precedence violation. We do not implement it in any case.
511  */
512
513 static void tcp_v4_send_reset(struct sk_buff *skb)
514 {
515         struct tcphdr *th = skb->h.th;
516         struct tcphdr rth;
517         struct ip_reply_arg arg;
518
519         /* Never send a reset in response to a reset. */
520         if (th->rst)
521                 return;
522
523         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
524                 return;
525
526         /* Swap the send and the receive. */
527         memset(&rth, 0, sizeof(struct tcphdr));
528         rth.dest   = th->source;
529         rth.source = th->dest;
530         rth.doff   = sizeof(struct tcphdr) / 4;
531         rth.rst    = 1;
532
533         if (th->ack) {
534                 rth.seq = th->ack_seq;
535         } else {
536                 rth.ack = 1;
537                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
538                                     skb->len - (th->doff << 2));
539         }
540
541         memset(&arg, 0, sizeof arg);
542         arg.iov[0].iov_base = (unsigned char *)&rth;
543         arg.iov[0].iov_len  = sizeof rth;
544         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
545                                       skb->nh.iph->saddr, /*XXX*/
546                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
547         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
548
549         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
550
551         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
552         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
553 }
554
555 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
556    outside socket context is ugly, certainly. What can I do?
557  */
558
559 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
560                             u32 win, u32 ts)
561 {
562         struct tcphdr *th = skb->h.th;
563         struct {
564                 struct tcphdr th;
565                 u32 tsopt[3];
566         } rep;
567         struct ip_reply_arg arg;
568
569         memset(&rep.th, 0, sizeof(struct tcphdr));
570         memset(&arg, 0, sizeof arg);
571
572         arg.iov[0].iov_base = (unsigned char *)&rep;
573         arg.iov[0].iov_len  = sizeof(rep.th);
574         if (ts) {
575                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
576                                      (TCPOPT_TIMESTAMP << 8) |
577                                      TCPOLEN_TIMESTAMP);
578                 rep.tsopt[1] = htonl(tcp_time_stamp);
579                 rep.tsopt[2] = htonl(ts);
580                 arg.iov[0].iov_len = sizeof(rep);
581         }
582
583         /* Swap the send and the receive. */
584         rep.th.dest    = th->source;
585         rep.th.source  = th->dest;
586         rep.th.doff    = arg.iov[0].iov_len / 4;
587         rep.th.seq     = htonl(seq);
588         rep.th.ack_seq = htonl(ack);
589         rep.th.ack     = 1;
590         rep.th.window  = htons(win);
591
592         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
593                                       skb->nh.iph->saddr, /*XXX*/
594                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
595         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
596
597         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
598
599         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
600 }
601
602 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
603 {
604         struct inet_timewait_sock *tw = inet_twsk(sk);
605         const struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
606
607         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
608                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent);
609
610         inet_twsk_put(tw);
611 }
612
613 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
614 {
615         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
616                         req->ts_recent);
617 }
618
619 /*
620  *      Send a SYN-ACK after having received an ACK.
621  *      This still operates on a request_sock only, not on a big
622  *      socket.
623  */
624 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
625                               struct dst_entry *dst)
626 {
627         const struct inet_request_sock *ireq = inet_rsk(req);
628         int err = -1;
629         struct sk_buff * skb;
630
631         /* First, grab a route. */
632         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
633                 goto out;
634
635         skb = tcp_make_synack(sk, dst, req);
636
637         if (skb) {
638                 struct tcphdr *th = skb->h.th;
639
640                 th->check = tcp_v4_check(th, skb->len,
641                                          ireq->loc_addr,
642                                          ireq->rmt_addr,
643                                          csum_partial((char *)th, skb->len,
644                                                       skb->csum));
645
646                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
647                                             ireq->rmt_addr,
648                                             ireq->opt);
649                 if (err == NET_XMIT_CN)
650                         err = 0;
651         }
652
653 out:
654         dst_release(dst);
655         return err;
656 }
657
658 /*
659  *      IPv4 request_sock destructor.
660  */
661 static void tcp_v4_reqsk_destructor(struct request_sock *req)
662 {
663         kfree(inet_rsk(req)->opt);
664 }
665
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a "possible SYN flooding" message naming the destination port of
 * @skb.  A function-local static timestamp ensures the message is printed
 * only when more than 60 seconds (HZ * 60 jiffies) have passed since the
 * previous one.
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (!time_after(jiffies, (warntime + HZ * 60)))
		return;

	warntime = jiffies;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(skb->h.th->dest));
}
#endif
679
680 /*
681  * Save and compile IPv4 options into the request_sock if needed.
682  */
683 static struct ip_options *tcp_v4_save_options(struct sock *sk,
684                                               struct sk_buff *skb)
685 {
686         struct ip_options *opt = &(IPCB(skb)->opt);
687         struct ip_options *dopt = NULL;
688
689         if (opt && opt->optlen) {
690                 int opt_size = optlength(opt);
691                 dopt = kmalloc(opt_size, GFP_ATOMIC);
692                 if (dopt) {
693                         if (ip_options_echo(dopt, skb)) {
694                                 kfree(dopt);
695                                 dopt = NULL;
696                         }
697                 }
698         }
699         return dopt;
700 }
701
/*
 * request_sock operations for TCP over IPv4.  This table is passed to
 * reqsk_alloc() in tcp_v4_conn_request(); obj_size is the size of a
 * struct tcp_request_sock, and the callbacks are the IPv4 SYN-ACK
 * (re)transmit, ACK, RST and destructor routines defined above.
 */
702 struct request_sock_ops tcp_request_sock_ops = {
703         .family         =       PF_INET,
704         .obj_size       =       sizeof(struct tcp_request_sock),
705         .rtx_syn_ack    =       tcp_v4_send_synack,
706         .send_ack       =       tcp_v4_reqsk_send_ack,
707         .destructor     =       tcp_v4_reqsk_destructor,
708         .send_reset     =       tcp_v4_send_reset,
709 };
710
/*
 * timewait_sock operations for TCP: object size of a struct
 * tcp_timewait_sock, and tcp_twsk_unique() as the check that decides
 * whether a TIME-WAIT socket may be taken over by a new connection.
 */
711 static struct timewait_sock_ops tcp_timewait_sock_ops = {
712         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
713         .twsk_unique    = tcp_twsk_unique,
714 };
715
/*
 * tcp_v4_conn_request - process a connection request arriving on @sk.
 * @sk:  socket whose request (SYN) queue and accept queue are consulted
 * @skb: received segment; addresses are read from skb->nh.iph and the TCP
 *       header from skb->h.th
 *
 * Allocates a request_sock, fills it from the parsed TCP options and saved
 * IP options, chooses an initial sequence number (a syncookie, a value
 * preset in TCP_SKB_CB(skb)->when, or tcp_v4_init_sequence()), sends the
 * SYN-ACK and, unless syncookies are in use, adds the request to the SYN
 * queue with TCP_TIMEOUT_INIT.
 *
 * Always returns 0.  Requests that are dropped only increment
 * TCP_MIB_ATTEMPTFAILS.
 */
716 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
717 {
718         struct inet_request_sock *ireq;
719         struct tcp_options_received tmp_opt;
720         struct request_sock *req;
721         __u32 saddr = skb->nh.iph->saddr;
722         __u32 daddr = skb->nh.iph->daddr;
            /* Non-zero when the request hit a live timewait bucket (see the
             * comment in the "!isn" branch below).
             */
723         __u32 isn = TCP_SKB_CB(skb)->when;
724         struct dst_entry *dst = NULL;
            /* Without CONFIG_SYN_COOKIES, want_cookie is the constant 0, so
             * every "if (want_cookie)" branch below is dead code.
             */
725 #ifdef CONFIG_SYN_COOKIES
726         int want_cookie = 0;
727 #else
728 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
729 #endif
730
731         /* Never answer to SYNs send to broadcast or multicast */
732         if (((struct rtable *)skb->dst)->rt_flags &
733             (RTCF_BROADCAST | RTCF_MULTICAST))
734                 goto drop;
735
736         /* TW buckets are converted to open requests without
737          * limitations, they conserve resources and peer is
738          * evidently real one.
739          */
            /* SYN queue full: fall back to syncookies when enabled,
             * otherwise drop.  Note the "else" pairs with the goto across
             * the #endif.
             */
740         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
741 #ifdef CONFIG_SYN_COOKIES
742                 if (sysctl_tcp_syncookies) {
743                         want_cookie = 1;
744                 } else
745 #endif
746                 goto drop;
747         }
748
749         /* Accept backlog is full. If we have already queued enough
750          * of warm entries in syn queue, drop request. It is better than
751          * clogging syn queue with openreqs with exponentially increasing
752          * timeout.
753          */
754         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
755                 goto drop;
756
757         req = reqsk_alloc(&tcp_request_sock_ops);
758         if (!req)
759                 goto drop;
760
            /* NOTE(review): 536 is presumably the default IPv4 MSS - confirm. */
761         tcp_clear_options(&tmp_opt);
762         tmp_opt.mss_clamp = 536;
763         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
764
765         tcp_parse_options(skb, &tmp_opt, 0);
766
            /* With syncookies the options just parsed are discarded again. */
767         if (want_cookie) {
768                 tcp_clear_options(&tmp_opt);
769                 tmp_opt.saw_tstamp = 0;
770         }
771
772         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
773                 /* Some OSes (unknown ones, but I see them on web server, which
774                  * contains information interesting only for windows'
775                  * users) do not send their stamp in SYN. It is easy case.
776                  * We simply do not advertise TS support.
777                  */
778                 tmp_opt.saw_tstamp = 0;
779                 tmp_opt.tstamp_ok  = 0;
780         }
            /* Unconditional: tstamp_ok always ends up equal to saw_tstamp,
             * whatever the branch above stored.
             */
781         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
782
783         tcp_openreq_init(req, &tmp_opt, skb);
784
            /* The request's local address is the packet's destination and
             * its remote address is the packet's source.
             */
785         ireq = inet_rsk(req);
786         ireq->loc_addr = daddr;
787         ireq->rmt_addr = saddr;
788         ireq->opt = tcp_v4_save_options(sk, skb);
789         if (!want_cookie)
790                 TCP_ECN_create_request(req, skb->h.th);
791
792         if (want_cookie) {
793 #ifdef CONFIG_SYN_COOKIES
794                 syn_flood_warning(skb);
795 #endif
796                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
797         } else if (!isn) {
798                 struct inet_peer *peer = NULL;
799
800                 /* VJ's idea. We save last timestamp seen
801                  * from the destination in peer table, when entering
802                  * state TIME-WAIT, and check against it before
803                  * accepting new connection request.
804                  *
805                  * If "isn" is not zero, this request hit alive
806                  * timewait bucket, so that all the necessary checks
807                  * are made in the function processing timewait state.
808                  */
                    /* dst and peer are assigned inside this condition; which
                     * of them are set afterwards depends on where the &&
                     * chain stopped, and the else-if below relies on that.
                     */
809                 if (tmp_opt.saw_tstamp &&
810                     tcp_death_row.sysctl_tw_recycle &&
811                     (dst = inet_csk_route_req(sk, req)) != NULL &&
812                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
813                     peer->v4daddr == saddr) {
                            /* Reject when the peer entry is still fresh and
                             * its timestamp is ahead of the one in this SYN
                             * by more than TCP_PAWS_WINDOW.
                             */
814                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
815                             (s32)(peer->tcp_ts - req->ts_recent) >
816                                                         TCP_PAWS_WINDOW) {
817                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
818                                 dst_release(dst);
819                                 goto drop_and_free;
820                         }
821                 }
822                 /* Kill the following clause, if you dislike this way. */
823                 else if (!sysctl_tcp_syncookies &&
824                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
825                           (sysctl_max_syn_backlog >> 2)) &&
826                          (!peer || !peer->tcp_ts_stamp) &&
827                          (!dst || !dst_metric(dst, RTAX_RTT))) {
828                         /* Without syncookies last quarter of
829                          * backlog is filled with destinations,
830                          * proven to be alive.
831                          * It means that we continue to communicate
832                          * to destinations, already remembered
833                          * to the moment of synflood.
834                          */
835                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
836                                        "request from %u.%u.%u.%u/%u\n",
837                                        NIPQUAD(saddr),
838                                        ntohs(skb->h.th->source));
                            /* dst may still be NULL here.  NOTE(review):
                             * presumably dst_release() accepts NULL - confirm.
                             */
839                         dst_release(dst);
840                         goto drop_and_free;
841                 }
842
843                 isn = tcp_v4_init_sequence(sk, skb);
844         }
845         tcp_rsk(req)->snt_isn = isn;
846
            /* tcp_v4_send_synack() releases dst on every path, so no
             * dst_release() is needed here on failure.
             */
847         if (tcp_v4_send_synack(sk, req, dst))
848                 goto drop_and_free;
849
            /* A syncookie request is freed straight away rather than being
             * added to the SYN queue.
             */
850         if (want_cookie) {
851                 reqsk_free(req);
852         } else {
853                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
854         }
855         return 0;
856
857 drop_and_free:
858         reqsk_free(req);
859 drop:
860         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
861         return 0;
862 }
863
864
865 /*
866  * The three way handshake has completed - we got a valid synack -
867  * now create the new socket.
868  */
869 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
870                                   struct request_sock *req,
871                                   struct dst_entry *dst)
872 {
873         struct inet_request_sock *ireq;
874         struct inet_sock *newinet;
875         struct tcp_sock *newtp;
876         struct sock *newsk;
877
878         if (sk_acceptq_is_full(sk))
879                 goto exit_overflow;
880
881         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
882                 goto exit;
883
884         newsk = tcp_create_openreq_child(sk, req, skb);
885         if (!newsk)
886                 goto exit;
887
888         newsk->sk_gso_type = SKB_GSO_TCPV4;
889         sk_setup_caps(newsk, dst);
890
891         newtp                 = tcp_sk(newsk);
892         newinet               = inet_sk(newsk);
893         ireq                  = inet_rsk(req);
894         newinet->daddr        = ireq->rmt_addr;
895         newinet->rcv_saddr    = ireq->loc_addr;
896         newinet->saddr        = ireq->loc_addr;
897         newinet->opt          = ireq->opt;
898         ireq->opt             = NULL;
899         newinet->mc_index     = inet_iif(skb);
900         newinet->mc_ttl       = skb->nh.iph->ttl;
901         inet_csk(newsk)->icsk_ext_hdr_len = 0;
902         if (newinet->opt)
903                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
904         newinet->id = newtp->write_seq ^ jiffies;
905
906         tcp_mtup_init(newsk);
907         tcp_sync_mss(newsk, dst_mtu(dst));
908         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
909         tcp_initialize_rcv_mss(newsk);
910
911         __inet_hash(&tcp_hashinfo, newsk, 0);
912         __inet_inherit_port(&tcp_hashinfo, sk, newsk);
913
914         return newsk;
915
916 exit_overflow:
917         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
918 exit:
919         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
920         dst_release(dst);
921         return NULL;
922 }
923
924 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
925 {
926         struct tcphdr *th = skb->h.th;
927         struct iphdr *iph = skb->nh.iph;
928         struct sock *nsk;
929         struct request_sock **prev;
930         /* Find possible connection requests. */
931         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
932                                                        iph->saddr, iph->daddr);
933         if (req)
934                 return tcp_check_req(sk, skb, req, prev);
935
936         nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
937                                         th->source, skb->nh.iph->daddr,
938                                         ntohs(th->dest), inet_iif(skb));
939
940         if (nsk) {
941                 if (nsk->sk_state != TCP_TIME_WAIT) {
942                         bh_lock_sock(nsk);
943                         return nsk;
944                 }
945                 inet_twsk_put((struct inet_timewait_sock *)nsk);
946                 return NULL;
947         }
948
949 #ifdef CONFIG_SYN_COOKIES
950         if (!th->rst && !th->syn && th->ack)
951                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
952 #endif
953         return sk;
954 }
955
956 static int tcp_v4_checksum_init(struct sk_buff *skb)
957 {
958         if (skb->ip_summed == CHECKSUM_HW) {
959                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
960                                   skb->nh.iph->daddr, skb->csum)) {
961                         skb->ip_summed = CHECKSUM_UNNECESSARY;
962                         return 0;
963                 }
964         }
965
966         skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
967                                        skb->len, IPPROTO_TCP, 0);
968
969         if (skb->len <= 76) {
970                 return __skb_checksum_complete(skb);
971         }
972         return 0;
973 }
974
975
976 /* The socket must have it's spinlock held when we get
977  * here.
978  *
979  * We have a potential double-lock case here, so even when
980  * doing backlog processing we use the BH locking scheme.
981  * This is because we cannot sleep with the original spinlock
982  * held.
983  */
984 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
985 {
986         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
987                 TCP_CHECK_TIMER(sk);
988                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
989                         goto reset;
990                 TCP_CHECK_TIMER(sk);
991                 return 0;
992         }
993
994         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
995                 goto csum_err;
996
997         if (sk->sk_state == TCP_LISTEN) {
998                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
999                 if (!nsk)
1000                         goto discard;
1001
1002                 if (nsk != sk) {
1003                         if (tcp_child_process(sk, nsk, skb))
1004                                 goto reset;
1005                         return 0;
1006                 }
1007         }
1008
1009         TCP_CHECK_TIMER(sk);
1010         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1011                 goto reset;
1012         TCP_CHECK_TIMER(sk);
1013         return 0;
1014
1015 reset:
1016         tcp_v4_send_reset(skb);
1017 discard:
1018         kfree_skb(skb);
1019         /* Be careful here. If this function gets more complicated and
1020          * gcc suffers from register pressure on the x86, sk (in %ebx)
1021          * might be destroyed here. This current version compiles correctly,
1022          * but you have been warned.
1023          */
1024         return 0;
1025
1026 csum_err:
1027         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1028         goto discard;
1029 }
1030
1031 /*
1032  *      From tcp_input.c
1033  */
1034
1035 int tcp_v4_rcv(struct sk_buff *skb)
1036 {
1037         struct tcphdr *th;
1038         struct sock *sk;
1039         int ret;
1040
1041         if (skb->pkt_type != PACKET_HOST)
1042                 goto discard_it;
1043
1044         /* Count it even if it's bad */
1045         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1046
1047         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1048                 goto discard_it;
1049
1050         th = skb->h.th;
1051
1052         if (th->doff < sizeof(struct tcphdr) / 4)
1053                 goto bad_packet;
1054         if (!pskb_may_pull(skb, th->doff * 4))
1055                 goto discard_it;
1056
1057         /* An explanation is required here, I think.
1058          * Packet length and doff are validated by header prediction,
1059          * provided case of th->doff==0 is eliminated.
1060          * So, we defer the checks. */
1061         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1062              tcp_v4_checksum_init(skb)))
1063                 goto bad_packet;
1064
1065         th = skb->h.th;
1066         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1067         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1068                                     skb->len - th->doff * 4);
1069         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1070         TCP_SKB_CB(skb)->when    = 0;
1071         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1072         TCP_SKB_CB(skb)->sacked  = 0;
1073
1074         sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
1075                            skb->nh.iph->daddr, ntohs(th->dest),
1076                            inet_iif(skb));
1077
1078         if (!sk)
1079                 goto no_tcp_socket;
1080
1081 process:
1082         if (sk->sk_state == TCP_TIME_WAIT)
1083                 goto do_time_wait;
1084
1085         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1086                 goto discard_and_relse;
1087         nf_reset(skb);
1088
1089         if (sk_filter(sk, skb, 0))
1090                 goto discard_and_relse;
1091
1092         skb->dev = NULL;
1093
1094         bh_lock_sock(sk);
1095         ret = 0;
1096         if (!sock_owned_by_user(sk)) {
1097 #ifdef CONFIG_NET_DMA
1098                 struct tcp_sock *tp = tcp_sk(sk);
1099                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1100                         tp->ucopy.dma_chan = get_softnet_dma();
1101                 if (tp->ucopy.dma_chan)
1102                         ret = tcp_v4_do_rcv(sk, skb);
1103                 else
1104 #endif
1105                 {
1106                         if (!tcp_prequeue(sk, skb))
1107                         ret = tcp_v4_do_rcv(sk, skb);
1108                 }
1109         } else
1110                 sk_add_backlog(sk, skb);
1111         bh_unlock_sock(sk);
1112
1113         sock_put(sk);
1114
1115         return ret;
1116
1117 no_tcp_socket:
1118         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1119                 goto discard_it;
1120
1121         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1122 bad_packet:
1123                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1124         } else {
1125                 tcp_v4_send_reset(skb);
1126         }
1127
1128 discard_it:
1129         /* Discard frame. */
1130         kfree_skb(skb);
1131         return 0;
1132
1133 discard_and_relse:
1134         sock_put(sk);
1135         goto discard_it;
1136
1137 do_time_wait:
1138         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1139                 inet_twsk_put((struct inet_timewait_sock *) sk);
1140                 goto discard_it;
1141         }
1142
1143         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1144                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1145                 inet_twsk_put((struct inet_timewait_sock *) sk);
1146                 goto discard_it;
1147         }
1148         switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk,
1149                                            skb, th)) {
1150         case TCP_TW_SYN: {
1151                 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1152                                                         skb->nh.iph->daddr,
1153                                                         ntohs(th->dest),
1154                                                         inet_iif(skb));
1155                 if (sk2) {
1156                         inet_twsk_deschedule((struct inet_timewait_sock *)sk,
1157                                              &tcp_death_row);
1158                         inet_twsk_put((struct inet_timewait_sock *)sk);
1159                         sk = sk2;
1160                         goto process;
1161                 }
1162                 /* Fall through to ACK */
1163         }
1164         case TCP_TW_ACK:
1165                 tcp_v4_timewait_ack(sk, skb);
1166                 break;
1167         case TCP_TW_RST:
1168                 goto no_tcp_socket;
1169         case TCP_TW_SUCCESS:;
1170         }
1171         goto discard_it;
1172 }
1173
1174 /* VJ's idea. Save last timestamp seen from this destination
1175  * and hold it at least for normal timewait interval to use for duplicate
1176  * segment detection in subsequent connections, before they enter synchronized
1177  * state.
1178  */
1179
1180 int tcp_v4_remember_stamp(struct sock *sk)
1181 {
1182         struct inet_sock *inet = inet_sk(sk);
1183         struct tcp_sock *tp = tcp_sk(sk);
1184         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1185         struct inet_peer *peer = NULL;
1186         int release_it = 0;
1187
1188         if (!rt || rt->rt_dst != inet->daddr) {
1189                 peer = inet_getpeer(inet->daddr, 1);
1190                 release_it = 1;
1191         } else {
1192                 if (!rt->peer)
1193                         rt_bind_peer(rt, 1);
1194                 peer = rt->peer;
1195         }
1196
1197         if (peer) {
1198                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1199                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1200                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1201                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1202                         peer->tcp_ts = tp->rx_opt.ts_recent;
1203                 }
1204                 if (release_it)
1205                         inet_putpeer(peer);
1206                 return 1;
1207         }
1208
1209         return 0;
1210 }
1211
1212 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1213 {
1214         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1215
1216         if (peer) {
1217                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1218
1219                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1220                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1221                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1222                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1223                         peer->tcp_ts       = tcptw->tw_ts_recent;
1224                 }
1225                 inet_putpeer(peer);
1226                 return 1;
1227         }
1228
1229         return 0;
1230 }
1231
1232 struct inet_connection_sock_af_ops ipv4_specific = {
1233         .queue_xmit        = ip_queue_xmit,
1234         .send_check        = tcp_v4_send_check,
1235         .rebuild_header    = inet_sk_rebuild_header,
1236         .conn_request      = tcp_v4_conn_request,
1237         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1238         .remember_stamp    = tcp_v4_remember_stamp,
1239         .net_header_len    = sizeof(struct iphdr),
1240         .setsockopt        = ip_setsockopt,
1241         .getsockopt        = ip_getsockopt,
1242         .addr2sockaddr     = inet_csk_addr2sockaddr,
1243         .sockaddr_len      = sizeof(struct sockaddr_in),
1244 #ifdef CONFIG_COMPAT
1245         .compat_setsockopt = compat_ip_setsockopt,
1246         .compat_getsockopt = compat_ip_getsockopt,
1247 #endif
1248 };
1249
1250 /* NOTE: A lot of things set to zero explicitly by call to
1251  *       sk_alloc() so need not be done here.
1252  */
1253 static int tcp_v4_init_sock(struct sock *sk)
1254 {
1255         struct inet_connection_sock *icsk = inet_csk(sk);
1256         struct tcp_sock *tp = tcp_sk(sk);
1257
1258         skb_queue_head_init(&tp->out_of_order_queue);
1259         tcp_init_xmit_timers(sk);
1260         tcp_prequeue_init(tp);
1261
1262         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1263         tp->mdev = TCP_TIMEOUT_INIT;
1264
1265         /* So many TCP implementations out there (incorrectly) count the
1266          * initial SYN frame in their delayed-ACK and congestion control
1267          * algorithms that we must have the following bandaid to talk
1268          * efficiently to them.  -DaveM
1269          */
1270         tp->snd_cwnd = 2;
1271
1272         /* See draft-stevens-tcpca-spec-01 for discussion of the
1273          * initialization of these values.
1274          */
1275         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1276         tp->snd_cwnd_clamp = ~0;
1277         tp->mss_cache = 536;
1278
1279         tp->reordering = sysctl_tcp_reordering;
1280         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1281
1282         sk->sk_state = TCP_CLOSE;
1283
1284         sk->sk_write_space = sk_stream_write_space;
1285         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1286
1287         icsk->icsk_af_ops = &ipv4_specific;
1288         icsk->icsk_sync_mss = tcp_sync_mss;
1289
1290         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1291         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1292
1293         atomic_inc(&tcp_sockets_allocated);
1294
1295         return 0;
1296 }
1297
1298 int tcp_v4_destroy_sock(struct sock *sk)
1299 {
1300         struct tcp_sock *tp = tcp_sk(sk);
1301
1302         tcp_clear_xmit_timers(sk);
1303
1304         tcp_cleanup_congestion_control(sk);
1305
1306         /* Cleanup up the write buffer. */
1307         sk_stream_writequeue_purge(sk);
1308
1309         /* Cleans up our, hopefully empty, out_of_order_queue. */
1310         __skb_queue_purge(&tp->out_of_order_queue);
1311
1312 #ifdef CONFIG_NET_DMA
1313         /* Cleans up our sk_async_wait_queue */
1314         __skb_queue_purge(&sk->sk_async_wait_queue);
1315 #endif
1316
1317         /* Clean prequeue, it must be empty really */
1318         __skb_queue_purge(&tp->ucopy.prequeue);
1319
1320         /* Clean up a referenced TCP bind bucket. */
1321         if (inet_csk(sk)->icsk_bind_hash)
1322                 inet_put_port(&tcp_hashinfo, sk);
1323
1324         /*
1325          * If sendmsg cached page exists, toss it.
1326          */
1327         if (sk->sk_sndmsg_page) {
1328                 __free_page(sk->sk_sndmsg_page);
1329                 sk->sk_sndmsg_page = NULL;
1330         }
1331
1332         atomic_dec(&tcp_sockets_allocated);
1333
1334         return 0;
1335 }
1336
1337 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1338
1339 #ifdef CONFIG_PROC_FS
1340 /* Proc filesystem TCP sock list dumping. */
1341
1342 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1343 {
1344         return hlist_empty(head) ? NULL :
1345                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1346 }
1347
1348 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1349 {
1350         return tw->tw_node.next ?
1351                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1352 }
1353
1354 static void *listening_get_next(struct seq_file *seq, void *cur)
1355 {
1356         struct inet_connection_sock *icsk;
1357         struct hlist_node *node;
1358         struct sock *sk = cur;
1359         struct tcp_iter_state* st = seq->private;
1360
1361         if (!sk) {
1362                 st->bucket = 0;
1363                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1364                 goto get_sk;
1365         }
1366
1367         ++st->num;
1368
1369         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1370                 struct request_sock *req = cur;
1371
1372                 icsk = inet_csk(st->syn_wait_sk);
1373                 req = req->dl_next;
1374                 while (1) {
1375                         while (req) {
1376                                 if (req->rsk_ops->family == st->family) {
1377                                         cur = req;
1378                                         goto out;
1379                                 }
1380                                 req = req->dl_next;
1381                         }
1382                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
1383                                 break;
1384 get_req:
1385                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1386                 }
1387                 sk        = sk_next(st->syn_wait_sk);
1388                 st->state = TCP_SEQ_STATE_LISTENING;
1389                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1390         } else {
1391                 icsk = inet_csk(sk);
1392                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1393                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1394                         goto start_req;
1395                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1396                 sk = sk_next(sk);
1397         }
1398 get_sk:
1399         sk_for_each_from(sk, node) {
1400                 if (sk->sk_family == st->family) {
1401                         cur = sk;
1402                         goto out;
1403                 }
1404                 icsk = inet_csk(sk);
1405                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1406                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1407 start_req:
1408                         st->uid         = sock_i_uid(sk);
1409                         st->syn_wait_sk = sk;
1410                         st->state       = TCP_SEQ_STATE_OPENREQ;
1411                         st->sbucket     = 0;
1412                         goto get_req;
1413                 }
1414                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1415         }
1416         if (++st->bucket < INET_LHTABLE_SIZE) {
1417                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1418                 goto get_sk;
1419         }
1420         cur = NULL;
1421 out:
1422         return cur;
1423 }
1424
1425 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1426 {
1427         void *rc = listening_get_next(seq, NULL);
1428
1429         while (rc && *pos) {
1430                 rc = listening_get_next(seq, rc);
1431                 --*pos;
1432         }
1433         return rc;
1434 }
1435
1436 static void *established_get_first(struct seq_file *seq)
1437 {
1438         struct tcp_iter_state* st = seq->private;
1439         void *rc = NULL;
1440
1441         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1442                 struct sock *sk;
1443                 struct hlist_node *node;
1444                 struct inet_timewait_sock *tw;
1445
1446                 /* We can reschedule _before_ having picked the target: */
1447                 cond_resched_softirq();
1448
1449                 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1450                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1451                         if (sk->sk_family != st->family) {
1452                                 continue;
1453                         }
1454                         rc = sk;
1455                         goto out;
1456                 }
1457                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1458                 inet_twsk_for_each(tw, node,
1459                                    &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1460                         if (tw->tw_family != st->family) {
1461                                 continue;
1462                         }
1463                         rc = tw;
1464                         goto out;
1465                 }
1466                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1467                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1468         }
1469 out:
1470         return rc;
1471 }
1472
1473 static void *established_get_next(struct seq_file *seq, void *cur)
1474 {
1475         struct sock *sk = cur;
1476         struct inet_timewait_sock *tw;
1477         struct hlist_node *node;
1478         struct tcp_iter_state* st = seq->private;
1479
1480         ++st->num;
1481
1482         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1483                 tw = cur;
1484                 tw = tw_next(tw);
1485 get_tw:
1486                 while (tw && tw->tw_family != st->family) {
1487                         tw = tw_next(tw);
1488                 }
1489                 if (tw) {
1490                         cur = tw;
1491                         goto out;
1492                 }
1493                 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1494                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1495
1496                 /* We can reschedule between buckets: */
1497                 cond_resched_softirq();
1498
1499                 if (++st->bucket < tcp_hashinfo.ehash_size) {
1500                         read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1501                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1502                 } else {
1503                         cur = NULL;
1504                         goto out;
1505                 }
1506         } else
1507                 sk = sk_next(sk);
1508
1509         sk_for_each_from(sk, node) {
1510                 if (sk->sk_family == st->family)
1511                         goto found;
1512         }
1513
1514         st->state = TCP_SEQ_STATE_TIME_WAIT;
1515         tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1516         goto get_tw;
1517 found:
1518         cur = sk;
1519 out:
1520         return cur;
1521 }
1522
1523 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1524 {
1525         void *rc = established_get_first(seq);
1526
1527         while (rc && pos) {
1528                 rc = established_get_next(seq, rc);
1529                 --pos;
1530         }               
1531         return rc;
1532 }
1533
1534 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1535 {
1536         void *rc;
1537         struct tcp_iter_state* st = seq->private;
1538
1539         inet_listen_lock(&tcp_hashinfo);
1540         st->state = TCP_SEQ_STATE_LISTENING;
1541         rc        = listening_get_idx(seq, &pos);
1542
1543         if (!rc) {
1544                 inet_listen_unlock(&tcp_hashinfo);
1545                 local_bh_disable();
1546                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1547                 rc        = established_get_idx(seq, pos);
1548         }
1549
1550         return rc;
1551 }
1552
1553 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1554 {
1555         struct tcp_iter_state* st = seq->private;
1556         st->state = TCP_SEQ_STATE_LISTENING;
1557         st->num = 0;
1558         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1559 }
1560
1561 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1562 {
1563         void *rc = NULL;
1564         struct tcp_iter_state* st;
1565
1566         if (v == SEQ_START_TOKEN) {
1567                 rc = tcp_get_idx(seq, 0);
1568                 goto out;
1569         }
1570         st = seq->private;
1571
1572         switch (st->state) {
1573         case TCP_SEQ_STATE_OPENREQ:
1574         case TCP_SEQ_STATE_LISTENING:
1575                 rc = listening_get_next(seq, v);
1576                 if (!rc) {
1577                         inet_listen_unlock(&tcp_hashinfo);
1578                         local_bh_disable();
1579                         st->state = TCP_SEQ_STATE_ESTABLISHED;
1580                         rc        = established_get_first(seq);
1581                 }
1582                 break;
1583         case TCP_SEQ_STATE_ESTABLISHED:
1584         case TCP_SEQ_STATE_TIME_WAIT:
1585                 rc = established_get_next(seq, v);
1586                 break;
1587         }
1588 out:
1589         ++*pos;
1590         return rc;
1591 }
1592
1593 static void tcp_seq_stop(struct seq_file *seq, void *v)
1594 {
1595         struct tcp_iter_state* st = seq->private;
1596
1597         switch (st->state) {
1598         case TCP_SEQ_STATE_OPENREQ:
1599                 if (v) {
1600                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
1601                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1602                 }
1603         case TCP_SEQ_STATE_LISTENING:
1604                 if (v != SEQ_START_TOKEN)
1605                         inet_listen_unlock(&tcp_hashinfo);
1606                 break;
1607         case TCP_SEQ_STATE_TIME_WAIT:
1608         case TCP_SEQ_STATE_ESTABLISHED:
1609                 if (v)
1610                         read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1611                 local_bh_enable();
1612                 break;
1613         }
1614 }
1615
1616 static int tcp_seq_open(struct inode *inode, struct file *file)
1617 {
1618         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
1619         struct seq_file *seq;
1620         struct tcp_iter_state *s;
1621         int rc;
1622
1623         if (unlikely(afinfo == NULL))
1624                 return -EINVAL;
1625
1626         s = kmalloc(sizeof(*s), GFP_KERNEL);
1627         if (!s)
1628                 return -ENOMEM;
1629         memset(s, 0, sizeof(*s));
1630         s->family               = afinfo->family;
1631         s->seq_ops.start        = tcp_seq_start;
1632         s->seq_ops.next         = tcp_seq_next;
1633         s->seq_ops.show         = afinfo->seq_show;
1634         s->seq_ops.stop         = tcp_seq_stop;
1635
1636         rc = seq_open(file, &s->seq_ops);
1637         if (rc)
1638                 goto out_kfree;
1639         seq          = file->private_data;
1640         seq->private = s;
1641 out:
1642         return rc;
1643 out_kfree:
1644         kfree(s);
1645         goto out;
1646 }
1647
1648 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
1649 {
1650         int rc = 0;
1651         struct proc_dir_entry *p;
1652
1653         if (!afinfo)
1654                 return -EINVAL;
1655         afinfo->seq_fops->owner         = afinfo->owner;
1656         afinfo->seq_fops->open          = tcp_seq_open;
1657         afinfo->seq_fops->read          = seq_read;
1658         afinfo->seq_fops->llseek        = seq_lseek;
1659         afinfo->seq_fops->release       = seq_release_private;
1660         
1661         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
1662         if (p)
1663                 p->data = afinfo;
1664         else
1665                 rc = -ENOMEM;
1666         return rc;
1667 }
1668
1669 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
1670 {
1671         if (!afinfo)
1672                 return;
1673         proc_net_remove(afinfo->name);
1674         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
1675 }
1676
1677 static void get_openreq4(struct sock *sk, struct request_sock *req,
1678                          char *tmpbuf, int i, int uid)
1679 {
1680         const struct inet_request_sock *ireq = inet_rsk(req);
1681         int ttd = req->expires - jiffies;
1682
1683         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1684                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
1685                 i,
1686                 ireq->loc_addr,
1687                 ntohs(inet_sk(sk)->sport),
1688                 ireq->rmt_addr,
1689                 ntohs(ireq->rmt_port),
1690                 TCP_SYN_RECV,
1691                 0, 0, /* could print option size, but that is af dependent. */
1692                 1,    /* timers active (only the expire timer) */
1693                 jiffies_to_clock_t(ttd),
1694                 req->retrans,
1695                 uid,
1696                 0,  /* non standard timer */
1697                 0, /* open_requests have no inode */
1698                 atomic_read(&sk->sk_refcnt),
1699                 req);
1700 }
1701
1702 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
1703 {
1704         int timer_active;
1705         unsigned long timer_expires;
1706         struct tcp_sock *tp = tcp_sk(sp);
1707         const struct inet_connection_sock *icsk = inet_csk(sp);
1708         struct inet_sock *inet = inet_sk(sp);
1709         unsigned int dest = inet->daddr;
1710         unsigned int src = inet->rcv_saddr;
1711         __u16 destp = ntohs(inet->dport);
1712         __u16 srcp = ntohs(inet->sport);
1713
1714         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
1715                 timer_active    = 1;
1716                 timer_expires   = icsk->icsk_timeout;
1717         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
1718                 timer_active    = 4;
1719                 timer_expires   = icsk->icsk_timeout;
1720         } else if (timer_pending(&sp->sk_timer)) {
1721                 timer_active    = 2;
1722                 timer_expires   = sp->sk_timer.expires;
1723         } else {
1724                 timer_active    = 0;
1725                 timer_expires = jiffies;
1726         }
1727
1728         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
1729                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
1730                 i, src, srcp, dest, destp, sp->sk_state,
1731                 tp->write_seq - tp->snd_una,
1732                 (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
1733                 timer_active,
1734                 jiffies_to_clock_t(timer_expires - jiffies),
1735                 icsk->icsk_retransmits,
1736                 sock_i_uid(sp),
1737                 icsk->icsk_probes_out,
1738                 sock_i_ino(sp),
1739                 atomic_read(&sp->sk_refcnt), sp,
1740                 icsk->icsk_rto,
1741                 icsk->icsk_ack.ato,
1742                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
1743                 tp->snd_cwnd,
1744                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
1745 }
1746
1747 static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i)
1748 {
1749         unsigned int dest, src;
1750         __u16 destp, srcp;
1751         int ttd = tw->tw_ttd - jiffies;
1752
1753         if (ttd < 0)
1754                 ttd = 0;
1755
1756         dest  = tw->tw_daddr;
1757         src   = tw->tw_rcv_saddr;
1758         destp = ntohs(tw->tw_dport);
1759         srcp  = ntohs(tw->tw_sport);
1760
1761         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
1762                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
1763                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
1764                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
1765                 atomic_read(&tw->tw_refcnt), tw);
1766 }
1767
1768 #define TMPSZ 150
1769
1770 static int tcp4_seq_show(struct seq_file *seq, void *v)
1771 {
1772         struct tcp_iter_state* st;
1773         char tmpbuf[TMPSZ + 1];
1774
1775         if (v == SEQ_START_TOKEN) {
1776                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
1777                            "  sl  local_address rem_address   st tx_queue "
1778                            "rx_queue tr tm->when retrnsmt   uid  timeout "
1779                            "inode");
1780                 goto out;
1781         }
1782         st = seq->private;
1783
1784         switch (st->state) {
1785         case TCP_SEQ_STATE_LISTENING:
1786         case TCP_SEQ_STATE_ESTABLISHED:
1787                 get_tcp4_sock(v, tmpbuf, st->num);
1788                 break;
1789         case TCP_SEQ_STATE_OPENREQ:
1790                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
1791                 break;
1792         case TCP_SEQ_STATE_TIME_WAIT:
1793                 get_timewait4_sock(v, tmpbuf, st->num);
1794                 break;
1795         }
1796         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
1797 out:
1798         return 0;
1799 }
1800
1801 static struct file_operations tcp4_seq_fops;
1802 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
1803         .owner          = THIS_MODULE,
1804         .name           = "tcp",
1805         .family         = AF_INET,
1806         .seq_show       = tcp4_seq_show,
1807         .seq_fops       = &tcp4_seq_fops,
1808 };
1809
1810 int __init tcp4_proc_init(void)
1811 {
1812         return tcp_proc_register(&tcp4_seq_afinfo);
1813 }
1814
1815 void tcp4_proc_exit(void)
1816 {
1817         tcp_proc_unregister(&tcp4_seq_afinfo);
1818 }
1819 #endif /* CONFIG_PROC_FS */
1820
1821 struct proto tcp_prot = {
1822         .name                   = "TCP",
1823         .owner                  = THIS_MODULE,
1824         .close                  = tcp_close,
1825         .connect                = tcp_v4_connect,
1826         .disconnect             = tcp_disconnect,
1827         .accept                 = inet_csk_accept,
1828         .ioctl                  = tcp_ioctl,
1829         .init                   = tcp_v4_init_sock,
1830         .destroy                = tcp_v4_destroy_sock,
1831         .shutdown               = tcp_shutdown,
1832         .setsockopt             = tcp_setsockopt,
1833         .getsockopt             = tcp_getsockopt,
1834         .sendmsg                = tcp_sendmsg,
1835         .recvmsg                = tcp_recvmsg,
1836         .backlog_rcv            = tcp_v4_do_rcv,
1837         .hash                   = tcp_v4_hash,
1838         .unhash                 = tcp_unhash,
1839         .get_port               = tcp_v4_get_port,
1840         .enter_memory_pressure  = tcp_enter_memory_pressure,
1841         .sockets_allocated      = &tcp_sockets_allocated,
1842         .orphan_count           = &tcp_orphan_count,
1843         .memory_allocated       = &tcp_memory_allocated,
1844         .memory_pressure        = &tcp_memory_pressure,
1845         .sysctl_mem             = sysctl_tcp_mem,
1846         .sysctl_wmem            = sysctl_tcp_wmem,
1847         .sysctl_rmem            = sysctl_tcp_rmem,
1848         .max_header             = MAX_TCP_HEADER,
1849         .obj_size               = sizeof(struct tcp_sock),
1850         .twsk_prot              = &tcp_timewait_sock_ops,
1851         .rsk_prot               = &tcp_request_sock_ops,
1852 #ifdef CONFIG_COMPAT
1853         .compat_setsockopt      = compat_tcp_setsockopt,
1854         .compat_getsockopt      = compat_tcp_getsockopt,
1855 #endif
1856 };
1857
1858 void __init tcp_v4_init(struct net_proto_family *ops)
1859 {
1860         if (inet_csk_ctl_sock_create(&tcp_socket, PF_INET, SOCK_RAW, IPPROTO_TCP) < 0)
1861                 panic("Failed to create the TCP control socket.\n");
1862 }
1863
/* Symbols exported from this file. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* The proc registration helpers only exist with CONFIG_PROC_FS. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif /* CONFIG_PROC_FS */

/* Tunables defined in this file. */
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
1881