a82df6307567e61675720a9bc6f5f03dcecbdb4b
[linux-3.10.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/types.h>
55 #include <linux/fcntl.h>
56 #include <linux/module.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 #include <linux/times.h>
62
63 #include <net/net_namespace.h>
64 #include <net/icmp.h>
65 #include <net/inet_hashtables.h>
66 #include <net/tcp.h>
67 #include <net/transp_v6.h>
68 #include <net/ipv6.h>
69 #include <net/inet_common.h>
70 #include <net/timewait_sock.h>
71 #include <net/xfrm.h>
72 #include <net/netdma.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79
80 #include <linux/crypto.h>
81 #include <linux/scatterlist.h>
82
/*
 * Consulted by tcp_twsk_unique(): when a twp cookie is supplied, a TIME-WAIT
 * bucket is only reused if this is non-zero and the bucket's last timestamp
 * is more than one second old.
 */
83 int sysctl_tcp_tw_reuse __read_mostly;
/*
 * Not referenced in the visible part of this file.
 * NOTE(review): presumably a latency tuning knob read elsewhere -- confirm.
 */
84 int sysctl_tcp_low_latency __read_mostly;
85
86
/*
 * MD5 signature helpers used before their definitions.
 *
 * With CONFIG_TCP_MD5SIG, tcp_v4_md5_do_lookup() is defined later in this
 * file and tcp_v4_md5_hash_hdr() is only declared here.  Without it, the
 * lookup collapses to an inline stub that never finds a key, so callers
 * need no #ifdef of their own around the lookup.
 */
87 #ifdef CONFIG_TCP_MD5SIG
88 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
89                                                    __be32 addr);
90 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
91                                __be32 daddr, __be32 saddr, struct tcphdr *th);
92 #else
93 static inline
94 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
95 {
96         return NULL;
97 }
98 #endif
99
/*
 * TCP's socket hash table state; passed to inet_lookup() by tcp_v4_err().
 * Only the listening-hash lock, its user count and its wait queue are
 * initialised statically here.
 * NOTE(review): the remaining members are presumably set up elsewhere -- confirm.
 */
100 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
101         .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
102         .lhash_users = ATOMIC_INIT(0),
103         .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
104 };
105
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109                                           ip_hdr(skb)->saddr,
110                                           tcp_hdr(skb)->dest,
111                                           tcp_hdr(skb)->source);
112 }
113
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117         struct tcp_sock *tp = tcp_sk(sk);
118
119         /* With PAWS, it is safe from the viewpoint
120            of data integrity. Even without PAWS it is safe provided sequence
121            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
122
123            Actually, the idea is close to VJ's one, only timestamp cache is
124            held not per host, but per port pair and TW bucket is used as state
125            holder.
126
127            If TW bucket has been already destroyed we fall back to VJ's scheme
128            and use initial timestamp retrieved from peer table.
129          */
130         if (tcptw->tw_ts_recent_stamp &&
131             (twp == NULL || (sysctl_tcp_tw_reuse &&
132                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134                 if (tp->write_seq == 0)
135                         tp->write_seq = 1;
136                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
137                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138                 sock_hold(sktw);
139                 return 1;
140         }
141
142         return 0;
143 }
144
145 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
146
147 /* This will initiate an outgoing connection. */
148 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
149 {
150         struct inet_sock *inet = inet_sk(sk);
151         struct tcp_sock *tp = tcp_sk(sk);
152         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153         struct rtable *rt;
154         __be32 daddr, nexthop;
155         int tmp;
156         int err;
157
158         if (addr_len < sizeof(struct sockaddr_in))
159                 return -EINVAL;
160
161         if (usin->sin_family != AF_INET)
162                 return -EAFNOSUPPORT;
163
164         nexthop = daddr = usin->sin_addr.s_addr;
165         if (inet->opt && inet->opt->srr) {
166                 if (!daddr)
167                         return -EINVAL;
168                 nexthop = inet->opt->faddr;
169         }
170
171         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
172                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
173                                IPPROTO_TCP,
174                                inet->sport, usin->sin_port, sk, 1);
175         if (tmp < 0) {
176                 if (tmp == -ENETUNREACH)
177                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
178                 return tmp;
179         }
180
181         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
182                 ip_rt_put(rt);
183                 return -ENETUNREACH;
184         }
185
186         if (!inet->opt || !inet->opt->srr)
187                 daddr = rt->rt_dst;
188
189         if (!inet->saddr)
190                 inet->saddr = rt->rt_src;
191         inet->rcv_saddr = inet->saddr;
192
193         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
194                 /* Reset inherited state */
195                 tp->rx_opt.ts_recent       = 0;
196                 tp->rx_opt.ts_recent_stamp = 0;
197                 tp->write_seq              = 0;
198         }
199
200         if (tcp_death_row.sysctl_tw_recycle &&
201             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
202                 struct inet_peer *peer = rt_get_peer(rt);
203                 /*
204                  * VJ's idea. We save last timestamp seen from
205                  * the destination in peer table, when entering state
206                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
207                  * when trying new connection.
208                  */
209                 if (peer != NULL &&
210                     peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
211                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
212                         tp->rx_opt.ts_recent = peer->tcp_ts;
213                 }
214         }
215
216         inet->dport = usin->sin_port;
217         inet->daddr = daddr;
218
219         inet_csk(sk)->icsk_ext_hdr_len = 0;
220         if (inet->opt)
221                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
222
223         tp->rx_opt.mss_clamp = 536;
224
225         /* Socket identity is still unknown (sport may be zero).
226          * However we set state to SYN-SENT and not releasing socket
227          * lock select source port, enter ourselves into the hash tables and
228          * complete initialization after this.
229          */
230         tcp_set_state(sk, TCP_SYN_SENT);
231         err = inet_hash_connect(&tcp_death_row, sk);
232         if (err)
233                 goto failure;
234
235         err = ip_route_newports(&rt, IPPROTO_TCP,
236                                 inet->sport, inet->dport, sk);
237         if (err)
238                 goto failure;
239
240         /* OK, now commit destination to socket.  */
241         sk->sk_gso_type = SKB_GSO_TCPV4;
242         sk_setup_caps(sk, &rt->u.dst);
243
244         if (!tp->write_seq)
245                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
246                                                            inet->daddr,
247                                                            inet->sport,
248                                                            usin->sin_port);
249
250         inet->id = tp->write_seq ^ jiffies;
251
252         err = tcp_connect(sk);
253         rt = NULL;
254         if (err)
255                 goto failure;
256
257         return 0;
258
259 failure:
260         /*
261          * This unhashes the socket and releases the local port,
262          * if necessary.
263          */
264         tcp_set_state(sk, TCP_CLOSE);
265         ip_rt_put(rt);
266         sk->sk_route_caps = 0;
267         inet->dport = 0;
268         return err;
269 }
270
271 /*
272  * This routine does path mtu discovery as defined in RFC1191.
273  */
274 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
275 {
276         struct dst_entry *dst;
277         struct inet_sock *inet = inet_sk(sk);
278
279         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
280          * send out by Linux are always <576bytes so they should go through
281          * unfragmented).
282          */
283         if (sk->sk_state == TCP_LISTEN)
284                 return;
285
286         /* We don't check in the destentry if pmtu discovery is forbidden
287          * on this route. We just assume that no packet_to_big packets
288          * are send back when pmtu discovery is not active.
289          * There is a small race when the user changes this flag in the
290          * route, but I think that's acceptable.
291          */
292         if ((dst = __sk_dst_check(sk, 0)) == NULL)
293                 return;
294
295         dst->ops->update_pmtu(dst, mtu);
296
297         /* Something is about to be wrong... Remember soft error
298          * for the case, if this connection will not able to recover.
299          */
300         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
301                 sk->sk_err_soft = EMSGSIZE;
302
303         mtu = dst_mtu(dst);
304
305         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
306             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
307                 tcp_sync_mss(sk, mtu);
308
309                 /* Resend the TCP packet because it's
310                  * clear that the old packet has been
311                  * dropped. This is the new "fast" path mtu
312                  * discovery.
313                  */
314                 tcp_simple_retransmit(sk);
315         } /* else let the usual retransmit timer handle it */
316 }
317
318 /*
319  * This routine is called by the ICMP module when it gets some
320  * sort of error condition.  If err < 0 then the socket should
321  * be closed and the error returned to the user.  If err > 0
322  * it's just the icmp type << 8 | icmp code.  After adjustment
323  * header points to the first 8 bytes of the tcp header.  We need
324  * to find the appropriate port.
325  *
326  * The locking strategy used here is very "optimistic". When
327  * someone else accesses the socket the ICMP is just dropped
328  * and for some paths there is no check at all.
329  * A more general error queue to queue errors for later handling
330  * is probably better.
331  *
332  */
333
334 void tcp_v4_err(struct sk_buff *skb, u32 info)
335 {
336         struct iphdr *iph = (struct iphdr *)skb->data;
337         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
338         struct tcp_sock *tp;
339         struct inet_sock *inet;
340         const int type = icmp_hdr(skb)->type;
341         const int code = icmp_hdr(skb)->code;
342         struct sock *sk;
343         __u32 seq;
344         int err;
345         struct net *net = dev_net(skb->dev);
346
347         if (skb->len < (iph->ihl << 2) + 8) {
348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349                 return;
350         }
351
352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353                         iph->saddr, th->source, inet_iif(skb));
354         if (!sk) {
355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356                 return;
357         }
358         if (sk->sk_state == TCP_TIME_WAIT) {
359                 inet_twsk_put(inet_twsk(sk));
360                 return;
361         }
362
363         bh_lock_sock(sk);
364         /* If too many ICMPs get dropped on busy
365          * servers this needs to be solved differently.
366          */
367         if (sock_owned_by_user(sk))
368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370         if (sk->sk_state == TCP_CLOSE)
371                 goto out;
372
373         tp = tcp_sk(sk);
374         seq = ntohl(th->seq);
375         if (sk->sk_state != TCP_LISTEN &&
376             !between(seq, tp->snd_una, tp->snd_nxt)) {
377                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
378                 goto out;
379         }
380
381         switch (type) {
382         case ICMP_SOURCE_QUENCH:
383                 /* Just silently ignore these. */
384                 goto out;
385         case ICMP_PARAMETERPROB:
386                 err = EPROTO;
387                 break;
388         case ICMP_DEST_UNREACH:
389                 if (code > NR_ICMP_UNREACH)
390                         goto out;
391
392                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
393                         if (!sock_owned_by_user(sk))
394                                 do_pmtu_discovery(sk, iph, info);
395                         goto out;
396                 }
397
398                 err = icmp_err_convert[code].errno;
399                 break;
400         case ICMP_TIME_EXCEEDED:
401                 err = EHOSTUNREACH;
402                 break;
403         default:
404                 goto out;
405         }
406
407         switch (sk->sk_state) {
408                 struct request_sock *req, **prev;
409         case TCP_LISTEN:
410                 if (sock_owned_by_user(sk))
411                         goto out;
412
413                 req = inet_csk_search_req(sk, &prev, th->dest,
414                                           iph->daddr, iph->saddr);
415                 if (!req)
416                         goto out;
417
418                 /* ICMPs are not backlogged, hence we cannot get
419                    an established socket here.
420                  */
421                 BUG_TRAP(!req->sk);
422
423                 if (seq != tcp_rsk(req)->snt_isn) {
424                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
425                         goto out;
426                 }
427
428                 /*
429                  * Still in SYN_RECV, just remove it silently.
430                  * There is no good way to pass the error to the newly
431                  * created socket, and POSIX does not want network
432                  * errors returned from accept().
433                  */
434                 inet_csk_reqsk_queue_drop(sk, req, prev);
435                 goto out;
436
437         case TCP_SYN_SENT:
438         case TCP_SYN_RECV:  /* Cannot happen.
439                                It can f.e. if SYNs crossed.
440                              */
441                 if (!sock_owned_by_user(sk)) {
442                         sk->sk_err = err;
443
444                         sk->sk_error_report(sk);
445
446                         tcp_done(sk);
447                 } else {
448                         sk->sk_err_soft = err;
449                 }
450                 goto out;
451         }
452
453         /* If we've already connected we will keep trying
454          * until we time out, or the user gives up.
455          *
456          * rfc1122 4.2.3.9 allows to consider as hard errors
457          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
458          * but it is obsoleted by pmtu discovery).
459          *
460          * Note, that in modern internet, where routing is unreliable
461          * and in each dark corner broken firewalls sit, sending random
462          * errors ordered by their masters even this two messages finally lose
463          * their original sense (even Linux sends invalid PORT_UNREACHs)
464          *
465          * Now we are in compliance with RFCs.
466          *                                                      --ANK (980905)
467          */
468
469         inet = inet_sk(sk);
470         if (!sock_owned_by_user(sk) && inet->recverr) {
471                 sk->sk_err = err;
472                 sk->sk_error_report(sk);
473         } else  { /* Only an error on timeout */
474                 sk->sk_err_soft = err;
475         }
476
477 out:
478         bh_unlock_sock(sk);
479         sock_put(sk);
480 }
481
482 /* This routine computes an IPv4 TCP checksum. */
483 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
484 {
485         struct inet_sock *inet = inet_sk(sk);
486         struct tcphdr *th = tcp_hdr(skb);
487
488         if (skb->ip_summed == CHECKSUM_PARTIAL) {
489                 th->check = ~tcp_v4_check(len, inet->saddr,
490                                           inet->daddr, 0);
491                 skb->csum_start = skb_transport_header(skb) - skb->head;
492                 skb->csum_offset = offsetof(struct tcphdr, check);
493         } else {
494                 th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
495                                          csum_partial((char *)th,
496                                                       th->doff << 2,
497                                                       skb->csum));
498         }
499 }
500
501 int tcp_v4_gso_send_check(struct sk_buff *skb)
502 {
503         const struct iphdr *iph;
504         struct tcphdr *th;
505
506         if (!pskb_may_pull(skb, sizeof(*th)))
507                 return -EINVAL;
508
509         iph = ip_hdr(skb);
510         th = tcp_hdr(skb);
511
512         th->check = 0;
513         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
514         skb->csum_start = skb_transport_header(skb) - skb->head;
515         skb->csum_offset = offsetof(struct tcphdr, check);
516         skb->ip_summed = CHECKSUM_PARTIAL;
517         return 0;
518 }
519
520 /*
521  *      This routine will send an RST to the other tcp.
522  *
523  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
524  *                    for reset.
525  *      Answer: if a packet caused RST, it is not for a socket
526  *              existing in our system, if it is matched to a socket,
527  *              it is just duplicate segment or bug in other side's TCP.
528  *              So that we build reply only basing on parameters
529  *              arrived with segment.
530  *      Exception: precedence violation. We do not implement it in any case.
531  */
532
533 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
534 {
535         struct tcphdr *th = tcp_hdr(skb);
536         struct {
537                 struct tcphdr th;
538 #ifdef CONFIG_TCP_MD5SIG
539                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
540 #endif
541         } rep;
542         struct ip_reply_arg arg;
543 #ifdef CONFIG_TCP_MD5SIG
544         struct tcp_md5sig_key *key;
545 #endif
546         struct net *net;
547
548         /* Never send a reset in response to a reset. */
549         if (th->rst)
550                 return;
551
552         if (skb->rtable->rt_type != RTN_LOCAL)
553                 return;
554
555         /* Swap the send and the receive. */
556         memset(&rep, 0, sizeof(rep));
557         rep.th.dest   = th->source;
558         rep.th.source = th->dest;
559         rep.th.doff   = sizeof(struct tcphdr) / 4;
560         rep.th.rst    = 1;
561
562         if (th->ack) {
563                 rep.th.seq = th->ack_seq;
564         } else {
565                 rep.th.ack = 1;
566                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
567                                        skb->len - (th->doff << 2));
568         }
569
570         memset(&arg, 0, sizeof(arg));
571         arg.iov[0].iov_base = (unsigned char *)&rep;
572         arg.iov[0].iov_len  = sizeof(rep.th);
573
574 #ifdef CONFIG_TCP_MD5SIG
575         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
576         if (key) {
577                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
578                                    (TCPOPT_NOP << 16) |
579                                    (TCPOPT_MD5SIG << 8) |
580                                    TCPOLEN_MD5SIG);
581                 /* Update length and the length the header thinks exists */
582                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
583                 rep.th.doff = arg.iov[0].iov_len / 4;
584
585                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
586                                      key, ip_hdr(skb)->daddr,
587                                      ip_hdr(skb)->saddr, &rep.th);
588         }
589 #endif
590         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
591                                       ip_hdr(skb)->saddr, /* XXX */
592                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
593         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
594
595         net = dev_net(skb->dst->dev);
596         ip_send_reply(net->ipv4.tcp_sock, skb,
597                       &arg, arg.iov[0].iov_len);
598
599         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
600         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
601 }
602
603 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
604    outside socket context is ugly, certainly. What can I do?
605  */
606
607 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
608                             u32 win, u32 ts, int oif,
609                             struct tcp_md5sig_key *key)
610 {
611         struct tcphdr *th = tcp_hdr(skb);
612         struct {
613                 struct tcphdr th;
614                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
615 #ifdef CONFIG_TCP_MD5SIG
616                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
617 #endif
618                         ];
619         } rep;
620         struct ip_reply_arg arg;
621         struct net *net = dev_net(skb->dev);
622
623         memset(&rep.th, 0, sizeof(struct tcphdr));
624         memset(&arg, 0, sizeof(arg));
625
626         arg.iov[0].iov_base = (unsigned char *)&rep;
627         arg.iov[0].iov_len  = sizeof(rep.th);
628         if (ts) {
629                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
630                                    (TCPOPT_TIMESTAMP << 8) |
631                                    TCPOLEN_TIMESTAMP);
632                 rep.opt[1] = htonl(tcp_time_stamp);
633                 rep.opt[2] = htonl(ts);
634                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
635         }
636
637         /* Swap the send and the receive. */
638         rep.th.dest    = th->source;
639         rep.th.source  = th->dest;
640         rep.th.doff    = arg.iov[0].iov_len / 4;
641         rep.th.seq     = htonl(seq);
642         rep.th.ack_seq = htonl(ack);
643         rep.th.ack     = 1;
644         rep.th.window  = htons(win);
645
646 #ifdef CONFIG_TCP_MD5SIG
647         if (key) {
648                 int offset = (ts) ? 3 : 0;
649
650                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
651                                           (TCPOPT_NOP << 16) |
652                                           (TCPOPT_MD5SIG << 8) |
653                                           TCPOLEN_MD5SIG);
654                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
655                 rep.th.doff = arg.iov[0].iov_len/4;
656
657                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
658                                     key, ip_hdr(skb)->daddr,
659                                     ip_hdr(skb)->saddr, &rep.th);
660         }
661 #endif
662         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
663                                       ip_hdr(skb)->saddr, /* XXX */
664                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
665         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
666         if (oif)
667                 arg.bound_dev_if = oif;
668
669         ip_send_reply(net->ipv4.tcp_sock, skb,
670                       &arg, arg.iov[0].iov_len);
671
672         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
673 }
674
675 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
676 {
677         struct inet_timewait_sock *tw = inet_twsk(sk);
678         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
679
680         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
681                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
682                         tcptw->tw_ts_recent,
683                         tw->tw_bound_dev_if,
684                         tcp_twsk_md5_key(tcptw)
685                         );
686
687         inet_twsk_put(tw);
688 }
689
690 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb,
691                                   struct request_sock *req)
692 {
693         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
694                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
695                         req->ts_recent,
696                         0,
697                         tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr));
698 }
699
700 /*
701  *      Send a SYN-ACK after having received a SYN.
702  *      This still operates on a request_sock only, not on a big
703  *      socket.
704  */
705 static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
706                                 struct dst_entry *dst)
707 {
708         const struct inet_request_sock *ireq = inet_rsk(req);
709         int err = -1;
710         struct sk_buff * skb;
711
712         /* First, grab a route. */
713         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
714                 return -1;
715
716         skb = tcp_make_synack(sk, dst, req);
717
718         if (skb) {
719                 struct tcphdr *th = tcp_hdr(skb);
720
721                 th->check = tcp_v4_check(skb->len,
722                                          ireq->loc_addr,
723                                          ireq->rmt_addr,
724                                          csum_partial((char *)th, skb->len,
725                                                       skb->csum));
726
727                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
728                                             ireq->rmt_addr,
729                                             ireq->opt);
730                 err = net_xmit_eval(err);
731         }
732
733         dst_release(dst);
734         return err;
735 }
736
/*
 * Send a SYN-ACK for @req, letting the helper look up the route itself.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
{
	struct dst_entry *no_route = NULL;

	return __tcp_v4_send_synack(sk, req, no_route);
}
741
742 /*
743  *      IPv4 request_sock destructor.
744  */
745 static void tcp_v4_reqsk_destructor(struct request_sock *req)
746 {
747         kfree(inet_rsk(req)->opt);
748 }
749
750 #ifdef CONFIG_SYN_COOKIES
751 static void syn_flood_warning(struct sk_buff *skb)
752 {
753         static unsigned long warntime;
754
755         if (time_after(jiffies, (warntime + HZ * 60))) {
756                 warntime = jiffies;
757                 printk(KERN_INFO
758                        "possible SYN flooding on port %d. Sending cookies.\n",
759                        ntohs(tcp_hdr(skb)->dest));
760         }
761 }
762 #endif
763
764 /*
765  * Save and compile IPv4 options into the request_sock if needed.
766  */
767 static struct ip_options *tcp_v4_save_options(struct sock *sk,
768                                               struct sk_buff *skb)
769 {
770         struct ip_options *opt = &(IPCB(skb)->opt);
771         struct ip_options *dopt = NULL;
772
773         if (opt && opt->optlen) {
774                 int opt_size = optlength(opt);
775                 dopt = kmalloc(opt_size, GFP_ATOMIC);
776                 if (dopt) {
777                         if (ip_options_echo(dopt, skb)) {
778                                 kfree(dopt);
779                                 dopt = NULL;
780                         }
781                 }
782         }
783         return dopt;
784 }
785
786 #ifdef CONFIG_TCP_MD5SIG
787 /*
788  * RFC2385 MD5 checksumming requires a mapping of
789  * IP address->MD5 Key.
790  * We need to maintain these in the sk structure.
791  */
792
793 /* Find the Key structure for an address.  */
794 static struct tcp_md5sig_key *
795                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
796 {
797         struct tcp_sock *tp = tcp_sk(sk);
798         int i;
799
800         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
801                 return NULL;
802         for (i = 0; i < tp->md5sig_info->entries4; i++) {
803                 if (tp->md5sig_info->keys4[i].addr == addr)
804                         return &tp->md5sig_info->keys4[i].base;
805         }
806         return NULL;
807 }
808
809 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
810                                          struct sock *addr_sk)
811 {
812         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
813 }
814
815 EXPORT_SYMBOL(tcp_v4_md5_lookup);
816
817 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
818                                                       struct request_sock *req)
819 {
820         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
821 }
822
823 /* This can be called on a newly created socket, from other files */
824 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
825                       u8 *newkey, u8 newkeylen)
826 {
827         /* Add Key to the list */
828         struct tcp_md5sig_key *key;
829         struct tcp_sock *tp = tcp_sk(sk);
830         struct tcp4_md5sig_key *keys;
831
832         key = tcp_v4_md5_do_lookup(sk, addr);
833         if (key) {
834                 /* Pre-existing entry - just update that one. */
835                 kfree(key->key);
836                 key->key = newkey;
837                 key->keylen = newkeylen;
838         } else {
839                 struct tcp_md5sig_info *md5sig;
840
841                 if (!tp->md5sig_info) {
842                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
843                                                   GFP_ATOMIC);
844                         if (!tp->md5sig_info) {
845                                 kfree(newkey);
846                                 return -ENOMEM;
847                         }
848                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
849                 }
850                 if (tcp_alloc_md5sig_pool() == NULL) {
851                         kfree(newkey);
852                         return -ENOMEM;
853                 }
854                 md5sig = tp->md5sig_info;
855
856                 if (md5sig->alloced4 == md5sig->entries4) {
857                         keys = kmalloc((sizeof(*keys) *
858                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
859                         if (!keys) {
860                                 kfree(newkey);
861                                 tcp_free_md5sig_pool();
862                                 return -ENOMEM;
863                         }
864
865                         if (md5sig->entries4)
866                                 memcpy(keys, md5sig->keys4,
867                                        sizeof(*keys) * md5sig->entries4);
868
869                         /* Free old key list, and reference new one */
870                         kfree(md5sig->keys4);
871                         md5sig->keys4 = keys;
872                         md5sig->alloced4++;
873                 }
874                 md5sig->entries4++;
875                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
876                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
877                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
878         }
879         return 0;
880 }
881
882 EXPORT_SYMBOL(tcp_v4_md5_do_add);
883
884 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
885                                u8 *newkey, u8 newkeylen)
886 {
887         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
888                                  newkey, newkeylen);
889 }
890
891 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
892 {
893         struct tcp_sock *tp = tcp_sk(sk);
894         int i;
895
896         for (i = 0; i < tp->md5sig_info->entries4; i++) {
897                 if (tp->md5sig_info->keys4[i].addr == addr) {
898                         /* Free the key */
899                         kfree(tp->md5sig_info->keys4[i].base.key);
900                         tp->md5sig_info->entries4--;
901
902                         if (tp->md5sig_info->entries4 == 0) {
903                                 kfree(tp->md5sig_info->keys4);
904                                 tp->md5sig_info->keys4 = NULL;
905                                 tp->md5sig_info->alloced4 = 0;
906                         } else if (tp->md5sig_info->entries4 != i) {
907                                 /* Need to do some manipulation */
908                                 memmove(&tp->md5sig_info->keys4[i],
909                                         &tp->md5sig_info->keys4[i+1],
910                                         (tp->md5sig_info->entries4 - i) *
911                                          sizeof(struct tcp4_md5sig_key));
912                         }
913                         tcp_free_md5sig_pool();
914                         return 0;
915                 }
916         }
917         return -ENOENT;
918 }
919
920 EXPORT_SYMBOL(tcp_v4_md5_do_del);
921
922 static void tcp_v4_clear_md5_list(struct sock *sk)
923 {
924         struct tcp_sock *tp = tcp_sk(sk);
925
926         /* Free each key, then the set of key keys,
927          * the crypto element, and then decrement our
928          * hold on the last resort crypto.
929          */
930         if (tp->md5sig_info->entries4) {
931                 int i;
932                 for (i = 0; i < tp->md5sig_info->entries4; i++)
933                         kfree(tp->md5sig_info->keys4[i].base.key);
934                 tp->md5sig_info->entries4 = 0;
935                 tcp_free_md5sig_pool();
936         }
937         if (tp->md5sig_info->keys4) {
938                 kfree(tp->md5sig_info->keys4);
939                 tp->md5sig_info->keys4 = NULL;
940                 tp->md5sig_info->alloced4  = 0;
941         }
942 }
943
944 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
945                                  int optlen)
946 {
947         struct tcp_md5sig cmd;
948         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
949         u8 *newkey;
950
951         if (optlen < sizeof(cmd))
952                 return -EINVAL;
953
954         if (copy_from_user(&cmd, optval, sizeof(cmd)))
955                 return -EFAULT;
956
957         if (sin->sin_family != AF_INET)
958                 return -EINVAL;
959
960         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
961                 if (!tcp_sk(sk)->md5sig_info)
962                         return -ENOENT;
963                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
964         }
965
966         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
967                 return -EINVAL;
968
969         if (!tcp_sk(sk)->md5sig_info) {
970                 struct tcp_sock *tp = tcp_sk(sk);
971                 struct tcp_md5sig_info *p = kzalloc(sizeof(*p), GFP_KERNEL);
972
973                 if (!p)
974                         return -EINVAL;
975
976                 tp->md5sig_info = p;
977                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
978         }
979
980         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
981         if (!newkey)
982                 return -ENOMEM;
983         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
984                                  newkey, cmd.tcpm_keylen);
985 }
986
987 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
988                                         __be32 daddr, __be32 saddr, int nbytes)
989 {
990         struct tcp4_pseudohdr *bp;
991         struct scatterlist sg;
992
993         bp = &hp->md5_blk.ip4;
994
995         /*
996          * 1. the TCP pseudo-header (in the order: source IP address,
997          * destination IP address, zero-padded protocol number, and
998          * segment length)
999          */
1000         bp->saddr = saddr;
1001         bp->daddr = daddr;
1002         bp->pad = 0;
1003         bp->protocol = IPPROTO_TCP;
1004         bp->len = cpu_to_be16(nbytes);
1005
1006         sg_init_one(&sg, bp, sizeof(*bp));
1007         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1008 }
1009
1010 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1011                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1012 {
1013         struct tcp_md5sig_pool *hp;
1014         struct hash_desc *desc;
1015
1016         hp = tcp_get_md5sig_pool();
1017         if (!hp)
1018                 goto clear_hash_noput;
1019         desc = &hp->md5_desc;
1020
1021         if (crypto_hash_init(desc))
1022                 goto clear_hash;
1023         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1024                 goto clear_hash;
1025         if (tcp_md5_hash_header(hp, th))
1026                 goto clear_hash;
1027         if (tcp_md5_hash_key(hp, key))
1028                 goto clear_hash;
1029         if (crypto_hash_final(desc, md5_hash))
1030                 goto clear_hash;
1031
1032         tcp_put_md5sig_pool();
1033         return 0;
1034
1035 clear_hash:
1036         tcp_put_md5sig_pool();
1037 clear_hash_noput:
1038         memset(md5_hash, 0, 16);
1039         return 1;
1040 }
1041
1042 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1043                         struct sock *sk, struct request_sock *req,
1044                         struct sk_buff *skb)
1045 {
1046         struct tcp_md5sig_pool *hp;
1047         struct hash_desc *desc;
1048         struct tcphdr *th = tcp_hdr(skb);
1049         __be32 saddr, daddr;
1050
1051         if (sk) {
1052                 saddr = inet_sk(sk)->saddr;
1053                 daddr = inet_sk(sk)->daddr;
1054         } else if (req) {
1055                 saddr = inet_rsk(req)->loc_addr;
1056                 daddr = inet_rsk(req)->rmt_addr;
1057         } else {
1058                 const struct iphdr *iph = ip_hdr(skb);
1059                 saddr = iph->saddr;
1060                 daddr = iph->daddr;
1061         }
1062
1063         hp = tcp_get_md5sig_pool();
1064         if (!hp)
1065                 goto clear_hash_noput;
1066         desc = &hp->md5_desc;
1067
1068         if (crypto_hash_init(desc))
1069                 goto clear_hash;
1070
1071         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1072                 goto clear_hash;
1073         if (tcp_md5_hash_header(hp, th))
1074                 goto clear_hash;
1075         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1076                 goto clear_hash;
1077         if (tcp_md5_hash_key(hp, key))
1078                 goto clear_hash;
1079         if (crypto_hash_final(desc, md5_hash))
1080                 goto clear_hash;
1081
1082         tcp_put_md5sig_pool();
1083         return 0;
1084
1085 clear_hash:
1086         tcp_put_md5sig_pool();
1087 clear_hash_noput:
1088         memset(md5_hash, 0, 16);
1089         return 1;
1090 }
1091
1092 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1093
1094 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1095 {
1096         /*
1097          * This gets called for each TCP segment that arrives
1098          * so we want to be efficient.
1099          * We have 3 drop cases:
1100          * o No MD5 hash and one expected.
1101          * o MD5 hash and we're not expecting one.
1102          * o MD5 hash and its wrong.
1103          */
1104         __u8 *hash_location = NULL;
1105         struct tcp_md5sig_key *hash_expected;
1106         const struct iphdr *iph = ip_hdr(skb);
1107         struct tcphdr *th = tcp_hdr(skb);
1108         int genhash;
1109         unsigned char newhash[16];
1110
1111         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1112         hash_location = tcp_parse_md5sig_option(th);
1113
1114         /* We've parsed the options - do we have a hash? */
1115         if (!hash_expected && !hash_location)
1116                 return 0;
1117
1118         if (hash_expected && !hash_location) {
1119                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash expected but NOT found "
1120                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1121                                NIPQUAD(iph->saddr), ntohs(th->source),
1122                                NIPQUAD(iph->daddr), ntohs(th->dest));
1123                 return 1;
1124         }
1125
1126         if (!hash_expected && hash_location) {
1127                 LIMIT_NETDEBUG(KERN_INFO "MD5 Hash NOT expected but found "
1128                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)\n",
1129                                NIPQUAD(iph->saddr), ntohs(th->source),
1130                                NIPQUAD(iph->daddr), ntohs(th->dest));
1131                 return 1;
1132         }
1133
1134         /* Okay, so this is hash_expected and hash_location -
1135          * so we need to calculate the checksum.
1136          */
1137         genhash = tcp_v4_md5_hash_skb(newhash,
1138                                       hash_expected,
1139                                       NULL, NULL, skb);
1140
1141         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1142                 if (net_ratelimit()) {
1143                         printk(KERN_INFO "MD5 Hash failed for "
1144                                "(" NIPQUAD_FMT ", %d)->(" NIPQUAD_FMT ", %d)%s\n",
1145                                NIPQUAD(iph->saddr), ntohs(th->source),
1146                                NIPQUAD(iph->daddr), ntohs(th->dest),
1147                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1148                 }
1149                 return 1;
1150         }
1151         return 0;
1152 }
1153
1154 #endif
1155
1156 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1157         .family         =       PF_INET,
1158         .obj_size       =       sizeof(struct tcp_request_sock),
1159         .rtx_syn_ack    =       tcp_v4_send_synack,
1160         .send_ack       =       tcp_v4_reqsk_send_ack,
1161         .destructor     =       tcp_v4_reqsk_destructor,
1162         .send_reset     =       tcp_v4_send_reset,
1163 };
1164
1165 #ifdef CONFIG_TCP_MD5SIG
1166 static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1167         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1168 };
1169 #endif
1170
1171 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1172         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1173         .twsk_unique    = tcp_twsk_unique,
1174         .twsk_destructor= tcp_twsk_destructor,
1175 };
1176
/*
 * Handle an incoming connection request carried by skb on socket sk.
 *
 * Allocates a request_sock, parses the TCP options, records the
 * addresses and saved IP options, picks the initial sequence number
 * (syncookie, timewait-supplied, or freshly generated), sends the
 * SYN-ACK and finally adds the request to the request queue.
 *
 * Always returns 0.  On every drop path the request (if allocated) is
 * freed, and the route is released where one was looked up here.
 */
1177 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1178 {
1179         struct inet_request_sock *ireq;
1180         struct tcp_options_received tmp_opt;
1181         struct request_sock *req;
1182         __be32 saddr = ip_hdr(skb)->saddr;
1183         __be32 daddr = ip_hdr(skb)->daddr;
1184         __u32 isn = TCP_SKB_CB(skb)->when;
1185         struct dst_entry *dst = NULL;
/* Without CONFIG_SYN_COOKIES, want_cookie is the constant 0 via the macro below. */
1186 #ifdef CONFIG_SYN_COOKIES
1187         int want_cookie = 0;
1188 #else
1189 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1190 #endif
1191
1192         /* Never answer to SYNs send to broadcast or multicast */
1193         if (skb->rtable->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1194                 goto drop;
1195
1196         /* TW buckets are converted to open requests without
1197          * limitations, they conserve resources and peer is
1198          * evidently real one.
1199          */
1200         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1201 #ifdef CONFIG_SYN_COOKIES
1202                 if (sysctl_tcp_syncookies) {
1203                         want_cookie = 1;
1204                 } else
1205 #endif
1206                 goto drop;
1207         }
1208
1209         /* Accept backlog is full. If we have already queued enough
1210          * of warm entries in syn queue, drop request. It is better than
1211          * clogging syn queue with openreqs with exponentially increasing
1212          * timeout.
1213          */
1214         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1215                 goto drop;
1216
1217         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1218         if (!req)
1219                 goto drop;
1220
1221 #ifdef CONFIG_TCP_MD5SIG
1222         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1223 #endif
1224
1225         tcp_clear_options(&tmp_opt);
1226         tmp_opt.mss_clamp = 536;
1227         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1228
1229         tcp_parse_options(skb, &tmp_opt, 0);
1230
        /* In cookie mode the parsed options are discarded unless a timestamp was seen. */
1231         if (want_cookie && !tmp_opt.saw_tstamp)
1232                 tcp_clear_options(&tmp_opt);
1233
1234         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1235                 /* Some OSes (unknown ones, but I see them on web server, which
1236                  * contains information interesting only for windows'
1237                  * users) do not send their stamp in SYN. It is easy case.
1238                  * We simply do not advertise TS support.
1239                  */
1240                 tmp_opt.saw_tstamp = 0;
1241                 tmp_opt.tstamp_ok  = 0;
1242         }
1243         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1244
1245         tcp_openreq_init(req, &tmp_opt, skb);
1246
1247         if (security_inet_conn_request(sk, skb, req))
1248                 goto drop_and_free;
1249
1250         ireq = inet_rsk(req);
1251         ireq->loc_addr = daddr;
1252         ireq->rmt_addr = saddr;
1253         ireq->opt = tcp_v4_save_options(sk, skb);
1254         if (!want_cookie)
1255                 TCP_ECN_create_request(req, tcp_hdr(skb));
1256
1257         if (want_cookie) {
1258 #ifdef CONFIG_SYN_COOKIES
1259                 syn_flood_warning(skb);
1260                 req->cookie_ts = tmp_opt.tstamp_ok;
1261 #endif
1262                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1263         } else if (!isn) {
1264                 struct inet_peer *peer = NULL;
1265
1266                 /* VJ's idea. We save last timestamp seen
1267                  * from the destination in peer table, when entering
1268                  * state TIME-WAIT, and check against it before
1269                  * accepting new connection request.
1270                  *
1271                  * If "isn" is not zero, this request hit alive
1272                  * timewait bucket, so that all the necessary checks
1273                  * are made in the function processing timewait state.
1274                  */
1275                 if (tmp_opt.saw_tstamp &&
1276                     tcp_death_row.sysctl_tw_recycle &&
1277                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1278                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1279                     peer->v4daddr == saddr) {
1280                         if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1281                             (s32)(peer->tcp_ts - req->ts_recent) >
1282                                                         TCP_PAWS_WINDOW) {
1283                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1284                                 goto drop_and_release;
1285                         }
1286                 }
1287                 /* Kill the following clause, if you dislike this way. */
1288                 else if (!sysctl_tcp_syncookies &&
1289                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1290                           (sysctl_max_syn_backlog >> 2)) &&
1291                          (!peer || !peer->tcp_ts_stamp) &&
1292                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1293                         /* Without syncookies last quarter of
1294                          * backlog is filled with destinations,
1295                          * proven to be alive.
1296                          * It means that we continue to communicate
1297                          * to destinations, already remembered
1298                          * to the moment of synflood.
1299                          */
1300                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open "
1301                                        "request from " NIPQUAD_FMT "/%u\n",
1302                                        NIPQUAD(saddr),
1303                                        ntohs(tcp_hdr(skb)->source));
1304                         goto drop_and_release;
1305                 }
1306
1307                 isn = tcp_v4_init_sequence(skb);
1308         }
1309         tcp_rsk(req)->snt_isn = isn;
1310
        /*
         * A request answered with a cookie is never queued: it is freed
         * right after the SYN-ACK has been sent, as is any request whose
         * SYN-ACK could not be sent.
         */
1311         if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
1312                 goto drop_and_free;
1313
1314         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1315         return 0;
1316
/* The drop labels fall through: release the route, free the request, return 0. */
1317 drop_and_release:
1318         dst_release(dst);
1319 drop_and_free:
1320         reqsk_free(req);
1321 drop:
1322         return 0;
1323 }
1324
1325
1326 /*
1327  * The three way handshake has completed - we got a valid synack -
1328  * now create the new socket.
1329  */
1330 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1331                                   struct request_sock *req,
1332                                   struct dst_entry *dst)
1333 {
1334         struct inet_request_sock *ireq;
1335         struct inet_sock *newinet;
1336         struct tcp_sock *newtp;
1337         struct sock *newsk;
1338 #ifdef CONFIG_TCP_MD5SIG
1339         struct tcp_md5sig_key *key;
1340 #endif
1341
1342         if (sk_acceptq_is_full(sk))
1343                 goto exit_overflow;
1344
1345         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1346                 goto exit;
1347
1348         newsk = tcp_create_openreq_child(sk, req, skb);
1349         if (!newsk)
1350                 goto exit;
1351
1352         newsk->sk_gso_type = SKB_GSO_TCPV4;
1353         sk_setup_caps(newsk, dst);
1354
1355         newtp                 = tcp_sk(newsk);
1356         newinet               = inet_sk(newsk);
1357         ireq                  = inet_rsk(req);
1358         newinet->daddr        = ireq->rmt_addr;
1359         newinet->rcv_saddr    = ireq->loc_addr;
1360         newinet->saddr        = ireq->loc_addr;
1361         newinet->opt          = ireq->opt;
1362         ireq->opt             = NULL;
1363         newinet->mc_index     = inet_iif(skb);
1364         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1365         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1366         if (newinet->opt)
1367                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1368         newinet->id = newtp->write_seq ^ jiffies;
1369
1370         tcp_mtup_init(newsk);
1371         tcp_sync_mss(newsk, dst_mtu(dst));
1372         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1373         tcp_initialize_rcv_mss(newsk);
1374
1375 #ifdef CONFIG_TCP_MD5SIG
1376         /* Copy over the MD5 key from the original socket */
1377         if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
1378                 /*
1379                  * We're using one, so create a matching key
1380                  * on the newsk structure. If we fail to get
1381                  * memory, then we end up not copying the key
1382                  * across. Shucks.
1383                  */
1384                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1385                 if (newkey != NULL)
1386                         tcp_v4_md5_do_add(newsk, inet_sk(sk)->daddr,
1387                                           newkey, key->keylen);
1388                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1389         }
1390 #endif
1391
1392         __inet_hash_nolisten(newsk);
1393         __inet_inherit_port(sk, newsk);
1394
1395         return newsk;
1396
1397 exit_overflow:
1398         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1399 exit:
1400         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1401         dst_release(dst);
1402         return NULL;
1403 }
1404
1405 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1406 {
1407         struct tcphdr *th = tcp_hdr(skb);
1408         const struct iphdr *iph = ip_hdr(skb);
1409         struct sock *nsk;
1410         struct request_sock **prev;
1411         /* Find possible connection requests. */
1412         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1413                                                        iph->saddr, iph->daddr);
1414         if (req)
1415                 return tcp_check_req(sk, skb, req, prev);
1416
1417         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1418                         th->source, iph->daddr, th->dest, inet_iif(skb));
1419
1420         if (nsk) {
1421                 if (nsk->sk_state != TCP_TIME_WAIT) {
1422                         bh_lock_sock(nsk);
1423                         return nsk;
1424                 }
1425                 inet_twsk_put(inet_twsk(nsk));
1426                 return NULL;
1427         }
1428
1429 #ifdef CONFIG_SYN_COOKIES
1430         if (!th->rst && !th->syn && th->ack)
1431                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1432 #endif
1433         return sk;
1434 }
1435
1436 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1437 {
1438         const struct iphdr *iph = ip_hdr(skb);
1439
1440         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1441                 if (!tcp_v4_check(skb->len, iph->saddr,
1442                                   iph->daddr, skb->csum)) {
1443                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1444                         return 0;
1445                 }
1446         }
1447
1448         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1449                                        skb->len, IPPROTO_TCP, 0);
1450
1451         if (skb->len <= 76) {
1452                 return __skb_checksum_complete(skb);
1453         }
1454         return 0;
1455 }
1456
1457
1458 /* The socket must have it's spinlock held when we get
1459  * here.
1460  *
1461  * We have a potential double-lock case here, so even when
1462  * doing backlog processing we use the BH locking scheme.
1463  * This is because we cannot sleep with the original spinlock
1464  * held.
1465  */
1466 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1467 {
1468         struct sock *rsk;
1469 #ifdef CONFIG_TCP_MD5SIG
1470         /*
1471          * We really want to reject the packet as early as possible
1472          * if:
1473          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1474          *  o There is an MD5 option and we're not expecting one
1475          */
1476         if (tcp_v4_inbound_md5_hash(sk, skb))
1477                 goto discard;
1478 #endif
1479
1480         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1481                 TCP_CHECK_TIMER(sk);
1482                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1483                         rsk = sk;
1484                         goto reset;
1485                 }
1486                 TCP_CHECK_TIMER(sk);
1487                 return 0;
1488         }
1489
1490         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1491                 goto csum_err;
1492
1493         if (sk->sk_state == TCP_LISTEN) {
1494                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1495                 if (!nsk)
1496                         goto discard;
1497
1498                 if (nsk != sk) {
1499                         if (tcp_child_process(sk, nsk, skb)) {
1500                                 rsk = nsk;
1501                                 goto reset;
1502                         }
1503                         return 0;
1504                 }
1505         }
1506
1507         TCP_CHECK_TIMER(sk);
1508         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1509                 rsk = sk;
1510                 goto reset;
1511         }
1512         TCP_CHECK_TIMER(sk);
1513         return 0;
1514
1515 reset:
1516         tcp_v4_send_reset(rsk, skb);
1517 discard:
1518         kfree_skb(skb);
1519         /* Be careful here. If this function gets more complicated and
1520          * gcc suffers from register pressure on the x86, sk (in %ebx)
1521          * might be destroyed here. This current version compiles correctly,
1522          * but you have been warned.
1523          */
1524         return 0;
1525
1526 csum_err:
1527         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1528         goto discard;
1529 }
1530
1531 /*
1532  *      From tcp_input.c
1533  */
1534
/*
 * Receive one TCP segment over IPv4.
 *
 * Validates the packet type and TCP header length, initialises the
 * checksum state, fills in the TCP control block of the skb and looks
 * up the owning socket.  A segment for a socket not owned by a user
 * context is processed via tcp_v4_do_rcv() (unless tcp_prequeue() took
 * it); otherwise it is added to the socket backlog.  Timewait sockets
 * and segments without a socket are handled at the labels below.
 *
 * Returns the result of tcp_v4_do_rcv() when it was called here,
 * otherwise 0.
 */
1535 int tcp_v4_rcv(struct sk_buff *skb)
1536 {
1537         const struct iphdr *iph;
1538         struct tcphdr *th;
1539         struct sock *sk;
1540         int ret;
1541         struct net *net = dev_net(skb->dev);
1542
1543         if (skb->pkt_type != PACKET_HOST)
1544                 goto discard_it;
1545
1546         /* Count it even if it's bad */
1547         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1548
1549         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1550                 goto discard_it;
1551
1552         th = tcp_hdr(skb);
1553
1554         if (th->doff < sizeof(struct tcphdr) / 4)
1555                 goto bad_packet;
1556         if (!pskb_may_pull(skb, th->doff * 4))
1557                 goto discard_it;
1558
1559         /* An explanation is required here, I think.
1560          * Packet length and doff are validated by header prediction,
1561          * provided case of th->doff==0 is eliminated.
1562          * So, we defer the checks. */
1563         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1564                 goto bad_packet;
1565
        /* Header pointers are fetched again after the pskb_may_pull() calls above. */
1566         th = tcp_hdr(skb);
1567         iph = ip_hdr(skb);
1568         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1569         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1570                                     skb->len - th->doff * 4);
1571         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1572         TCP_SKB_CB(skb)->when    = 0;
1573         TCP_SKB_CB(skb)->flags   = iph->tos;
1574         TCP_SKB_CB(skb)->sacked  = 0;
1575
1576         sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr,
1577                         th->source, iph->daddr, th->dest, inet_iif(skb));
1578         if (!sk)
1579                 goto no_tcp_socket;
1580
/* Also re-entered from do_time_wait with a listening socket in sk. */
1581 process:
1582         if (sk->sk_state == TCP_TIME_WAIT)
1583                 goto do_time_wait;
1584
1585         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1586                 goto discard_and_relse;
1587         nf_reset(skb);
1588
1589         if (sk_filter(sk, skb))
1590                 goto discard_and_relse;
1591
1592         skb->dev = NULL;
1593
1594         bh_lock_sock_nested(sk);
1595         ret = 0;
        /*
         * Socket not owned by a user context: handle the segment now.
         * tcp_v4_do_rcv() is skipped when tcp_prequeue() returns non-zero.
         * Otherwise the segment goes to the socket backlog.
         */
1596         if (!sock_owned_by_user(sk)) {
1597 #ifdef CONFIG_NET_DMA
1598                 struct tcp_sock *tp = tcp_sk(sk);
1599                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1600                         tp->ucopy.dma_chan = get_softnet_dma();
1601                 if (tp->ucopy.dma_chan)
1602                         ret = tcp_v4_do_rcv(sk, skb);
1603                 else
1604 #endif
1605                 {
1606                         if (!tcp_prequeue(sk, skb))
1607                         ret = tcp_v4_do_rcv(sk, skb);
1608                 }
1609         } else
1610                 sk_add_backlog(sk, skb);
1611         bh_unlock_sock(sk);
1612
1613         sock_put(sk);
1614
1615         return ret;
1616
1617 no_tcp_socket:
1618         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1619                 goto discard_it;
1620
        /*
         * bad_packet is also a jump target for the header and checksum
         * checks near the top: such segments are counted as errors and
         * no reset is sent for them.
         */
1621         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1622 bad_packet:
1623                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1624         } else {
1625                 tcp_v4_send_reset(NULL, skb);
1626         }
1627
1628 discard_it:
1629         /* Discard frame. */
1630         kfree_skb(skb);
1631         return 0;
1632
1633 discard_and_relse:
1634         sock_put(sk);
1635         goto discard_it;
1636
/* sk is a timewait socket here; its reference is dropped with inet_twsk_put(). */
1637 do_time_wait:
1638         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1639                 inet_twsk_put(inet_twsk(sk));
1640                 goto discard_it;
1641         }
1642
1643         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1644                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1645                 inet_twsk_put(inet_twsk(sk));
1646                 goto discard_it;
1647         }
1648         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1649         case TCP_TW_SYN: {
1650                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1651                                                         &tcp_hashinfo,
1652                                                         iph->daddr, th->dest,
1653                                                         inet_iif(skb));
                /*
                 * A listener exists for this destination: retire the
                 * timewait socket and process the segment again with
                 * the listener as sk.
                 */
1654                 if (sk2) {
1655                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1656                         inet_twsk_put(inet_twsk(sk));
1657                         sk = sk2;
1658                         goto process;
1659                 }
1660                 /* Fall through to ACK */
1661         }
1662         case TCP_TW_ACK:
1663                 tcp_v4_timewait_ack(sk, skb);
1664                 break;
1665         case TCP_TW_RST:
1666                 goto no_tcp_socket;
1667         case TCP_TW_SUCCESS:;
1668         }
1669         goto discard_it;
1670 }
1671
1672 /* VJ's idea. Save last timestamp seen from this destination
1673  * and hold it at least for normal timewait interval to use for duplicate
1674  * segment detection in subsequent connections, before they enter synchronized
1675  * state.
1676  */
1677
1678 int tcp_v4_remember_stamp(struct sock *sk)
1679 {
1680         struct inet_sock *inet = inet_sk(sk);
1681         struct tcp_sock *tp = tcp_sk(sk);
1682         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1683         struct inet_peer *peer = NULL;
1684         int release_it = 0;
1685
1686         if (!rt || rt->rt_dst != inet->daddr) {
1687                 peer = inet_getpeer(inet->daddr, 1);
1688                 release_it = 1;
1689         } else {
1690                 if (!rt->peer)
1691                         rt_bind_peer(rt, 1);
1692                 peer = rt->peer;
1693         }
1694
1695         if (peer) {
1696                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1697                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1698                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1699                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1700                         peer->tcp_ts = tp->rx_opt.ts_recent;
1701                 }
1702                 if (release_it)
1703                         inet_putpeer(peer);
1704                 return 1;
1705         }
1706
1707         return 0;
1708 }
1709
1710 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1711 {
1712         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1713
1714         if (peer) {
1715                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1716
1717                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1718                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
1719                      peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
1720                         peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
1721                         peer->tcp_ts       = tcptw->tw_ts_recent;
1722                 }
1723                 inet_putpeer(peer);
1724                 return 1;
1725         }
1726
1727         return 0;
1728 }
1729
1730 struct inet_connection_sock_af_ops ipv4_specific = {
1731         .queue_xmit        = ip_queue_xmit,
1732         .send_check        = tcp_v4_send_check,
1733         .rebuild_header    = inet_sk_rebuild_header,
1734         .conn_request      = tcp_v4_conn_request,
1735         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1736         .remember_stamp    = tcp_v4_remember_stamp,
1737         .net_header_len    = sizeof(struct iphdr),
1738         .setsockopt        = ip_setsockopt,
1739         .getsockopt        = ip_getsockopt,
1740         .addr2sockaddr     = inet_csk_addr2sockaddr,
1741         .sockaddr_len      = sizeof(struct sockaddr_in),
1742         .bind_conflict     = inet_csk_bind_conflict,
1743 #ifdef CONFIG_COMPAT
1744         .compat_setsockopt = compat_ip_setsockopt,
1745         .compat_getsockopt = compat_ip_getsockopt,
1746 #endif
1747 };
1748
#ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 TCP MD5 signature operations.
 * Installed as tp->af_specific by tcp_v4_init_sock().
 */
static struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
        /* Key management */
        .md5_lookup             = tcp_v4_md5_lookup,
        .md5_add                = tcp_v4_md5_add_func,
        .md5_parse              = tcp_v4_parse_md5_keys,

        /* Signature computation */
        .calc_md5_hash          = tcp_v4_md5_hash_skb,
};
#endif
1757
1758 /* NOTE: A lot of things set to zero explicitly by call to
1759  *       sk_alloc() so need not be done here.
1760  */
1761 static int tcp_v4_init_sock(struct sock *sk)
1762 {
1763         struct inet_connection_sock *icsk = inet_csk(sk);
1764         struct tcp_sock *tp = tcp_sk(sk);
1765
1766         skb_queue_head_init(&tp->out_of_order_queue);
1767         tcp_init_xmit_timers(sk);
1768         tcp_prequeue_init(tp);
1769
1770         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1771         tp->mdev = TCP_TIMEOUT_INIT;
1772
1773         /* So many TCP implementations out there (incorrectly) count the
1774          * initial SYN frame in their delayed-ACK and congestion control
1775          * algorithms that we must have the following bandaid to talk
1776          * efficiently to them.  -DaveM
1777          */
1778         tp->snd_cwnd = 2;
1779
1780         /* See draft-stevens-tcpca-spec-01 for discussion of the
1781          * initialization of these values.
1782          */
1783         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1784         tp->snd_cwnd_clamp = ~0;
1785         tp->mss_cache = 536;
1786
1787         tp->reordering = sysctl_tcp_reordering;
1788         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1789
1790         sk->sk_state = TCP_CLOSE;
1791
1792         sk->sk_write_space = sk_stream_write_space;
1793         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1794
1795         icsk->icsk_af_ops = &ipv4_specific;
1796         icsk->icsk_sync_mss = tcp_sync_mss;
1797 #ifdef CONFIG_TCP_MD5SIG
1798         tp->af_specific = &tcp_sock_ipv4_specific;
1799 #endif
1800
1801         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1802         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1803
1804         atomic_inc(&tcp_sockets_allocated);
1805
1806         return 0;
1807 }
1808
1809 void tcp_v4_destroy_sock(struct sock *sk)
1810 {
1811         struct tcp_sock *tp = tcp_sk(sk);
1812
1813         tcp_clear_xmit_timers(sk);
1814
1815         tcp_cleanup_congestion_control(sk);
1816
1817         /* Cleanup up the write buffer. */
1818         tcp_write_queue_purge(sk);
1819
1820         /* Cleans up our, hopefully empty, out_of_order_queue. */
1821         __skb_queue_purge(&tp->out_of_order_queue);
1822
1823 #ifdef CONFIG_TCP_MD5SIG
1824         /* Clean up the MD5 key list, if any */
1825         if (tp->md5sig_info) {
1826                 tcp_v4_clear_md5_list(sk);
1827                 kfree(tp->md5sig_info);
1828                 tp->md5sig_info = NULL;
1829         }
1830 #endif
1831
1832 #ifdef CONFIG_NET_DMA
1833         /* Cleans up our sk_async_wait_queue */
1834         __skb_queue_purge(&sk->sk_async_wait_queue);
1835 #endif
1836
1837         /* Clean prequeue, it must be empty really */
1838         __skb_queue_purge(&tp->ucopy.prequeue);
1839
1840         /* Clean up a referenced TCP bind bucket. */
1841         if (inet_csk(sk)->icsk_bind_hash)
1842                 inet_put_port(sk);
1843
1844         /*
1845          * If sendmsg cached page exists, toss it.
1846          */
1847         if (sk->sk_sndmsg_page) {
1848                 __free_page(sk->sk_sndmsg_page);
1849                 sk->sk_sndmsg_page = NULL;
1850         }
1851
1852         atomic_dec(&tcp_sockets_allocated);
1853 }
1854
1855 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1856
1857 #ifdef CONFIG_PROC_FS
1858 /* Proc filesystem TCP sock list dumping. */
1859
1860 static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
1861 {
1862         return hlist_empty(head) ? NULL :
1863                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1864 }
1865
1866 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1867 {
1868         return tw->tw_node.next ?
1869                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1870 }
1871
1872 static void *listening_get_next(struct seq_file *seq, void *cur)
1873 {
1874         struct inet_connection_sock *icsk;
1875         struct hlist_node *node;
1876         struct sock *sk = cur;
1877         struct tcp_iter_state* st = seq->private;
1878         struct net *net = seq_file_net(seq);
1879
1880         if (!sk) {
1881                 st->bucket = 0;
1882                 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1883                 goto get_sk;
1884         }
1885
1886         ++st->num;
1887
1888         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1889                 struct request_sock *req = cur;
1890
1891                 icsk = inet_csk(st->syn_wait_sk);
1892                 req = req->dl_next;
1893                 while (1) {
1894                         while (req) {
1895                                 if (req->rsk_ops->family == st->family) {
1896                                         cur = req;
1897                                         goto out;
1898                                 }
1899                                 req = req->dl_next;
1900                         }
1901                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1902                                 break;
1903 get_req:
1904                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1905                 }
1906                 sk        = sk_next(st->syn_wait_sk);
1907                 st->state = TCP_SEQ_STATE_LISTENING;
1908                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1909         } else {
1910                 icsk = inet_csk(sk);
1911                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1912                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1913                         goto start_req;
1914                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1915                 sk = sk_next(sk);
1916         }
1917 get_sk:
1918         sk_for_each_from(sk, node) {
1919                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1920                         cur = sk;
1921                         goto out;
1922                 }
1923                 icsk = inet_csk(sk);
1924                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1925                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1926 start_req:
1927                         st->uid         = sock_i_uid(sk);
1928                         st->syn_wait_sk = sk;
1929                         st->state       = TCP_SEQ_STATE_OPENREQ;
1930                         st->sbucket     = 0;
1931                         goto get_req;
1932                 }
1933                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1934         }
1935         if (++st->bucket < INET_LHTABLE_SIZE) {
1936                 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1937                 goto get_sk;
1938         }
1939         cur = NULL;
1940 out:
1941         return cur;
1942 }
1943
1944 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1945 {
1946         void *rc = listening_get_next(seq, NULL);
1947
1948         while (rc && *pos) {
1949                 rc = listening_get_next(seq, rc);
1950                 --*pos;
1951         }
1952         return rc;
1953 }
1954
1955 static void *established_get_first(struct seq_file *seq)
1956 {
1957         struct tcp_iter_state* st = seq->private;
1958         struct net *net = seq_file_net(seq);
1959         void *rc = NULL;
1960
1961         for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1962                 struct sock *sk;
1963                 struct hlist_node *node;
1964                 struct inet_timewait_sock *tw;
1965                 rwlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
1966
1967                 read_lock_bh(lock);
1968                 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1969                         if (sk->sk_family != st->family ||
1970                             !net_eq(sock_net(sk), net)) {
1971                                 continue;
1972                         }
1973                         rc = sk;
1974                         goto out;
1975                 }
1976                 st->state = TCP_SEQ_STATE_TIME_WAIT;
1977                 inet_twsk_for_each(tw, node,
1978                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
1979                         if (tw->tw_family != st->family ||
1980                             !net_eq(twsk_net(tw), net)) {
1981                                 continue;
1982                         }
1983                         rc = tw;
1984                         goto out;
1985                 }
1986                 read_unlock_bh(lock);
1987                 st->state = TCP_SEQ_STATE_ESTABLISHED;
1988         }
1989 out:
1990         return rc;
1991 }
1992
1993 static void *established_get_next(struct seq_file *seq, void *cur)
1994 {
1995         struct sock *sk = cur;
1996         struct inet_timewait_sock *tw;
1997         struct hlist_node *node;
1998         struct tcp_iter_state* st = seq->private;
1999         struct net *net = seq_file_net(seq);
2000
2001         ++st->num;
2002
2003         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2004                 tw = cur;
2005                 tw = tw_next(tw);
2006 get_tw:
2007                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2008                         tw = tw_next(tw);
2009                 }
2010                 if (tw) {
2011                         cur = tw;
2012                         goto out;
2013                 }
2014                 read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2015                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2016
2017                 if (++st->bucket < tcp_hashinfo.ehash_size) {
2018                         read_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2019                         sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
2020                 } else {
2021                         cur = NULL;
2022                         goto out;
2023                 }
2024         } else
2025                 sk = sk_next(sk);
2026
2027         sk_for_each_from(sk, node) {
2028                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2029                         goto found;
2030         }
2031
2032         st->state = TCP_SEQ_STATE_TIME_WAIT;
2033         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2034         goto get_tw;
2035 found:
2036         cur = sk;
2037 out:
2038         return cur;
2039 }
2040
2041 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2042 {
2043         void *rc = established_get_first(seq);
2044
2045         while (rc && pos) {
2046                 rc = established_get_next(seq, rc);
2047                 --pos;
2048         }
2049         return rc;
2050 }
2051
2052 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2053 {
2054         void *rc;
2055         struct tcp_iter_state* st = seq->private;
2056
2057         inet_listen_lock(&tcp_hashinfo);
2058         st->state = TCP_SEQ_STATE_LISTENING;
2059         rc        = listening_get_idx(seq, &pos);
2060
2061         if (!rc) {
2062                 inet_listen_unlock(&tcp_hashinfo);
2063                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2064                 rc        = established_get_idx(seq, pos);
2065         }
2066
2067         return rc;
2068 }
2069
2070 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2071 {
2072         struct tcp_iter_state* st = seq->private;
2073         st->state = TCP_SEQ_STATE_LISTENING;
2074         st->num = 0;
2075         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2076 }
2077
2078 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2079 {
2080         void *rc = NULL;
2081         struct tcp_iter_state* st;
2082
2083         if (v == SEQ_START_TOKEN) {
2084                 rc = tcp_get_idx(seq, 0);
2085                 goto out;
2086         }
2087         st = seq->private;
2088
2089         switch (st->state) {
2090         case TCP_SEQ_STATE_OPENREQ:
2091         case TCP_SEQ_STATE_LISTENING:
2092                 rc = listening_get_next(seq, v);
2093                 if (!rc) {
2094                         inet_listen_unlock(&tcp_hashinfo);
2095                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2096                         rc        = established_get_first(seq);
2097                 }
2098                 break;
2099         case TCP_SEQ_STATE_ESTABLISHED:
2100         case TCP_SEQ_STATE_TIME_WAIT:
2101                 rc = established_get_next(seq, v);
2102                 break;
2103         }
2104 out:
2105         ++*pos;
2106         return rc;
2107 }
2108
2109 static void tcp_seq_stop(struct seq_file *seq, void *v)
2110 {
2111         struct tcp_iter_state* st = seq->private;
2112
2113         switch (st->state) {
2114         case TCP_SEQ_STATE_OPENREQ:
2115                 if (v) {
2116                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2117                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2118                 }
2119         case TCP_SEQ_STATE_LISTENING:
2120                 if (v != SEQ_START_TOKEN)
2121                         inet_listen_unlock(&tcp_hashinfo);
2122                 break;
2123         case TCP_SEQ_STATE_TIME_WAIT:
2124         case TCP_SEQ_STATE_ESTABLISHED:
2125                 if (v)
2126                         read_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2127                 break;
2128         }
2129 }
2130
2131 static int tcp_seq_open(struct inode *inode, struct file *file)
2132 {
2133         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2134         struct tcp_iter_state *s;
2135         int err;
2136
2137         err = seq_open_net(inode, file, &afinfo->seq_ops,
2138                           sizeof(struct tcp_iter_state));
2139         if (err < 0)
2140                 return err;
2141
2142         s = ((struct seq_file *)file->private_data)->private;
2143         s->family               = afinfo->family;
2144         return 0;
2145 }
2146
2147 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2148 {
2149         int rc = 0;
2150         struct proc_dir_entry *p;
2151
2152         afinfo->seq_fops.open           = tcp_seq_open;
2153         afinfo->seq_fops.read           = seq_read;
2154         afinfo->seq_fops.llseek         = seq_lseek;
2155         afinfo->seq_fops.release        = seq_release_net;
2156
2157         afinfo->seq_ops.start           = tcp_seq_start;
2158         afinfo->seq_ops.next            = tcp_seq_next;
2159         afinfo->seq_ops.stop            = tcp_seq_stop;
2160
2161         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2162                              &afinfo->seq_fops, afinfo);
2163         if (!p)
2164                 rc = -ENOMEM;
2165         return rc;
2166 }
2167
2168 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2169 {
2170         proc_net_remove(net, afinfo->name);
2171 }
2172
2173 static void get_openreq4(struct sock *sk, struct request_sock *req,
2174                          struct seq_file *f, int i, int uid, int *len)
2175 {
2176         const struct inet_request_sock *ireq = inet_rsk(req);
2177         int ttd = req->expires - jiffies;
2178
2179         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2180                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2181                 i,
2182                 ireq->loc_addr,
2183                 ntohs(inet_sk(sk)->sport),
2184                 ireq->rmt_addr,
2185                 ntohs(ireq->rmt_port),
2186                 TCP_SYN_RECV,
2187                 0, 0, /* could print option size, but that is af dependent. */
2188                 1,    /* timers active (only the expire timer) */
2189                 jiffies_to_clock_t(ttd),
2190                 req->retrans,
2191                 uid,
2192                 0,  /* non standard timer */
2193                 0, /* open_requests have no inode */
2194                 atomic_read(&sk->sk_refcnt),
2195                 req,
2196                 len);
2197 }
2198
2199 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2200 {
2201         int timer_active;
2202         unsigned long timer_expires;
2203         struct tcp_sock *tp = tcp_sk(sk);
2204         const struct inet_connection_sock *icsk = inet_csk(sk);
2205         struct inet_sock *inet = inet_sk(sk);
2206         __be32 dest = inet->daddr;
2207         __be32 src = inet->rcv_saddr;
2208         __u16 destp = ntohs(inet->dport);
2209         __u16 srcp = ntohs(inet->sport);
2210
2211         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2212                 timer_active    = 1;
2213                 timer_expires   = icsk->icsk_timeout;
2214         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2215                 timer_active    = 4;
2216                 timer_expires   = icsk->icsk_timeout;
2217         } else if (timer_pending(&sk->sk_timer)) {
2218                 timer_active    = 2;
2219                 timer_expires   = sk->sk_timer.expires;
2220         } else {
2221                 timer_active    = 0;
2222                 timer_expires = jiffies;
2223         }
2224
2225         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2226                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2227                 i, src, srcp, dest, destp, sk->sk_state,
2228                 tp->write_seq - tp->snd_una,
2229                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2230                                              (tp->rcv_nxt - tp->copied_seq),
2231                 timer_active,
2232                 jiffies_to_clock_t(timer_expires - jiffies),
2233                 icsk->icsk_retransmits,
2234                 sock_i_uid(sk),
2235                 icsk->icsk_probes_out,
2236                 sock_i_ino(sk),
2237                 atomic_read(&sk->sk_refcnt), sk,
2238                 jiffies_to_clock_t(icsk->icsk_rto),
2239                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2240                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2241                 tp->snd_cwnd,
2242                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh,
2243                 len);
2244 }
2245
2246 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2247                                struct seq_file *f, int i, int *len)
2248 {
2249         __be32 dest, src;
2250         __u16 destp, srcp;
2251         int ttd = tw->tw_ttd - jiffies;
2252
2253         if (ttd < 0)
2254                 ttd = 0;
2255
2256         dest  = tw->tw_daddr;
2257         src   = tw->tw_rcv_saddr;
2258         destp = ntohs(tw->tw_dport);
2259         srcp  = ntohs(tw->tw_sport);
2260
2261         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2262                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2263                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2264                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2265                 atomic_read(&tw->tw_refcnt), tw, len);
2266 }
2267
2268 #define TMPSZ 150
2269
2270 static int tcp4_seq_show(struct seq_file *seq, void *v)
2271 {
2272         struct tcp_iter_state* st;
2273         int len;
2274
2275         if (v == SEQ_START_TOKEN) {
2276                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2277                            "  sl  local_address rem_address   st tx_queue "
2278                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2279                            "inode");
2280                 goto out;
2281         }
2282         st = seq->private;
2283
2284         switch (st->state) {
2285         case TCP_SEQ_STATE_LISTENING:
2286         case TCP_SEQ_STATE_ESTABLISHED:
2287                 get_tcp4_sock(v, seq, st->num, &len);
2288                 break;
2289         case TCP_SEQ_STATE_OPENREQ:
2290                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2291                 break;
2292         case TCP_SEQ_STATE_TIME_WAIT:
2293                 get_timewait4_sock(v, seq, st->num, &len);
2294                 break;
2295         }
2296         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2297 out:
2298         return 0;
2299 }
2300
2301 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2302         .name           = "tcp",
2303         .family         = AF_INET,
2304         .seq_fops       = {
2305                 .owner          = THIS_MODULE,
2306         },
2307         .seq_ops        = {
2308                 .show           = tcp4_seq_show,
2309         },
2310 };
2311
2312 static int tcp4_proc_init_net(struct net *net)
2313 {
2314         return tcp_proc_register(net, &tcp4_seq_afinfo);
2315 }
2316
2317 static void tcp4_proc_exit_net(struct net *net)
2318 {
2319         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2320 }
2321
2322 static struct pernet_operations tcp4_net_ops = {
2323         .init = tcp4_proc_init_net,
2324         .exit = tcp4_proc_exit_net,
2325 };
2326
2327 int __init tcp4_proc_init(void)
2328 {
2329         return register_pernet_subsys(&tcp4_net_ops);
2330 }
2331
2332 void tcp4_proc_exit(void)
2333 {
2334         unregister_pernet_subsys(&tcp4_net_ops);
2335 }
2336 #endif /* CONFIG_PROC_FS */
2337
2338 struct proto tcp_prot = {
2339         .name                   = "TCP",
2340         .owner                  = THIS_MODULE,
2341         .close                  = tcp_close,
2342         .connect                = tcp_v4_connect,
2343         .disconnect             = tcp_disconnect,
2344         .accept                 = inet_csk_accept,
2345         .ioctl                  = tcp_ioctl,
2346         .init                   = tcp_v4_init_sock,
2347         .destroy                = tcp_v4_destroy_sock,
2348         .shutdown               = tcp_shutdown,
2349         .setsockopt             = tcp_setsockopt,
2350         .getsockopt             = tcp_getsockopt,
2351         .recvmsg                = tcp_recvmsg,
2352         .backlog_rcv            = tcp_v4_do_rcv,
2353         .hash                   = inet_hash,
2354         .unhash                 = inet_unhash,
2355         .get_port               = inet_csk_get_port,
2356         .enter_memory_pressure  = tcp_enter_memory_pressure,
2357         .sockets_allocated      = &tcp_sockets_allocated,
2358         .orphan_count           = &tcp_orphan_count,
2359         .memory_allocated       = &tcp_memory_allocated,
2360         .memory_pressure        = &tcp_memory_pressure,
2361         .sysctl_mem             = sysctl_tcp_mem,
2362         .sysctl_wmem            = sysctl_tcp_wmem,
2363         .sysctl_rmem            = sysctl_tcp_rmem,
2364         .max_header             = MAX_TCP_HEADER,
2365         .obj_size               = sizeof(struct tcp_sock),
2366         .twsk_prot              = &tcp_timewait_sock_ops,
2367         .rsk_prot               = &tcp_request_sock_ops,
2368         .h.hashinfo             = &tcp_hashinfo,
2369 #ifdef CONFIG_COMPAT
2370         .compat_setsockopt      = compat_tcp_setsockopt,
2371         .compat_getsockopt      = compat_tcp_getsockopt,
2372 #endif
2373 };
2374
2375
2376 static int __net_init tcp_sk_init(struct net *net)
2377 {
2378         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2379                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2380 }
2381
2382 static void __net_exit tcp_sk_exit(struct net *net)
2383 {
2384         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2385 }
2386
2387 static struct pernet_operations __net_initdata tcp_sk_ops = {
2388        .init = tcp_sk_init,
2389        .exit = tcp_sk_exit,
2390 };
2391
2392 void __init tcp_v4_init(void)
2393 {
2394         if (register_pernet_device(&tcp_sk_ops))
2395                 panic("Failed to create the TCP control socket.\n");
2396 }
2397
2398 EXPORT_SYMBOL(ipv4_specific);
2399 EXPORT_SYMBOL(tcp_hashinfo);
2400 EXPORT_SYMBOL(tcp_prot);
2401 EXPORT_SYMBOL(tcp_v4_conn_request);
2402 EXPORT_SYMBOL(tcp_v4_connect);
2403 EXPORT_SYMBOL(tcp_v4_do_rcv);
2404 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2405 EXPORT_SYMBOL(tcp_v4_send_check);
2406 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2407
2408 #ifdef CONFIG_PROC_FS
2409 EXPORT_SYMBOL(tcp_proc_register);
2410 EXPORT_SYMBOL(tcp_proc_unregister);
2411 #endif
2412 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2413