29002ab26e0d3debe3b3945a770756a25c0c7e2b
[linux-2.6.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63
64 #include <net/net_namespace.h>
65 #include <net/icmp.h>
66 #include <net/inet_hashtables.h>
67 #include <net/tcp.h>
68 #include <net/transp_v6.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/timewait_sock.h>
72 #include <net/xfrm.h>
73 #include <net/netdma.h>
74
75 #include <linux/inet.h>
76 #include <linux/ipv6.h>
77 #include <linux/stddef.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80
81 #include <linux/crypto.h>
82 #include <linux/scatterlist.h>
83
84 int sysctl_tcp_tw_reuse __read_mostly;
85 int sysctl_tcp_low_latency __read_mostly;
86
87
88 #ifdef CONFIG_TCP_MD5SIG
89 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
90                                                    __be32 addr);
91 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
92                                __be32 daddr, __be32 saddr, struct tcphdr *th);
93 #else
94 static inline
95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
96 {
97         return NULL;
98 }
99 #endif
100
101 struct inet_hashinfo tcp_hashinfo;
102
103 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
104 {
105         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106                                           ip_hdr(skb)->saddr,
107                                           tcp_hdr(skb)->dest,
108                                           tcp_hdr(skb)->source);
109 }
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         /* With PAWS, it is safe from the viewpoint
117            of data integrity. Even without PAWS it is safe provided sequence
118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119
120            Actually, the idea is close to VJ's one, only timestamp cache is
121            held not per host, but per port pair and TW bucket is used as state
122            holder.
123
124            If TW bucket has been already destroyed we fall back to VJ's scheme
125            and use initial timestamp retrieved from peer table.
126          */
127         if (tcptw->tw_ts_recent_stamp &&
128             (twp == NULL || (sysctl_tcp_tw_reuse &&
129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131                 if (tp->write_seq == 0)
132                         tp->write_seq = 1;
133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135                 sock_hold(sktw);
136                 return 1;
137         }
138
139         return 0;
140 }
141
142 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
143
144 /* This will initiate an outgoing connection. */
145 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
146 {
147         struct inet_sock *inet = inet_sk(sk);
148         struct tcp_sock *tp = tcp_sk(sk);
149         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
150         struct rtable *rt;
151         __be32 daddr, nexthop;
152         int tmp;
153         int err;
154
155         if (addr_len < sizeof(struct sockaddr_in))
156                 return -EINVAL;
157
158         if (usin->sin_family != AF_INET)
159                 return -EAFNOSUPPORT;
160
161         nexthop = daddr = usin->sin_addr.s_addr;
162         if (inet->opt && inet->opt->srr) {
163                 if (!daddr)
164                         return -EINVAL;
165                 nexthop = inet->opt->faddr;
166         }
167
168         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
169                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
170                                IPPROTO_TCP,
171                                inet->inet_sport, usin->sin_port, sk, 1);
172         if (tmp < 0) {
173                 if (tmp == -ENETUNREACH)
174                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
175                 return tmp;
176         }
177
178         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
179                 ip_rt_put(rt);
180                 return -ENETUNREACH;
181         }
182
183         if (!inet->opt || !inet->opt->srr)
184                 daddr = rt->rt_dst;
185
186         if (!inet->inet_saddr)
187                 inet->inet_saddr = rt->rt_src;
188         inet->inet_rcv_saddr = inet->inet_saddr;
189
190         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
191                 /* Reset inherited state */
192                 tp->rx_opt.ts_recent       = 0;
193                 tp->rx_opt.ts_recent_stamp = 0;
194                 tp->write_seq              = 0;
195         }
196
197         if (tcp_death_row.sysctl_tw_recycle &&
198             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
199                 struct inet_peer *peer = rt_get_peer(rt);
200                 /*
201                  * VJ's idea. We save last timestamp seen from
202                  * the destination in peer table, when entering state
203                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
204                  * when trying new connection.
205                  */
206                 if (peer != NULL &&
207                     (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
208                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
209                         tp->rx_opt.ts_recent = peer->tcp_ts;
210                 }
211         }
212
213         inet->inet_dport = usin->sin_port;
214         inet->inet_daddr = daddr;
215
216         inet_csk(sk)->icsk_ext_hdr_len = 0;
217         if (inet->opt)
218                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
219
220         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
221
222         /* Socket identity is still unknown (sport may be zero).
223          * However we set state to SYN-SENT and not releasing socket
224          * lock select source port, enter ourselves into the hash tables and
225          * complete initialization after this.
226          */
227         tcp_set_state(sk, TCP_SYN_SENT);
228         err = inet_hash_connect(&tcp_death_row, sk);
229         if (err)
230                 goto failure;
231
232         err = ip_route_newports(&rt, IPPROTO_TCP,
233                                 inet->inet_sport, inet->inet_dport, sk);
234         if (err)
235                 goto failure;
236
237         /* OK, now commit destination to socket.  */
238         sk->sk_gso_type = SKB_GSO_TCPV4;
239         sk_setup_caps(sk, &rt->u.dst);
240
241         if (!tp->write_seq)
242                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
243                                                            inet->inet_daddr,
244                                                            inet->inet_sport,
245                                                            usin->sin_port);
246
247         inet->inet_id = tp->write_seq ^ jiffies;
248
249         err = tcp_connect(sk);
250         rt = NULL;
251         if (err)
252                 goto failure;
253
254         return 0;
255
256 failure:
257         /*
258          * This unhashes the socket and releases the local port,
259          * if necessary.
260          */
261         tcp_set_state(sk, TCP_CLOSE);
262         ip_rt_put(rt);
263         sk->sk_route_caps = 0;
264         inet->inet_dport = 0;
265         return err;
266 }
267
268 /*
269  * This routine does path mtu discovery as defined in RFC1191.
270  */
271 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
272 {
273         struct dst_entry *dst;
274         struct inet_sock *inet = inet_sk(sk);
275
276         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
277          * send out by Linux are always <576bytes so they should go through
278          * unfragmented).
279          */
280         if (sk->sk_state == TCP_LISTEN)
281                 return;
282
283         /* We don't check in the destentry if pmtu discovery is forbidden
284          * on this route. We just assume that no packet_to_big packets
285          * are send back when pmtu discovery is not active.
286          * There is a small race when the user changes this flag in the
287          * route, but I think that's acceptable.
288          */
289         if ((dst = __sk_dst_check(sk, 0)) == NULL)
290                 return;
291
292         dst->ops->update_pmtu(dst, mtu);
293
294         /* Something is about to be wrong... Remember soft error
295          * for the case, if this connection will not able to recover.
296          */
297         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
298                 sk->sk_err_soft = EMSGSIZE;
299
300         mtu = dst_mtu(dst);
301
302         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
303             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
304                 tcp_sync_mss(sk, mtu);
305
306                 /* Resend the TCP packet because it's
307                  * clear that the old packet has been
308                  * dropped. This is the new "fast" path mtu
309                  * discovery.
310                  */
311                 tcp_simple_retransmit(sk);
312         } /* else let the usual retransmit timer handle it */
313 }
314
315 /*
316  * This routine is called by the ICMP module when it gets some
317  * sort of error condition.  If err < 0 then the socket should
318  * be closed and the error returned to the user.  If err > 0
319  * it's just the icmp type << 8 | icmp code.  After adjustment
320  * header points to the first 8 bytes of the tcp header.  We need
321  * to find the appropriate port.
322  *
323  * The locking strategy used here is very "optimistic". When
324  * someone else accesses the socket the ICMP is just dropped
325  * and for some paths there is no check at all.
326  * A more general error queue to queue errors for later handling
327  * is probably better.
328  *
329  */
330
331 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
332 {
333         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
334         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
335         struct inet_connection_sock *icsk;
336         struct tcp_sock *tp;
337         struct inet_sock *inet;
338         const int type = icmp_hdr(icmp_skb)->type;
339         const int code = icmp_hdr(icmp_skb)->code;
340         struct sock *sk;
341         struct sk_buff *skb;
342         __u32 seq;
343         __u32 remaining;
344         int err;
345         struct net *net = dev_net(icmp_skb->dev);
346
347         if (icmp_skb->len < (iph->ihl << 2) + 8) {
348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
349                 return;
350         }
351
352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
353                         iph->saddr, th->source, inet_iif(icmp_skb));
354         if (!sk) {
355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
356                 return;
357         }
358         if (sk->sk_state == TCP_TIME_WAIT) {
359                 inet_twsk_put(inet_twsk(sk));
360                 return;
361         }
362
363         bh_lock_sock(sk);
364         /* If too many ICMPs get dropped on busy
365          * servers this needs to be solved differently.
366          */
367         if (sock_owned_by_user(sk))
368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
369
370         if (sk->sk_state == TCP_CLOSE)
371                 goto out;
372
373         icsk = inet_csk(sk);
374         tp = tcp_sk(sk);
375         seq = ntohl(th->seq);
376         if (sk->sk_state != TCP_LISTEN &&
377             !between(seq, tp->snd_una, tp->snd_nxt)) {
378                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
379                 goto out;
380         }
381
382         switch (type) {
383         case ICMP_SOURCE_QUENCH:
384                 /* Just silently ignore these. */
385                 goto out;
386         case ICMP_PARAMETERPROB:
387                 err = EPROTO;
388                 break;
389         case ICMP_DEST_UNREACH:
390                 if (code > NR_ICMP_UNREACH)
391                         goto out;
392
393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
394                         if (!sock_owned_by_user(sk))
395                                 do_pmtu_discovery(sk, iph, info);
396                         goto out;
397                 }
398
399                 err = icmp_err_convert[code].errno;
400                 /* check if icmp_skb allows revert of backoff
401                  * (see draft-zimmermann-tcp-lcd) */
402                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
403                         break;
404                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
405                     !icsk->icsk_backoff)
406                         break;
407
408                 icsk->icsk_backoff--;
409                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
410                                          icsk->icsk_backoff;
411                 tcp_bound_rto(sk);
412
413                 skb = tcp_write_queue_head(sk);
414                 BUG_ON(!skb);
415
416                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
417                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
418
419                 if (remaining) {
420                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
421                                                   remaining, TCP_RTO_MAX);
422                 } else if (sock_owned_by_user(sk)) {
423                         /* RTO revert clocked out retransmission,
424                          * but socket is locked. Will defer. */
425                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
426                                                   HZ/20, TCP_RTO_MAX);
427                 } else {
428                         /* RTO revert clocked out retransmission.
429                          * Will retransmit now */
430                         tcp_retransmit_timer(sk);
431                 }
432
433                 break;
434         case ICMP_TIME_EXCEEDED:
435                 err = EHOSTUNREACH;
436                 break;
437         default:
438                 goto out;
439         }
440
441         switch (sk->sk_state) {
442                 struct request_sock *req, **prev;
443         case TCP_LISTEN:
444                 if (sock_owned_by_user(sk))
445                         goto out;
446
447                 req = inet_csk_search_req(sk, &prev, th->dest,
448                                           iph->daddr, iph->saddr);
449                 if (!req)
450                         goto out;
451
452                 /* ICMPs are not backlogged, hence we cannot get
453                    an established socket here.
454                  */
455                 WARN_ON(req->sk);
456
457                 if (seq != tcp_rsk(req)->snt_isn) {
458                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
459                         goto out;
460                 }
461
462                 /*
463                  * Still in SYN_RECV, just remove it silently.
464                  * There is no good way to pass the error to the newly
465                  * created socket, and POSIX does not want network
466                  * errors returned from accept().
467                  */
468                 inet_csk_reqsk_queue_drop(sk, req, prev);
469                 goto out;
470
471         case TCP_SYN_SENT:
472         case TCP_SYN_RECV:  /* Cannot happen.
473                                It can f.e. if SYNs crossed.
474                              */
475                 if (!sock_owned_by_user(sk)) {
476                         sk->sk_err = err;
477
478                         sk->sk_error_report(sk);
479
480                         tcp_done(sk);
481                 } else {
482                         sk->sk_err_soft = err;
483                 }
484                 goto out;
485         }
486
487         /* If we've already connected we will keep trying
488          * until we time out, or the user gives up.
489          *
490          * rfc1122 4.2.3.9 allows to consider as hard errors
491          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
492          * but it is obsoleted by pmtu discovery).
493          *
494          * Note, that in modern internet, where routing is unreliable
495          * and in each dark corner broken firewalls sit, sending random
496          * errors ordered by their masters even this two messages finally lose
497          * their original sense (even Linux sends invalid PORT_UNREACHs)
498          *
499          * Now we are in compliance with RFCs.
500          *                                                      --ANK (980905)
501          */
502
503         inet = inet_sk(sk);
504         if (!sock_owned_by_user(sk) && inet->recverr) {
505                 sk->sk_err = err;
506                 sk->sk_error_report(sk);
507         } else  { /* Only an error on timeout */
508                 sk->sk_err_soft = err;
509         }
510
511 out:
512         bh_unlock_sock(sk);
513         sock_put(sk);
514 }
515
516 /* This routine computes an IPv4 TCP checksum. */
517 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
518 {
519         struct inet_sock *inet = inet_sk(sk);
520         struct tcphdr *th = tcp_hdr(skb);
521
522         if (skb->ip_summed == CHECKSUM_PARTIAL) {
523                 th->check = ~tcp_v4_check(len, inet->inet_saddr,
524                                           inet->inet_daddr, 0);
525                 skb->csum_start = skb_transport_header(skb) - skb->head;
526                 skb->csum_offset = offsetof(struct tcphdr, check);
527         } else {
528                 th->check = tcp_v4_check(len, inet->inet_saddr,
529                                          inet->inet_daddr,
530                                          csum_partial(th,
531                                                       th->doff << 2,
532                                                       skb->csum));
533         }
534 }
535
536 int tcp_v4_gso_send_check(struct sk_buff *skb)
537 {
538         const struct iphdr *iph;
539         struct tcphdr *th;
540
541         if (!pskb_may_pull(skb, sizeof(*th)))
542                 return -EINVAL;
543
544         iph = ip_hdr(skb);
545         th = tcp_hdr(skb);
546
547         th->check = 0;
548         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
549         skb->csum_start = skb_transport_header(skb) - skb->head;
550         skb->csum_offset = offsetof(struct tcphdr, check);
551         skb->ip_summed = CHECKSUM_PARTIAL;
552         return 0;
553 }
554
555 /*
556  *      This routine will send an RST to the other tcp.
557  *
558  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
559  *                    for reset.
560  *      Answer: if a packet caused RST, it is not for a socket
561  *              existing in our system, if it is matched to a socket,
562  *              it is just duplicate segment or bug in other side's TCP.
563  *              So that we build reply only basing on parameters
564  *              arrived with segment.
565  *      Exception: precedence violation. We do not implement it in any case.
566  */
567
568 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
569 {
570         struct tcphdr *th = tcp_hdr(skb);
571         struct {
572                 struct tcphdr th;
573 #ifdef CONFIG_TCP_MD5SIG
574                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
575 #endif
576         } rep;
577         struct ip_reply_arg arg;
578 #ifdef CONFIG_TCP_MD5SIG
579         struct tcp_md5sig_key *key;
580 #endif
581         struct net *net;
582
583         /* Never send a reset in response to a reset. */
584         if (th->rst)
585                 return;
586
587         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
588                 return;
589
590         /* Swap the send and the receive. */
591         memset(&rep, 0, sizeof(rep));
592         rep.th.dest   = th->source;
593         rep.th.source = th->dest;
594         rep.th.doff   = sizeof(struct tcphdr) / 4;
595         rep.th.rst    = 1;
596
597         if (th->ack) {
598                 rep.th.seq = th->ack_seq;
599         } else {
600                 rep.th.ack = 1;
601                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
602                                        skb->len - (th->doff << 2));
603         }
604
605         memset(&arg, 0, sizeof(arg));
606         arg.iov[0].iov_base = (unsigned char *)&rep;
607         arg.iov[0].iov_len  = sizeof(rep.th);
608
609 #ifdef CONFIG_TCP_MD5SIG
610         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
611         if (key) {
612                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
613                                    (TCPOPT_NOP << 16) |
614                                    (TCPOPT_MD5SIG << 8) |
615                                    TCPOLEN_MD5SIG);
616                 /* Update length and the length the header thinks exists */
617                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
618                 rep.th.doff = arg.iov[0].iov_len / 4;
619
620                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
621                                      key, ip_hdr(skb)->saddr,
622                                      ip_hdr(skb)->daddr, &rep.th);
623         }
624 #endif
625         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
626                                       ip_hdr(skb)->saddr, /* XXX */
627                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
628         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
629         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
630
631         net = dev_net(skb_dst(skb)->dev);
632         ip_send_reply(net->ipv4.tcp_sock, skb,
633                       &arg, arg.iov[0].iov_len);
634
635         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
636         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
637 }
638
639 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
640    outside socket context is ugly, certainly. What can I do?
641  */
642
643 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
644                             u32 win, u32 ts, int oif,
645                             struct tcp_md5sig_key *key,
646                             int reply_flags)
647 {
648         struct tcphdr *th = tcp_hdr(skb);
649         struct {
650                 struct tcphdr th;
651                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
652 #ifdef CONFIG_TCP_MD5SIG
653                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
654 #endif
655                         ];
656         } rep;
657         struct ip_reply_arg arg;
658         struct net *net = dev_net(skb_dst(skb)->dev);
659
660         memset(&rep.th, 0, sizeof(struct tcphdr));
661         memset(&arg, 0, sizeof(arg));
662
663         arg.iov[0].iov_base = (unsigned char *)&rep;
664         arg.iov[0].iov_len  = sizeof(rep.th);
665         if (ts) {
666                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
667                                    (TCPOPT_TIMESTAMP << 8) |
668                                    TCPOLEN_TIMESTAMP);
669                 rep.opt[1] = htonl(tcp_time_stamp);
670                 rep.opt[2] = htonl(ts);
671                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
672         }
673
674         /* Swap the send and the receive. */
675         rep.th.dest    = th->source;
676         rep.th.source  = th->dest;
677         rep.th.doff    = arg.iov[0].iov_len / 4;
678         rep.th.seq     = htonl(seq);
679         rep.th.ack_seq = htonl(ack);
680         rep.th.ack     = 1;
681         rep.th.window  = htons(win);
682
683 #ifdef CONFIG_TCP_MD5SIG
684         if (key) {
685                 int offset = (ts) ? 3 : 0;
686
687                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
688                                           (TCPOPT_NOP << 16) |
689                                           (TCPOPT_MD5SIG << 8) |
690                                           TCPOLEN_MD5SIG);
691                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
692                 rep.th.doff = arg.iov[0].iov_len/4;
693
694                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
695                                     key, ip_hdr(skb)->saddr,
696                                     ip_hdr(skb)->daddr, &rep.th);
697         }
698 #endif
699         arg.flags = reply_flags;
700         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
701                                       ip_hdr(skb)->saddr, /* XXX */
702                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
703         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
704         if (oif)
705                 arg.bound_dev_if = oif;
706
707         ip_send_reply(net->ipv4.tcp_sock, skb,
708                       &arg, arg.iov[0].iov_len);
709
710         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
711 }
712
713 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
714 {
715         struct inet_timewait_sock *tw = inet_twsk(sk);
716         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
717
718         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
719                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
720                         tcptw->tw_ts_recent,
721                         tw->tw_bound_dev_if,
722                         tcp_twsk_md5_key(tcptw),
723                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
724                         );
725
726         inet_twsk_put(tw);
727 }
728
729 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
730                                   struct request_sock *req)
731 {
732         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
733                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
734                         req->ts_recent,
735                         0,
736                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
737                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
738 }
739
740 /*
741  *      Send a SYN-ACK after having received a SYN.
742  *      This still operates on a request_sock only, not on a big
743  *      socket.
744  */
745 static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
746                                 struct request_sock *req,
747                                 struct request_values *rvp)
748 {
749         const struct inet_request_sock *ireq = inet_rsk(req);
750         int err = -1;
751         struct sk_buff * skb;
752
753         /* First, grab a route. */
754         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
755                 return -1;
756
757         skb = tcp_make_synack(sk, dst, req, rvp);
758
759         if (skb) {
760                 struct tcphdr *th = tcp_hdr(skb);
761
762                 th->check = tcp_v4_check(skb->len,
763                                          ireq->loc_addr,
764                                          ireq->rmt_addr,
765                                          csum_partial(th, skb->len,
766                                                       skb->csum));
767
768                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
769                                             ireq->rmt_addr,
770                                             ireq->opt);
771                 err = net_xmit_eval(err);
772         }
773
774         dst_release(dst);
775         return err;
776 }
777
778 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
779                               struct request_values *rvp)
780 {
781         return __tcp_v4_send_synack(sk, NULL, req, rvp);
782 }
783
784 /*
785  *      IPv4 request_sock destructor.
786  */
787 static void tcp_v4_reqsk_destructor(struct request_sock *req)
788 {
789         kfree(inet_rsk(req)->opt);
790 }
791
#ifdef CONFIG_SYN_COOKIES
/*
 * Log a "possible SYN flooding" message naming the destination port of
 * @skb.  The message is emitted only when more than 60 seconds' worth of
 * jiffies have passed since the time recorded at the previous message.
 */
static void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (!time_after(jiffies, (warntime + HZ * 60)))
		return;

	warntime = jiffies;
	printk(KERN_INFO
	       "possible SYN flooding on port %d. Sending cookies.\n",
	       ntohs(tcp_hdr(skb)->dest));
}
#endif
805
806 /*
807  * Save and compile IPv4 options into the request_sock if needed.
808  */
809 static struct ip_options *tcp_v4_save_options(struct sock *sk,
810                                               struct sk_buff *skb)
811 {
812         struct ip_options *opt = &(IPCB(skb)->opt);
813         struct ip_options *dopt = NULL;
814
815         if (opt && opt->optlen) {
816                 int opt_size = optlength(opt);
817                 dopt = kmalloc(opt_size, GFP_ATOMIC);
818                 if (dopt) {
819                         if (ip_options_echo(dopt, skb)) {
820                                 kfree(dopt);
821                                 dopt = NULL;
822                         }
823                 }
824         }
825         return dopt;
826 }
827
828 #ifdef CONFIG_TCP_MD5SIG
829 /*
830  * RFC2385 MD5 checksumming requires a mapping of
831  * IP address->MD5 Key.
832  * We need to maintain these in the sk structure.
833  */
834
835 /* Find the Key structure for an address.  */
836 static struct tcp_md5sig_key *
837                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
838 {
839         struct tcp_sock *tp = tcp_sk(sk);
840         int i;
841
842         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
843                 return NULL;
844         for (i = 0; i < tp->md5sig_info->entries4; i++) {
845                 if (tp->md5sig_info->keys4[i].addr == addr)
846                         return &tp->md5sig_info->keys4[i].base;
847         }
848         return NULL;
849 }
850
851 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
852                                          struct sock *addr_sk)
853 {
854         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
855 }
856
857 EXPORT_SYMBOL(tcp_v4_md5_lookup);
858
859 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
860                                                       struct request_sock *req)
861 {
862         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
863 }
864
865 /* This can be called on a newly created socket, from other files */
866 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
867                       u8 *newkey, u8 newkeylen)
868 {
869         /* Add Key to the list */
870         struct tcp_md5sig_key *key;
871         struct tcp_sock *tp = tcp_sk(sk);
872         struct tcp4_md5sig_key *keys;
873
874         key = tcp_v4_md5_do_lookup(sk, addr);
875         if (key) {
876                 /* Pre-existing entry - just update that one. */
877                 kfree(key->key);
878                 key->key = newkey;
879                 key->keylen = newkeylen;
880         } else {
881                 struct tcp_md5sig_info *md5sig;
882
883                 if (!tp->md5sig_info) {
884                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
885                                                   GFP_ATOMIC);
886                         if (!tp->md5sig_info) {
887                                 kfree(newkey);
888                                 return -ENOMEM;
889                         }
890                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
891                 }
892                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
893                         kfree(newkey);
894                         return -ENOMEM;
895                 }
896                 md5sig = tp->md5sig_info;
897
898                 if (md5sig->alloced4 == md5sig->entries4) {
899                         keys = kmalloc((sizeof(*keys) *
900                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
901                         if (!keys) {
902                                 kfree(newkey);
903                                 tcp_free_md5sig_pool();
904                                 return -ENOMEM;
905                         }
906
907                         if (md5sig->entries4)
908                                 memcpy(keys, md5sig->keys4,
909                                        sizeof(*keys) * md5sig->entries4);
910
911                         /* Free old key list, and reference new one */
912                         kfree(md5sig->keys4);
913                         md5sig->keys4 = keys;
914                         md5sig->alloced4++;
915                 }
916                 md5sig->entries4++;
917                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
918                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
919                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
920         }
921         return 0;
922 }
923
924 EXPORT_SYMBOL(tcp_v4_md5_do_add);
925
926 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
927                                u8 *newkey, u8 newkeylen)
928 {
929         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
930                                  newkey, newkeylen);
931 }
932
933 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
934 {
935         struct tcp_sock *tp = tcp_sk(sk);
936         int i;
937
938         for (i = 0; i < tp->md5sig_info->entries4; i++) {
939                 if (tp->md5sig_info->keys4[i].addr == addr) {
940                         /* Free the key */
941                         kfree(tp->md5sig_info->keys4[i].base.key);
942                         tp->md5sig_info->entries4--;
943
944                         if (tp->md5sig_info->entries4 == 0) {
945                                 kfree(tp->md5sig_info->keys4);
946                                 tp->md5sig_info->keys4 = NULL;
947                                 tp->md5sig_info->alloced4 = 0;
948                         } else if (tp->md5sig_info->entries4 != i) {
949                                 /* Need to do some manipulation */
950                                 memmove(&tp->md5sig_info->keys4[i],
951                                         &tp->md5sig_info->keys4[i+1],
952                                         (tp->md5sig_info->entries4 - i) *
953                                          sizeof(struct tcp4_md5sig_key));
954                         }
955                         tcp_free_md5sig_pool();
956                         return 0;
957                 }
958         }
959         return -ENOENT;
960 }
961
962 EXPORT_SYMBOL(tcp_v4_md5_do_del);
963
964 static void tcp_v4_clear_md5_list(struct sock *sk)
965 {
966         struct tcp_sock *tp = tcp_sk(sk);
967
968         /* Free each key, then the set of key keys,
969          * the crypto element, and then decrement our
970          * hold on the last resort crypto.
971          */
972         if (tp->md5sig_info->entries4) {
973                 int i;
974                 for (i = 0; i < tp->md5sig_info->entries4; i++)
975                         kfree(tp->md5sig_info->keys4[i].base.key);
976                 tp->md5sig_info->entries4 = 0;
977                 tcp_free_md5sig_pool();
978         }
979         if (tp->md5sig_info->keys4) {
980                 kfree(tp->md5sig_info->keys4);
981                 tp->md5sig_info->keys4 = NULL;
982                 tp->md5sig_info->alloced4  = 0;
983         }
984 }
985
986 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
987                                  int optlen)
988 {
989         struct tcp_md5sig cmd;
990         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
991         u8 *newkey;
992
993         if (optlen < sizeof(cmd))
994                 return -EINVAL;
995
996         if (copy_from_user(&cmd, optval, sizeof(cmd)))
997                 return -EFAULT;
998
999         if (sin->sin_family != AF_INET)
1000                 return -EINVAL;
1001
1002         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1003                 if (!tcp_sk(sk)->md5sig_info)
1004                         return -ENOENT;
1005                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1006         }
1007
1008         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1009                 return -EINVAL;
1010
1011         if (!tcp_sk(sk)->md5sig_info) {
1012                 struct tcp_sock *tp = tcp_sk(sk);
1013                 struct tcp_md5sig_info *p;
1014
1015                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1016                 if (!p)
1017                         return -EINVAL;
1018
1019                 tp->md5sig_info = p;
1020                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1021         }
1022
1023         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1024         if (!newkey)
1025                 return -ENOMEM;
1026         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1027                                  newkey, cmd.tcpm_keylen);
1028 }
1029
1030 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1031                                         __be32 daddr, __be32 saddr, int nbytes)
1032 {
1033         struct tcp4_pseudohdr *bp;
1034         struct scatterlist sg;
1035
1036         bp = &hp->md5_blk.ip4;
1037
1038         /*
1039          * 1. the TCP pseudo-header (in the order: source IP address,
1040          * destination IP address, zero-padded protocol number, and
1041          * segment length)
1042          */
1043         bp->saddr = saddr;
1044         bp->daddr = daddr;
1045         bp->pad = 0;
1046         bp->protocol = IPPROTO_TCP;
1047         bp->len = cpu_to_be16(nbytes);
1048
1049         sg_init_one(&sg, bp, sizeof(*bp));
1050         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1051 }
1052
1053 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1054                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1055 {
1056         struct tcp_md5sig_pool *hp;
1057         struct hash_desc *desc;
1058
1059         hp = tcp_get_md5sig_pool();
1060         if (!hp)
1061                 goto clear_hash_noput;
1062         desc = &hp->md5_desc;
1063
1064         if (crypto_hash_init(desc))
1065                 goto clear_hash;
1066         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1067                 goto clear_hash;
1068         if (tcp_md5_hash_header(hp, th))
1069                 goto clear_hash;
1070         if (tcp_md5_hash_key(hp, key))
1071                 goto clear_hash;
1072         if (crypto_hash_final(desc, md5_hash))
1073                 goto clear_hash;
1074
1075         tcp_put_md5sig_pool();
1076         return 0;
1077
1078 clear_hash:
1079         tcp_put_md5sig_pool();
1080 clear_hash_noput:
1081         memset(md5_hash, 0, 16);
1082         return 1;
1083 }
1084
1085 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1086                         struct sock *sk, struct request_sock *req,
1087                         struct sk_buff *skb)
1088 {
1089         struct tcp_md5sig_pool *hp;
1090         struct hash_desc *desc;
1091         struct tcphdr *th = tcp_hdr(skb);
1092         __be32 saddr, daddr;
1093
1094         if (sk) {
1095                 saddr = inet_sk(sk)->inet_saddr;
1096                 daddr = inet_sk(sk)->inet_daddr;
1097         } else if (req) {
1098                 saddr = inet_rsk(req)->loc_addr;
1099                 daddr = inet_rsk(req)->rmt_addr;
1100         } else {
1101                 const struct iphdr *iph = ip_hdr(skb);
1102                 saddr = iph->saddr;
1103                 daddr = iph->daddr;
1104         }
1105
1106         hp = tcp_get_md5sig_pool();
1107         if (!hp)
1108                 goto clear_hash_noput;
1109         desc = &hp->md5_desc;
1110
1111         if (crypto_hash_init(desc))
1112                 goto clear_hash;
1113
1114         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1115                 goto clear_hash;
1116         if (tcp_md5_hash_header(hp, th))
1117                 goto clear_hash;
1118         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1119                 goto clear_hash;
1120         if (tcp_md5_hash_key(hp, key))
1121                 goto clear_hash;
1122         if (crypto_hash_final(desc, md5_hash))
1123                 goto clear_hash;
1124
1125         tcp_put_md5sig_pool();
1126         return 0;
1127
1128 clear_hash:
1129         tcp_put_md5sig_pool();
1130 clear_hash_noput:
1131         memset(md5_hash, 0, 16);
1132         return 1;
1133 }
1134
1135 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1136
1137 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1138 {
1139         /*
1140          * This gets called for each TCP segment that arrives
1141          * so we want to be efficient.
1142          * We have 3 drop cases:
1143          * o No MD5 hash and one expected.
1144          * o MD5 hash and we're not expecting one.
1145          * o MD5 hash and its wrong.
1146          */
1147         __u8 *hash_location = NULL;
1148         struct tcp_md5sig_key *hash_expected;
1149         const struct iphdr *iph = ip_hdr(skb);
1150         struct tcphdr *th = tcp_hdr(skb);
1151         int genhash;
1152         unsigned char newhash[16];
1153
1154         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1155         hash_location = tcp_parse_md5sig_option(th);
1156
1157         /* We've parsed the options - do we have a hash? */
1158         if (!hash_expected && !hash_location)
1159                 return 0;
1160
1161         if (hash_expected && !hash_location) {
1162                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1163                 return 1;
1164         }
1165
1166         if (!hash_expected && hash_location) {
1167                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1168                 return 1;
1169         }
1170
1171         /* Okay, so this is hash_expected and hash_location -
1172          * so we need to calculate the checksum.
1173          */
1174         genhash = tcp_v4_md5_hash_skb(newhash,
1175                                       hash_expected,
1176                                       NULL, NULL, skb);
1177
1178         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1179                 if (net_ratelimit()) {
1180                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1181                                &iph->saddr, ntohs(th->source),
1182                                &iph->daddr, ntohs(th->dest),
1183                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1184                 }
1185                 return 1;
1186         }
1187         return 0;
1188 }
1189
1190 #endif
1191
1192 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1193         .family         =       PF_INET,
1194         .obj_size       =       sizeof(struct tcp_request_sock),
1195         .rtx_syn_ack    =       tcp_v4_send_synack,
1196         .send_ack       =       tcp_v4_reqsk_send_ack,
1197         .destructor     =       tcp_v4_reqsk_destructor,
1198         .send_reset     =       tcp_v4_send_reset,
1199 };
1200
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 signature hooks attached to TCP request socks. */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.md5_lookup    = tcp_v4_reqsk_md5_lookup,
	.calc_md5_hash = tcp_v4_md5_hash_skb,
};
#endif
1207
1208 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1209         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1210         .twsk_unique    = tcp_twsk_unique,
1211         .twsk_destructor= tcp_twsk_destructor,
1212 };
1213
1214 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1215 {
1216         struct tcp_extend_values tmp_ext;
1217         struct tcp_options_received tmp_opt;
1218         u8 *hash_location;
1219         struct request_sock *req;
1220         struct inet_request_sock *ireq;
1221         struct tcp_sock *tp = tcp_sk(sk);
1222         struct dst_entry *dst = NULL;
1223         __be32 saddr = ip_hdr(skb)->saddr;
1224         __be32 daddr = ip_hdr(skb)->daddr;
1225         __u32 isn = TCP_SKB_CB(skb)->when;
1226 #ifdef CONFIG_SYN_COOKIES
1227         int want_cookie = 0;
1228 #else
1229 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1230 #endif
1231
1232         /* Never answer to SYNs send to broadcast or multicast */
1233         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1234                 goto drop;
1235
1236         /* TW buckets are converted to open requests without
1237          * limitations, they conserve resources and peer is
1238          * evidently real one.
1239          */
1240         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1241 #ifdef CONFIG_SYN_COOKIES
1242                 if (sysctl_tcp_syncookies) {
1243                         want_cookie = 1;
1244                 } else
1245 #endif
1246                 goto drop;
1247         }
1248
1249         /* Accept backlog is full. If we have already queued enough
1250          * of warm entries in syn queue, drop request. It is better than
1251          * clogging syn queue with openreqs with exponentially increasing
1252          * timeout.
1253          */
1254         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1255                 goto drop;
1256
1257         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1258         if (!req)
1259                 goto drop;
1260
1261 #ifdef CONFIG_TCP_MD5SIG
1262         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1263 #endif
1264
1265         ireq = inet_rsk(req);
1266         ireq->loc_addr = daddr;
1267         ireq->rmt_addr = saddr;
1268         ireq->no_srccheck = inet_sk(sk)->transparent;
1269         ireq->opt = tcp_v4_save_options(sk, skb);
1270
1271         dst = inet_csk_route_req(sk, req);
1272         if(!dst)
1273                 goto drop_and_free;
1274
1275         tcp_clear_options(&tmp_opt);
1276         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1277         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1278         tcp_parse_options(skb, &tmp_opt, &hash_location, 0, dst);
1279
1280         if (tmp_opt.cookie_plus > 0 &&
1281             tmp_opt.saw_tstamp &&
1282             !tp->rx_opt.cookie_out_never &&
1283             (sysctl_tcp_cookie_size > 0 ||
1284              (tp->cookie_values != NULL &&
1285               tp->cookie_values->cookie_desired > 0))) {
1286                 u8 *c;
1287                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1288                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1289
1290                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1291                         goto drop_and_release;
1292
1293                 /* Secret recipe starts with IP addresses */
1294                 *mess++ ^= daddr;
1295                 *mess++ ^= saddr;
1296
1297                 /* plus variable length Initiator Cookie */
1298                 c = (u8 *)mess;
1299                 while (l-- > 0)
1300                         *c++ ^= *hash_location++;
1301
1302 #ifdef CONFIG_SYN_COOKIES
1303                 want_cookie = 0;        /* not our kind of cookie */
1304 #endif
1305                 tmp_ext.cookie_out_never = 0; /* false */
1306                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1307         } else if (!tp->rx_opt.cookie_in_always) {
1308                 /* redundant indications, but ensure initialization. */
1309                 tmp_ext.cookie_out_never = 1; /* true */
1310                 tmp_ext.cookie_plus = 0;
1311         } else {
1312                 goto drop_and_release;
1313         }
1314         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1315
1316         if (want_cookie && !tmp_opt.saw_tstamp)
1317                 tcp_clear_options(&tmp_opt);
1318
1319         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1320         tcp_openreq_init(req, &tmp_opt, skb);
1321
1322         if (security_inet_conn_request(sk, skb, req))
1323                 goto drop_and_release;
1324
1325         if (!want_cookie)
1326                 TCP_ECN_create_request(req, tcp_hdr(skb));
1327
1328         if (want_cookie) {
1329 #ifdef CONFIG_SYN_COOKIES
1330                 syn_flood_warning(skb);
1331                 req->cookie_ts = tmp_opt.tstamp_ok;
1332 #endif
1333                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1334         } else if (!isn) {
1335                 struct inet_peer *peer = NULL;
1336
1337                 /* VJ's idea. We save last timestamp seen
1338                  * from the destination in peer table, when entering
1339                  * state TIME-WAIT, and check against it before
1340                  * accepting new connection request.
1341                  *
1342                  * If "isn" is not zero, this request hit alive
1343                  * timewait bucket, so that all the necessary checks
1344                  * are made in the function processing timewait state.
1345                  */
1346                 if (tmp_opt.saw_tstamp &&
1347                     tcp_death_row.sysctl_tw_recycle &&
1348                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1349                     peer->v4daddr == saddr) {
1350                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1351                             (s32)(peer->tcp_ts - req->ts_recent) >
1352                                                         TCP_PAWS_WINDOW) {
1353                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1354                                 goto drop_and_release;
1355                         }
1356                 }
1357                 /* Kill the following clause, if you dislike this way. */
1358                 else if (!sysctl_tcp_syncookies &&
1359                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1360                           (sysctl_max_syn_backlog >> 2)) &&
1361                          (!peer || !peer->tcp_ts_stamp) &&
1362                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1363                         /* Without syncookies last quarter of
1364                          * backlog is filled with destinations,
1365                          * proven to be alive.
1366                          * It means that we continue to communicate
1367                          * to destinations, already remembered
1368                          * to the moment of synflood.
1369                          */
1370                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1371                                        &saddr, ntohs(tcp_hdr(skb)->source));
1372                         goto drop_and_release;
1373                 }
1374
1375                 isn = tcp_v4_init_sequence(skb);
1376         }
1377         tcp_rsk(req)->snt_isn = isn;
1378
1379         if (__tcp_v4_send_synack(sk, dst, req,
1380                                  (struct request_values *)&tmp_ext) ||
1381             want_cookie)
1382                 goto drop_and_free;
1383
1384         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1385         return 0;
1386
1387 drop_and_release:
1388         dst_release(dst);
1389 drop_and_free:
1390         reqsk_free(req);
1391 drop:
1392         return 0;
1393 }
1394
1395
1396 /*
1397  * The three way handshake has completed - we got a valid synack -
1398  * now create the new socket.
1399  */
1400 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1401                                   struct request_sock *req,
1402                                   struct dst_entry *dst)
1403 {
1404         struct inet_request_sock *ireq;
1405         struct inet_sock *newinet;
1406         struct tcp_sock *newtp;
1407         struct sock *newsk;
1408 #ifdef CONFIG_TCP_MD5SIG
1409         struct tcp_md5sig_key *key;
1410 #endif
1411
1412         if (sk_acceptq_is_full(sk))
1413                 goto exit_overflow;
1414
1415         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1416                 goto exit;
1417
1418         newsk = tcp_create_openreq_child(sk, req, skb);
1419         if (!newsk)
1420                 goto exit;
1421
1422         newsk->sk_gso_type = SKB_GSO_TCPV4;
1423         sk_setup_caps(newsk, dst);
1424
1425         newtp                 = tcp_sk(newsk);
1426         newinet               = inet_sk(newsk);
1427         ireq                  = inet_rsk(req);
1428         newinet->inet_daddr   = ireq->rmt_addr;
1429         newinet->inet_rcv_saddr = ireq->loc_addr;
1430         newinet->inet_saddr           = ireq->loc_addr;
1431         newinet->opt          = ireq->opt;
1432         ireq->opt             = NULL;
1433         newinet->mc_index     = inet_iif(skb);
1434         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1435         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1436         if (newinet->opt)
1437                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1438         newinet->inet_id = newtp->write_seq ^ jiffies;
1439
1440         tcp_mtup_init(newsk);
1441         tcp_sync_mss(newsk, dst_mtu(dst));
1442         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1443         if (tcp_sk(sk)->rx_opt.user_mss &&
1444             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1445                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1446
1447         tcp_initialize_rcv_mss(newsk);
1448
1449 #ifdef CONFIG_TCP_MD5SIG
1450         /* Copy over the MD5 key from the original socket */
1451         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1452         if (key != NULL) {
1453                 /*
1454                  * We're using one, so create a matching key
1455                  * on the newsk structure. If we fail to get
1456                  * memory, then we end up not copying the key
1457                  * across. Shucks.
1458                  */
1459                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1460                 if (newkey != NULL)
1461                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1462                                           newkey, key->keylen);
1463                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1464         }
1465 #endif
1466
1467         __inet_hash_nolisten(newsk);
1468         __inet_inherit_port(sk, newsk);
1469
1470         return newsk;
1471
1472 exit_overflow:
1473         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1474 exit:
1475         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1476         dst_release(dst);
1477         return NULL;
1478 }
1479
1480 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1481 {
1482         struct tcphdr *th = tcp_hdr(skb);
1483         const struct iphdr *iph = ip_hdr(skb);
1484         struct sock *nsk;
1485         struct request_sock **prev;
1486         /* Find possible connection requests. */
1487         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1488                                                        iph->saddr, iph->daddr);
1489         if (req)
1490                 return tcp_check_req(sk, skb, req, prev);
1491
1492         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1493                         th->source, iph->daddr, th->dest, inet_iif(skb));
1494
1495         if (nsk) {
1496                 if (nsk->sk_state != TCP_TIME_WAIT) {
1497                         bh_lock_sock(nsk);
1498                         return nsk;
1499                 }
1500                 inet_twsk_put(inet_twsk(nsk));
1501                 return NULL;
1502         }
1503
1504 #ifdef CONFIG_SYN_COOKIES
1505         if (!th->rst && !th->syn && th->ack)
1506                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1507 #endif
1508         return sk;
1509 }
1510
1511 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1512 {
1513         const struct iphdr *iph = ip_hdr(skb);
1514
1515         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1516                 if (!tcp_v4_check(skb->len, iph->saddr,
1517                                   iph->daddr, skb->csum)) {
1518                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1519                         return 0;
1520                 }
1521         }
1522
1523         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1524                                        skb->len, IPPROTO_TCP, 0);
1525
1526         if (skb->len <= 76) {
1527                 return __skb_checksum_complete(skb);
1528         }
1529         return 0;
1530 }
1531
1532
1533 /* The socket must have it's spinlock held when we get
1534  * here.
1535  *
1536  * We have a potential double-lock case here, so even when
1537  * doing backlog processing we use the BH locking scheme.
1538  * This is because we cannot sleep with the original spinlock
1539  * held.
1540  */
1541 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1542 {
1543         struct sock *rsk;
1544 #ifdef CONFIG_TCP_MD5SIG
1545         /*
1546          * We really want to reject the packet as early as possible
1547          * if:
1548          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1549          *  o There is an MD5 option and we're not expecting one
1550          */
1551         if (tcp_v4_inbound_md5_hash(sk, skb))
1552                 goto discard;
1553 #endif
1554
1555         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1556                 TCP_CHECK_TIMER(sk);
1557                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1558                         rsk = sk;
1559                         goto reset;
1560                 }
1561                 TCP_CHECK_TIMER(sk);
1562                 return 0;
1563         }
1564
1565         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1566                 goto csum_err;
1567
1568         if (sk->sk_state == TCP_LISTEN) {
1569                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1570                 if (!nsk)
1571                         goto discard;
1572
1573                 if (nsk != sk) {
1574                         if (tcp_child_process(sk, nsk, skb)) {
1575                                 rsk = nsk;
1576                                 goto reset;
1577                         }
1578                         return 0;
1579                 }
1580         }
1581
1582         TCP_CHECK_TIMER(sk);
1583         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1584                 rsk = sk;
1585                 goto reset;
1586         }
1587         TCP_CHECK_TIMER(sk);
1588         return 0;
1589
1590 reset:
1591         tcp_v4_send_reset(rsk, skb);
1592 discard:
1593         kfree_skb(skb);
1594         /* Be careful here. If this function gets more complicated and
1595          * gcc suffers from register pressure on the x86, sk (in %ebx)
1596          * might be destroyed here. This current version compiles correctly,
1597          * but you have been warned.
1598          */
1599         return 0;
1600
1601 csum_err:
1602         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1603         goto discard;
1604 }
1605
1606 /*
1607  *      From tcp_input.c
1608  */
1609
/*
 * Entry point for an incoming IPv4 TCP segment.
 *
 * Validates the TCP header, fills in TCP_SKB_CB(skb), looks up the socket
 * for the segment and then either processes it right away, queues it
 * (prequeue or backlog), or handles the "no socket" and TIME_WAIT cases.
 * The skb is freed with kfree_skb() on every discard path.
 *
 * Returns the result of tcp_v4_do_rcv() when the segment is processed
 * directly, and 0 in all other cases (queued or discarded).
 */
1610 int tcp_v4_rcv(struct sk_buff *skb)
1611 {
1612         const struct iphdr *iph;
1613         struct tcphdr *th;
1614         struct sock *sk;
1615         int ret;
1616         struct net *net = dev_net(skb->dev);
1617
             /* Anything whose pkt_type is not PACKET_HOST is dropped uncounted. */
1618         if (skb->pkt_type != PACKET_HOST)
1619                 goto discard_it;
1620
1621         /* Count it even if it's bad */
1622         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1623
             /* The fixed-size TCP header must be available before it is read. */
1624         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1625                 goto discard_it;
1626
1627         th = tcp_hdr(skb);
1628
             /* doff is in 32-bit words; less than the fixed header is an error. */
1629         if (th->doff < sizeof(struct tcphdr) / 4)
1630                 goto bad_packet;
             /* Now require the full header, options included. */
1631         if (!pskb_may_pull(skb, th->doff * 4))
1632                 goto discard_it;
1633
1634         /* An explanation is required here, I think.
1635          * Packet length and doff are validated by header prediction,
1636          * provided case of th->doff==0 is eliminated.
1637          * So, we defer the checks. */
1638         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1639                 goto bad_packet;
1640
             /* NOTE(review): th is re-read here, presumably because the second
              * pskb_may_pull() may have moved the header data -- confirm.
              */
1641         th = tcp_hdr(skb);
1642         iph = ip_hdr(skb);
1643         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
             /* SYN and FIN each take one sequence number on top of the payload. */
1644         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1645                                     skb->len - th->doff * 4);
1646         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1647         TCP_SKB_CB(skb)->when    = 0;
             /* The control-block flags field is loaded with the IP TOS byte. */
1648         TCP_SKB_CB(skb)->flags   = iph->tos;
1649         TCP_SKB_CB(skb)->sacked  = 0;
1650
1651         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1652         if (!sk)
1653                 goto no_tcp_socket;
1654
         /* Also re-entered from do_time_wait with sk replaced by a listener. */
1655 process:
1656         if (sk->sk_state == TCP_TIME_WAIT)
1657                 goto do_time_wait;
1658
1659         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1660                 goto discard_and_relse;
1661         nf_reset(skb);
1662
1663         if (sk_filter(sk, skb))
1664                 goto discard_and_relse;
1665
1666         skb->dev = NULL;
1667
             /*
              * With the socket lock taken: if no user context owns the socket
              * the segment is handled now (directly, or via the prequeue);
              * otherwise it is appended to the socket backlog.
              */
1668         bh_lock_sock_nested(sk);
1669         ret = 0;
1670         if (!sock_owned_by_user(sk)) {
1671 #ifdef CONFIG_NET_DMA
1672                 struct tcp_sock *tp = tcp_sk(sk);
1673                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1674                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
                     /* A DMA channel is in use: skip the prequeue entirely. */
1675                 if (tp->ucopy.dma_chan)
1676                         ret = tcp_v4_do_rcv(sk, skb);
1677                 else
1678 #endif
1679                 {
                             /* Process directly only if tcp_prequeue() returned 0. */
1680                         if (!tcp_prequeue(sk, skb))
1681                                 ret = tcp_v4_do_rcv(sk, skb);
1682                 }
1683         } else
1684                 sk_add_backlog(sk, skb);
1685         bh_unlock_sock(sk);
1686
             /* NOTE(review): presumably drops the reference taken by the lookup. */
1687         sock_put(sk);
1688
1689         return ret;
1690
1691 no_tcp_socket:
1692         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1693                 goto discard_it;
1694
             /*
              * A short or badly checksummed segment only bumps the error
              * counter; anything else is answered with a reset.  The
              * bad_packet label sits inside the if body so that the header
              * checks above reach the counter without sending a reset.
              */
1695         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1696 bad_packet:
1697                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1698         } else {
1699                 tcp_v4_send_reset(NULL, skb);
1700         }
1701
1702 discard_it:
1703         /* Discard frame. */
1704         kfree_skb(skb);
1705         return 0;
1706
1707 discard_and_relse:
1708         sock_put(sk);
1709         goto discard_it;
1710
         /*
          * sk is a timewait socket here: it is released with inet_twsk_put()
          * rather than sock_put() on every path below that drops it.
          */
1711 do_time_wait:
1712         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1713                 inet_twsk_put(inet_twsk(sk));
1714                 goto discard_it;
1715         }
1716
1717         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1718                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1719                 inet_twsk_put(inet_twsk(sk));
1720                 goto discard_it;
1721         }
1722         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1723         case TCP_TW_SYN: {
                     /*
                      * Look for a listener on the destination address/port.
                      * If one exists the timewait socket is descheduled and
                      * released, and processing restarts with the listener.
                      */
1724                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1725                                                         &tcp_hashinfo,
1726                                                         iph->daddr, th->dest,
1727                                                         inet_iif(skb));
1728                 if (sk2) {
1729                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1730                         inet_twsk_put(inet_twsk(sk));
1731                         sk = sk2;
1732                         goto process;
1733                 }
1734                 /* Fall through to ACK */
1735         }
1736         case TCP_TW_ACK:
1737                 tcp_v4_timewait_ack(sk, skb);
1738                 break;
1739         case TCP_TW_RST:
1740                 goto no_tcp_socket;
             /* Nothing more to do; the skb is discarded below. */
1741         case TCP_TW_SUCCESS:;
1742         }
1743         goto discard_it;
1744 }
1745
1746 /* VJ's idea. Save last timestamp seen from this destination
1747  * and hold it at least for normal timewait interval to use for duplicate
1748  * segment detection in subsequent connections, before they enter synchronized
1749  * state.
1750  */
1751
1752 int tcp_v4_remember_stamp(struct sock *sk)
1753 {
1754         struct inet_sock *inet = inet_sk(sk);
1755         struct tcp_sock *tp = tcp_sk(sk);
1756         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1757         struct inet_peer *peer = NULL;
1758         int release_it = 0;
1759
1760         if (!rt || rt->rt_dst != inet->inet_daddr) {
1761                 peer = inet_getpeer(inet->inet_daddr, 1);
1762                 release_it = 1;
1763         } else {
1764                 if (!rt->peer)
1765                         rt_bind_peer(rt, 1);
1766                 peer = rt->peer;
1767         }
1768
1769         if (peer) {
1770                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1771                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1772                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1773                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1774                         peer->tcp_ts = tp->rx_opt.ts_recent;
1775                 }
1776                 if (release_it)
1777                         inet_putpeer(peer);
1778                 return 1;
1779         }
1780
1781         return 0;
1782 }
1783
1784 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1785 {
1786         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1787
1788         if (peer) {
1789                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1790
1791                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1792                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1793                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1794                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1795                         peer->tcp_ts       = tcptw->tw_ts_recent;
1796                 }
1797                 inet_putpeer(peer);
1798                 return 1;
1799         }
1800
1801         return 0;
1802 }
1803
/*
 * Address-family operations used by connection sockets over IPv4.
 * tcp_v4_init_sock() installs this table as icsk->icsk_af_ops.
 * The compat_* entries exist only when CONFIG_COMPAT is set.
 */
1804 const struct inet_connection_sock_af_ops ipv4_specific = {
1805         .queue_xmit        = ip_queue_xmit,
1806         .send_check        = tcp_v4_send_check,
1807         .rebuild_header    = inet_sk_rebuild_header,
1808         .conn_request      = tcp_v4_conn_request,
1809         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1810         .remember_stamp    = tcp_v4_remember_stamp,
1811         .net_header_len    = sizeof(struct iphdr),
1812         .setsockopt        = ip_setsockopt,
1813         .getsockopt        = ip_getsockopt,
1814         .addr2sockaddr     = inet_csk_addr2sockaddr,
1815         .sockaddr_len      = sizeof(struct sockaddr_in),
1816         .bind_conflict     = inet_csk_bind_conflict,
1817 #ifdef CONFIG_COMPAT
1818         .compat_setsockopt = compat_ip_setsockopt,
1819         .compat_getsockopt = compat_ip_getsockopt,
1820 #endif
1821 };
1822
1823 #ifdef CONFIG_TCP_MD5SIG
/*
 * IPv4 TCP MD5 signature operations (built only with CONFIG_TCP_MD5SIG).
 * tcp_v4_init_sock() installs this table as tp->af_specific.
 */
1824 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1825         .md5_lookup             = tcp_v4_md5_lookup,
1826         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1827         .md5_add                = tcp_v4_md5_add_func,
1828         .md5_parse              = tcp_v4_parse_md5_keys,
1829 };
1830 #endif
1831
1832 /* NOTE: A lot of things set to zero explicitly by call to
1833  *       sk_alloc() so need not be done here.
1834  */
/*
 * Protocol init hook for a new IPv4 TCP socket (wired up as tcp_prot.init).
 *
 * Sets up queues and timers, initial RTO/congestion values, the IPv4
 * address-family operation tables, optional cookie state and the default
 * send/receive buffer sizes, then accounts the socket in
 * tcp_sockets_allocated.  Always returns 0.
 */
1835 static int tcp_v4_init_sock(struct sock *sk)
1836 {
1837         struct inet_connection_sock *icsk = inet_csk(sk);
1838         struct tcp_sock *tp = tcp_sk(sk);
1839
1840         skb_queue_head_init(&tp->out_of_order_queue);
1841         tcp_init_xmit_timers(sk);
1842         tcp_prequeue_init(tp);
1843
             /* Both the RTO and mdev start from TCP_TIMEOUT_INIT. */
1844         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1845         tp->mdev = TCP_TIMEOUT_INIT;
1846
1847         /* So many TCP implementations out there (incorrectly) count the
1848          * initial SYN frame in their delayed-ACK and congestion control
1849          * algorithms that we must have the following bandaid to talk
1850          * efficiently to them.  -DaveM
1851          */
1852         tp->snd_cwnd = 2;
1853
1854         /* See draft-stevens-tcpca-spec-01 for discussion of the
1855          * initialization of these values.
1856          */
1857         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1858         tp->snd_cwnd_clamp = ~0;
1859         tp->mss_cache = TCP_MSS_DEFAULT;
1860
1861         tp->reordering = sysctl_tcp_reordering;
1862         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1863
1864         sk->sk_state = TCP_CLOSE;
1865
1866         sk->sk_write_space = sk_stream_write_space;
1867         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1868
             /* Hook up the IPv4-specific operation tables. */
1869         icsk->icsk_af_ops = &ipv4_specific;
1870         icsk->icsk_sync_mss = tcp_sync_mss;
1871 #ifdef CONFIG_TCP_MD5SIG
1872         tp->af_specific = &tcp_sock_ipv4_specific;
1873 #endif
1874
1875         /* TCP Cookie Transactions */
1876         if (sysctl_tcp_cookie_size > 0) {
1877                 /* Default, cookies without s_data_payload. */
                     /*
                      * An allocation failure is not reported: cookie_values
                      * simply stays NULL and init still returns 0.
                      */
1878                 tp->cookie_values =
1879                         kzalloc(sizeof(*tp->cookie_values),
1880                                 sk->sk_allocation);
1881                 if (tp->cookie_values != NULL)
1882                         kref_init(&tp->cookie_values->kref);
1883         }
1884         /* Presumed zeroed, in order of appearance:
1885          *      cookie_in_always, cookie_out_never,
1886          *      s_data_constant, s_data_in, s_data_out
1887          */
             /* Index 1 of the sysctl arrays supplies the initial buffer sizes. */
1888         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1889         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1890
             /* The per-cpu socket counter is bumped with bottom halves disabled. */
1891         local_bh_disable();
1892         percpu_counter_inc(&tcp_sockets_allocated);
1893         local_bh_enable();
1894
1895         return 0;
1896 }
1897
/*
 * Protocol destroy hook for a TCP socket (wired up as tcp_prot.destroy).
 *
 * Stops the transmit timers, releases congestion-control state, purges the
 * write, out-of-order, async-wait (CONFIG_NET_DMA) and prequeue queues,
 * frees MD5 keys (CONFIG_TCP_MD5SIG), drops the bound port, the cached
 * sendmsg page and the cookie state, and finally decrements
 * tcp_sockets_allocated.
 */
1898 void tcp_v4_destroy_sock(struct sock *sk)
1899 {
1900         struct tcp_sock *tp = tcp_sk(sk);
1901
1902         tcp_clear_xmit_timers(sk);
1903
1904         tcp_cleanup_congestion_control(sk);
1905
1906         /* Clean up the write buffer. */
1907         tcp_write_queue_purge(sk);
1908
1909         /* Cleans up our, hopefully empty, out_of_order_queue. */
1910         __skb_queue_purge(&tp->out_of_order_queue);
1911
1912 #ifdef CONFIG_TCP_MD5SIG
1913         /* Clean up the MD5 key list, if any */
1914         if (tp->md5sig_info) {
1915                 tcp_v4_clear_md5_list(sk);
1916                 kfree(tp->md5sig_info);
1917                 tp->md5sig_info = NULL;
1918         }
1919 #endif
1920
1921 #ifdef CONFIG_NET_DMA
1922         /* Cleans up our sk_async_wait_queue */
1923         __skb_queue_purge(&sk->sk_async_wait_queue);
1924 #endif
1925
1926         /* Clean prequeue, it must be empty really */
1927         __skb_queue_purge(&tp->ucopy.prequeue);
1928
1929         /* Clean up a referenced TCP bind bucket. */
1930         if (inet_csk(sk)->icsk_bind_hash)
1931                 inet_put_port(sk);
1932
1933         /*
1934          * If sendmsg cached page exists, toss it.
1935          */
1936         if (sk->sk_sndmsg_page) {
1937                 __free_page(sk->sk_sndmsg_page);
1938                 sk->sk_sndmsg_page = NULL;
1939         }
1940
1941         /* TCP Cookie Transactions */
             /* Drop our kref; tcp_cookie_values_release is the release callback. */
1942         if (tp->cookie_values != NULL) {
1943                 kref_put(&tp->cookie_values->kref,
1944                          tcp_cookie_values_release);
1945                 tp->cookie_values = NULL;
1946         }
1947
             /* Balances the percpu_counter_inc() done in tcp_v4_init_sock(). */
1948         percpu_counter_dec(&tcp_sockets_allocated);
1949 }
1950
1951 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1952
1953 #ifdef CONFIG_PROC_FS
1954 /* Proc filesystem TCP sock list dumping. */
1955
1956 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1957 {
1958         return hlist_nulls_empty(head) ? NULL :
1959                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1960 }
1961
1962 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1963 {
1964         return !is_a_nulls(tw->tw_node.next) ?
1965                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1966 }
1967
/*
 * Advance the proc iterator over listening sockets and the open requests
 * queued on them.
 *
 * @cur is NULL to start from listening_hash bucket 0.  Otherwise it is the
 * previously returned entry: a request_sock when st->state is
 * TCP_SEQ_STATE_OPENREQ, a sock otherwise.
 *
 * Returns the next entry, or NULL once all INET_LHTABLE_SIZE buckets have
 * been walked.  Sockets must match st->family and the seq_file's netns;
 * requests are matched on family only.
 *
 * Locking: when a socket is returned the current bucket's ilb->lock is
 * still held; when a request is returned the listener's syn_wait_lock is
 * read-held as well.  Nothing is held when NULL is returned.
 * tcp_seq_stop() drops whatever is still held.
 */
1968 static void *listening_get_next(struct seq_file *seq, void *cur)
1969 {
1970         struct inet_connection_sock *icsk;
1971         struct hlist_nulls_node *node;
1972         struct sock *sk = cur;
1973         struct inet_listen_hashbucket *ilb;
1974         struct tcp_iter_state *st = seq->private;
1975         struct net *net = seq_file_net(seq);
1976
             /* Fresh walk: lock bucket 0 and start from its head. */
1977         if (!sk) {
1978                 st->bucket = 0;
1979                 ilb = &tcp_hashinfo.listening_hash[0];
1980                 spin_lock_bh(&ilb->lock);
1981                 sk = sk_nulls_head(&ilb->head);
1982                 goto get_sk;
1983         }
1984         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1985         ++st->num;
1986
1987         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request on st->syn_wait_sk; continue that walk. */
1988                 struct request_sock *req = cur;
1989
1990                 icsk = inet_csk(st->syn_wait_sk);
1991                 req = req->dl_next;
1992                 while (1) {
1993                         while (req) {
1994                                 if (req->rsk_ops->family == st->family) {
1995                                         cur = req;
1996                                         goto out;
1997                                 }
1998                                 req = req->dl_next;
1999                         }
2000                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2001                                 break;
                 /* Entered from start_req below with st->sbucket reset to 0. */
2002 get_req:
2003                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2004                 }
                     /* Request table exhausted: back to walking listening sockets. */
2005                 sk        = sk_next(st->syn_wait_sk);
2006                 st->state = TCP_SEQ_STATE_LISTENING;
2007                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2008         } else {
                     /*
                      * cur is a listening socket that was just shown: visit its
                      * pending requests (if any) before moving to the next one.
                      */
2009                 icsk = inet_csk(sk);
2010                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2011                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2012                         goto start_req;
2013                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2014                 sk = sk_next(sk);
2015         }
2016 get_sk:
2017         sk_nulls_for_each_from(sk, node) {
2018                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
2019                         cur = sk;
2020                         goto out;
2021                 }
                     /*
                      * This socket is not listed itself, but its request queue
                      * is still checked for entries.
                      */
2022                 icsk = inet_csk(sk);
2023                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2025 start_req:
                             /* Switch to request mode; syn_wait_lock stays held. */
2026                         st->uid         = sock_i_uid(sk);
2027                         st->syn_wait_sk = sk;
2028                         st->state       = TCP_SEQ_STATE_OPENREQ;
2029                         st->sbucket     = 0;
2030                         goto get_req;
2031                 }
2032                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033         }
             /* Bucket finished: release it and move on to the next one, if any. */
2034         spin_unlock_bh(&ilb->lock);
2035         if (++st->bucket < INET_LHTABLE_SIZE) {
2036                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2037                 spin_lock_bh(&ilb->lock);
2038                 sk = sk_nulls_head(&ilb->head);
2039                 goto get_sk;
2040         }
2041         cur = NULL;
2042 out:
2043         return cur;
2044 }
2045
2046 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2047 {
2048         void *rc = listening_get_next(seq, NULL);
2049
2050         while (rc && *pos) {
2051                 rc = listening_get_next(seq, rc);
2052                 --*pos;
2053         }
2054         return rc;
2055 }
2056
2057 static inline int empty_bucket(struct tcp_iter_state *st)
2058 {
2059         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2060                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2061 }
2062
/*
 * Find the first entry of the established hash table that matches
 * st->family and the seq_file's netns, scanning buckets from 0 up to
 * ehash_mask.  Within a bucket the established chain is searched before the
 * timewait chain.
 *
 * Returns a sock (st->state left as it was on entry to the bucket) or an
 * inet_timewait_sock (st->state == TCP_SEQ_STATE_TIME_WAIT), or NULL when
 * nothing matches.  On a non-NULL return the bucket lock for st->bucket is
 * still held; tcp_seq_stop() or established_get_next() releases it.
 */
2063 static void *established_get_first(struct seq_file *seq)
2064 {
2065         struct tcp_iter_state *st = seq->private;
2066         struct net *net = seq_file_net(seq);
2067         void *rc = NULL;
2068
2069         for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2070                 struct sock *sk;
2071                 struct hlist_nulls_node *node;
2072                 struct inet_timewait_sock *tw;
2073                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2074
2075                 /* Lockless fast path for the common case of empty buckets */
2076                 if (empty_bucket(st))
2077                         continue;
2078
2079                 spin_lock_bh(lock);
2080                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2081                         if (sk->sk_family != st->family ||
2082                             !net_eq(sock_net(sk), net)) {
2083                                 continue;
2084                         }
2085                         rc = sk;
2086                         goto out;
2087                 }
                     /* No established match in this bucket: try its timewait chain. */
2088                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2089                 inet_twsk_for_each(tw, node,
2090                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2091                         if (tw->tw_family != st->family ||
2092                             !net_eq(twsk_net(tw), net)) {
2093                                 continue;
2094                         }
2095                         rc = tw;
2096                         goto out;
2097                 }
                     /* Nothing here: unlock and reset the state for the next bucket. */
2098                 spin_unlock_bh(lock);
2099                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2100         }
2101 out:
2102         return rc;
2103 }
2104
/*
 * Advance the proc iterator within the established hash table.
 *
 * @cur is the previously returned entry: an inet_timewait_sock when
 * st->state is TCP_SEQ_STATE_TIME_WAIT, a sock otherwise.  The bucket lock
 * for st->bucket is expected to be held on entry (it is unlocked here when
 * the bucket is exhausted).
 *
 * Returns the next entry matching st->family and the seq_file's netns with
 * that entry's bucket lock held, or NULL with no lock held when the table
 * is exhausted.
 */
2105 static void *established_get_next(struct seq_file *seq, void *cur)
2106 {
2107         struct sock *sk = cur;
2108         struct inet_timewait_sock *tw;
2109         struct hlist_nulls_node *node;
2110         struct tcp_iter_state *st = seq->private;
2111         struct net *net = seq_file_net(seq);
2112
2113         ++st->num;
2114
2115         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2116                 tw = cur;
2117                 tw = tw_next(tw);
         /* Also entered from below once a bucket's established chain is done. */
2118 get_tw:
2119                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2120                         tw = tw_next(tw);
2121                 }
2122                 if (tw) {
2123                         cur = tw;
2124                         goto out;
2125                 }
                     /* Both chains of this bucket are done: release its lock. */
2126                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2127                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2128
2129                 /* Look for next non empty bucket */
2130                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2131                                 empty_bucket(st))
2132                         ;
2133                 if (st->bucket > tcp_hashinfo.ehash_mask)
2134                         return NULL;
2135
2136                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2137                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2138         } else
2139                 sk = sk_nulls_next(sk);
2140
2141         sk_nulls_for_each_from(sk, node) {
2142                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2143                         goto found;
2144         }
2145
             /* Established chain exhausted: continue with the timewait chain. */
2146         st->state = TCP_SEQ_STATE_TIME_WAIT;
2147         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2148         goto get_tw;
2149 found:
2150         cur = sk;
2151 out:
2152         return cur;
2153 }
2154
2155 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2156 {
2157         void *rc = established_get_first(seq);
2158
2159         while (rc && pos) {
2160                 rc = established_get_next(seq, rc);
2161                 --pos;
2162         }
2163         return rc;
2164 }
2165
2166 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2167 {
2168         void *rc;
2169         struct tcp_iter_state *st = seq->private;
2170
2171         st->state = TCP_SEQ_STATE_LISTENING;
2172         rc        = listening_get_idx(seq, &pos);
2173
2174         if (!rc) {
2175                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2176                 rc        = established_get_idx(seq, pos);
2177         }
2178
2179         return rc;
2180 }
2181
2182 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2183 {
2184         struct tcp_iter_state *st = seq->private;
2185         st->state = TCP_SEQ_STATE_LISTENING;
2186         st->num = 0;
2187         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2188 }
2189
2190 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2191 {
2192         void *rc = NULL;
2193         struct tcp_iter_state *st;
2194
2195         if (v == SEQ_START_TOKEN) {
2196                 rc = tcp_get_idx(seq, 0);
2197                 goto out;
2198         }
2199         st = seq->private;
2200
2201         switch (st->state) {
2202         case TCP_SEQ_STATE_OPENREQ:
2203         case TCP_SEQ_STATE_LISTENING:
2204                 rc = listening_get_next(seq, v);
2205                 if (!rc) {
2206                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2207                         rc        = established_get_first(seq);
2208                 }
2209                 break;
2210         case TCP_SEQ_STATE_ESTABLISHED:
2211         case TCP_SEQ_STATE_TIME_WAIT:
2212                 rc = established_get_next(seq, v);
2213                 break;
2214         }
2215 out:
2216         ++*pos;
2217         return rc;
2218 }
2219
/*
 * seq_file ->stop: release whatever locks the iterator still holds for the
 * entry @v, depending on st->state:
 *  - OPENREQ: the listener's syn_wait_lock, then (falling through) the
 *    listening bucket lock;
 *  - LISTENING: the listening bucket lock, unless @v is SEQ_START_TOKEN;
 *  - TIME_WAIT / ESTABLISHED: the ehash bucket lock, if @v is non-NULL.
 */
2220 static void tcp_seq_stop(struct seq_file *seq, void *v)
2221 {
2222         struct tcp_iter_state *st = seq->private;
2223
2224         switch (st->state) {
2225         case TCP_SEQ_STATE_OPENREQ:
2226                 if (v) {
2227                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2228                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2229                 }
                     /* fall through: the listening bucket lock is held as well */
2230         case TCP_SEQ_STATE_LISTENING:
2231                 if (v != SEQ_START_TOKEN)
2232                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2233                 break;
2234         case TCP_SEQ_STATE_TIME_WAIT:
2235         case TCP_SEQ_STATE_ESTABLISHED:
2236                 if (v)
2237                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2238                 break;
2239         }
2240 }
2241
2242 static int tcp_seq_open(struct inode *inode, struct file *file)
2243 {
2244         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2245         struct tcp_iter_state *s;
2246         int err;
2247
2248         err = seq_open_net(inode, file, &afinfo->seq_ops,
2249                           sizeof(struct tcp_iter_state));
2250         if (err < 0)
2251                 return err;
2252
2253         s = ((struct seq_file *)file->private_data)->private;
2254         s->family               = afinfo->family;
2255         return 0;
2256 }
2257
2258 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2259 {
2260         int rc = 0;
2261         struct proc_dir_entry *p;
2262
2263         afinfo->seq_fops.open           = tcp_seq_open;
2264         afinfo->seq_fops.read           = seq_read;
2265         afinfo->seq_fops.llseek         = seq_lseek;
2266         afinfo->seq_fops.release        = seq_release_net;
2267
2268         afinfo->seq_ops.start           = tcp_seq_start;
2269         afinfo->seq_ops.next            = tcp_seq_next;
2270         afinfo->seq_ops.stop            = tcp_seq_stop;
2271
2272         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2273                              &afinfo->seq_fops, afinfo);
2274         if (!p)
2275                 rc = -ENOMEM;
2276         return rc;
2277 }
2278
/* Remove the proc entry named afinfo->name from @net. */
2279 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2280 {
2281         proc_net_remove(net, afinfo->name);
2282 }
2283
2284 static void get_openreq4(struct sock *sk, struct request_sock *req,
2285                          struct seq_file *f, int i, int uid, int *len)
2286 {
2287         const struct inet_request_sock *ireq = inet_rsk(req);
2288         int ttd = req->expires - jiffies;
2289
2290         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2291                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2292                 i,
2293                 ireq->loc_addr,
2294                 ntohs(inet_sk(sk)->inet_sport),
2295                 ireq->rmt_addr,
2296                 ntohs(ireq->rmt_port),
2297                 TCP_SYN_RECV,
2298                 0, 0, /* could print option size, but that is af dependent. */
2299                 1,    /* timers active (only the expire timer) */
2300                 jiffies_to_clock_t(ttd),
2301                 req->retrans,
2302                 uid,
2303                 0,  /* non standard timer */
2304                 0, /* open_requests have no inode */
2305                 atomic_read(&sk->sk_refcnt),
2306                 req,
2307                 len);
2308 }
2309
2310 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2311 {
2312         int timer_active;
2313         unsigned long timer_expires;
2314         struct tcp_sock *tp = tcp_sk(sk);
2315         const struct inet_connection_sock *icsk = inet_csk(sk);
2316         struct inet_sock *inet = inet_sk(sk);
2317         __be32 dest = inet->inet_daddr;
2318         __be32 src = inet->inet_rcv_saddr;
2319         __u16 destp = ntohs(inet->inet_dport);
2320         __u16 srcp = ntohs(inet->inet_sport);
2321         int rx_queue;
2322
2323         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2324                 timer_active    = 1;
2325                 timer_expires   = icsk->icsk_timeout;
2326         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2327                 timer_active    = 4;
2328                 timer_expires   = icsk->icsk_timeout;
2329         } else if (timer_pending(&sk->sk_timer)) {
2330                 timer_active    = 2;
2331                 timer_expires   = sk->sk_timer.expires;
2332         } else {
2333                 timer_active    = 0;
2334                 timer_expires = jiffies;
2335         }
2336
2337         if (sk->sk_state == TCP_LISTEN)
2338                 rx_queue = sk->sk_ack_backlog;
2339         else
2340                 /*
2341                  * because we dont lock socket, we might find a transient negative value
2342                  */
2343                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2344
2345         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2346                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2347                 i, src, srcp, dest, destp, sk->sk_state,
2348                 tp->write_seq - tp->snd_una,
2349                 rx_queue,
2350                 timer_active,
2351                 jiffies_to_clock_t(timer_expires - jiffies),
2352                 icsk->icsk_retransmits,
2353                 sock_i_uid(sk),
2354                 icsk->icsk_probes_out,
2355                 sock_i_ino(sk),
2356                 atomic_read(&sk->sk_refcnt), sk,
2357                 jiffies_to_clock_t(icsk->icsk_rto),
2358                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2359                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2360                 tp->snd_cwnd,
2361                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2362                 len);
2363 }
2364
2365 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2366                                struct seq_file *f, int i, int *len)
2367 {
2368         __be32 dest, src;
2369         __u16 destp, srcp;
2370         int ttd = tw->tw_ttd - jiffies;
2371
2372         if (ttd < 0)
2373                 ttd = 0;
2374
2375         dest  = tw->tw_daddr;
2376         src   = tw->tw_rcv_saddr;
2377         destp = ntohs(tw->tw_dport);
2378         srcp  = ntohs(tw->tw_sport);
2379
2380         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2381                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2382                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2383                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2384                 atomic_read(&tw->tw_refcnt), tw, len);
2385 }
2386
2387 #define TMPSZ 150
2388
2389 static int tcp4_seq_show(struct seq_file *seq, void *v)
2390 {
2391         struct tcp_iter_state *st;
2392         int len;
2393
2394         if (v == SEQ_START_TOKEN) {
2395                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2396                            "  sl  local_address rem_address   st tx_queue "
2397                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2398                            "inode");
2399                 goto out;
2400         }
2401         st = seq->private;
2402
2403         switch (st->state) {
2404         case TCP_SEQ_STATE_LISTENING:
2405         case TCP_SEQ_STATE_ESTABLISHED:
2406                 get_tcp4_sock(v, seq, st->num, &len);
2407                 break;
2408         case TCP_SEQ_STATE_OPENREQ:
2409                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2410                 break;
2411         case TCP_SEQ_STATE_TIME_WAIT:
2412                 get_timewait4_sock(v, seq, st->num, &len);
2413                 break;
2414         }
2415         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2416 out:
2417         return 0;
2418 }
2419
/*
 * Description of the IPv4 TCP proc file: entry name "tcp", family AF_INET
 * and the ->show callback.  The remaining seq_fops/seq_ops members are
 * filled in by tcp_proc_register().
 */
2420 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2421         .name           = "tcp",
2422         .family         = AF_INET,
2423         .seq_fops       = {
2424                 .owner          = THIS_MODULE,
2425         },
2426         .seq_ops        = {
2427                 .show           = tcp4_seq_show,
2428         },
2429 };
2430
/* Per-net init: register the IPv4 TCP proc file in @net. */
2431 static int tcp4_proc_init_net(struct net *net)
2432 {
2433         return tcp_proc_register(net, &tcp4_seq_afinfo);
2434 }
2435
/* Per-net exit: remove the IPv4 TCP proc file from @net. */
2436 static void tcp4_proc_exit_net(struct net *net)
2437 {
2438         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2439 }
2440
/* Per-net init/exit hooks that create and remove the IPv4 TCP proc file. */
2441 static struct pernet_operations tcp4_net_ops = {
2442         .init = tcp4_proc_init_net,
2443         .exit = tcp4_proc_exit_net,
2444 };
2445
/*
 * Register tcp4_net_ops as a pernet subsystem.
 * Returns the result of register_pernet_subsys().
 */
2446 int __init tcp4_proc_init(void)
2447 {
2448         return register_pernet_subsys(&tcp4_net_ops);
2449 }
2450
/* Unregister the pernet subsystem registered by tcp4_proc_init(). */
2451 void tcp4_proc_exit(void)
2452 {
2453         unregister_pernet_subsys(&tcp4_net_ops);
2454 }
2455 #endif /* CONFIG_PROC_FS */
2456
2457 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2458 {
2459         struct iphdr *iph = skb_gro_network_header(skb);
2460
2461         switch (skb->ip_summed) {
2462         case CHECKSUM_COMPLETE:
2463                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2464                                   skb->csum)) {
2465                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2466                         break;
2467                 }
2468
2469                 /* fall through */
2470         case CHECKSUM_NONE:
2471                 NAPI_GRO_CB(skb)->flush = 1;
2472                 return NULL;
2473         }
2474
2475         return tcp_gro_receive(head, skb);
2476 }
2477 EXPORT_SYMBOL(tcp4_gro_receive);
2478
2479 int tcp4_gro_complete(struct sk_buff *skb)
2480 {
2481         struct iphdr *iph = ip_hdr(skb);
2482         struct tcphdr *th = tcp_hdr(skb);
2483
2484         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2485                                   iph->saddr, iph->daddr, 0);
2486         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2487
2488         return tcp_gro_complete(skb);
2489 }
2490 EXPORT_SYMBOL(tcp4_gro_complete);
2491
2492 struct proto tcp_prot = {
2493         .name                   = "TCP",
2494         .owner                  = THIS_MODULE,
2495         .close                  = tcp_close,
2496         .connect                = tcp_v4_connect,
2497         .disconnect             = tcp_disconnect,
2498         .accept                 = inet_csk_accept,
2499         .ioctl                  = tcp_ioctl,
2500         .init                   = tcp_v4_init_sock,
2501         .destroy                = tcp_v4_destroy_sock,
2502         .shutdown               = tcp_shutdown,
2503         .setsockopt             = tcp_setsockopt,
2504         .getsockopt             = tcp_getsockopt,
2505         .recvmsg                = tcp_recvmsg,
2506         .backlog_rcv            = tcp_v4_do_rcv,
2507         .hash                   = inet_hash,
2508         .unhash                 = inet_unhash,
2509         .get_port               = inet_csk_get_port,
2510         .enter_memory_pressure  = tcp_enter_memory_pressure,
2511         .sockets_allocated      = &tcp_sockets_allocated,
2512         .orphan_count           = &tcp_orphan_count,
2513         .memory_allocated       = &tcp_memory_allocated,
2514         .memory_pressure        = &tcp_memory_pressure,
2515         .sysctl_mem             = sysctl_tcp_mem,
2516         .sysctl_wmem            = sysctl_tcp_wmem,
2517         .sysctl_rmem            = sysctl_tcp_rmem,
2518         .max_header             = MAX_TCP_HEADER,
2519         .obj_size               = sizeof(struct tcp_sock),
2520         .slab_flags             = SLAB_DESTROY_BY_RCU,
2521         .twsk_prot              = &tcp_timewait_sock_ops,
2522         .rsk_prot               = &tcp_request_sock_ops,
2523         .h.hashinfo             = &tcp_hashinfo,
2524 #ifdef CONFIG_COMPAT
2525         .compat_setsockopt      = compat_tcp_setsockopt,
2526         .compat_getsockopt      = compat_tcp_getsockopt,
2527 #endif
2528 };
2529
2530
2531 static int __net_init tcp_sk_init(struct net *net)
2532 {
2533         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2534                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2535 }
2536
2537 static void __net_exit tcp_sk_exit(struct net *net)
2538 {
2539         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2540 }
2541
2542 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2543 {
2544         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2545 }
2546
2547 static struct pernet_operations __net_initdata tcp_sk_ops = {
2548        .init       = tcp_sk_init,
2549        .exit       = tcp_sk_exit,
2550        .exit_batch = tcp_sk_exit_batch,
2551 };
2552
2553 void __init tcp_v4_init(void)
2554 {
2555         inet_hashinfo_init(&tcp_hashinfo);
2556         if (register_pernet_subsys(&tcp_sk_ops))
2557                 panic("Failed to create the TCP control socket.\n");
2558 }
2559
/* Symbols made available outside this file. */

/* Data objects. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);

/* tcp_v4_* entry points. */
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* Registration helpers, only built with CONFIG_PROC_FS. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif

EXPORT_SYMBOL(sysctl_tcp_low_latency);
2575