ipv4: Can final ip_route_connect() arg to boolean "can_sleep".
[linux-2.6.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
87 EXPORT_SYMBOL(sysctl_tcp_low_latency);
88
89
90 #ifdef CONFIG_TCP_MD5SIG
91 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92                                                    __be32 addr);
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94                                __be32 daddr, __be32 saddr, struct tcphdr *th);
95 #else
96 static inline
97 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98 {
99         return NULL;
100 }
101 #endif
102
103 struct inet_hashinfo tcp_hashinfo;
104 EXPORT_SYMBOL(tcp_hashinfo);
105
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109                                           ip_hdr(skb)->saddr,
110                                           tcp_hdr(skb)->dest,
111                                           tcp_hdr(skb)->source);
112 }
113
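The function above seeds the initial sequence number from the connection 4-tuple and a secret, following the RFC 1948/6528 approach: ISN = clock + F(saddr, daddr, sport, dport, secret). The sketch below restates that idea in portable C; the mixing function, constants and names (toy_mix, toy_isn) are illustrative placeholders, not the kernel's secure_tcp_sequence_number() implementation.

/* Toy RFC 1948/6528-style ISN: a per-connection offset derived from
 * the 4-tuple and a secret, added to a clock that advances roughly
 * 64k per second.  Illustrative only. */
#include <stdint.h>
#include <time.h>

static uint32_t toy_mix(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
                        uint32_t secret)
{
        uint32_t h = secret;

        h ^= a; h *= 0x9e3779b1;        /* arbitrary odd multipliers */
        h ^= b; h *= 0x85ebca77;
        h ^= c ^ (d << 16);
        return h ^ (h >> 15);
}

static uint32_t toy_isn(uint32_t saddr, uint32_t daddr,
                        uint16_t sport, uint16_t dport, uint32_t secret)
{
        uint32_t clk = (uint32_t)time(NULL) << 16;

        return clk + toy_mix(saddr, daddr, sport, dport, secret);
}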
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117         struct tcp_sock *tp = tcp_sk(sk);
118
119         /* With PAWS, it is safe from the viewpoint
 120            of data integrity. Even without PAWS it is safe provided sequence
 121            spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
 122
 123            Actually, the idea is close to VJ's one, only the timestamp cache is
 124            held not per host but per port pair, and the TW bucket is used as the
 125            state holder.
 126
 127            If the TW bucket has already been destroyed we fall back to VJ's scheme
 128            and use the initial timestamp retrieved from the peer table.
129          */
130         if (tcptw->tw_ts_recent_stamp &&
131             (twp == NULL || (sysctl_tcp_tw_reuse &&
132                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134                 if (tp->write_seq == 0)
135                         tp->write_seq = 1;
136                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
137                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138                 sock_hold(sktw);
139                 return 1;
140         }
141
142         return 0;
143 }
144 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
145
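The twp == NULL || (sysctl_tcp_tw_reuse && ...) test above is what lets an outgoing connect() take over a port pair that is still in TIME-WAIT, provided tcp_tw_reuse is enabled and the peer's last timestamp is more than a second old. A minimal sketch of flipping that sysctl from C (the helper name is made up; it is equivalent to "sysctl -w net.ipv4.tcp_tw_reuse=1"):

#include <stdio.h>

/* Enable TIME-WAIT reuse for outgoing connections (sketch). */
static int enable_tw_reuse(void)
{
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");

        if (!f)
                return -1;
        fputs("1\n", f);
        return fclose(f);
}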
146 /* This will initiate an outgoing connection. */
147 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148 {
149         struct inet_sock *inet = inet_sk(sk);
150         struct tcp_sock *tp = tcp_sk(sk);
151         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
152         __be16 orig_sport, orig_dport;
153         struct rtable *rt;
154         __be32 daddr, nexthop;
155         int tmp;
156         int err;
157
158         if (addr_len < sizeof(struct sockaddr_in))
159                 return -EINVAL;
160
161         if (usin->sin_family != AF_INET)
162                 return -EAFNOSUPPORT;
163
164         nexthop = daddr = usin->sin_addr.s_addr;
165         if (inet->opt && inet->opt->srr) {
166                 if (!daddr)
167                         return -EINVAL;
168                 nexthop = inet->opt->faddr;
169         }
170
171         orig_sport = inet->inet_sport;
172         orig_dport = usin->sin_port;
173         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
174                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
175                                IPPROTO_TCP,
176                                orig_sport, orig_dport, sk, true);
177         if (tmp < 0) {
178                 if (tmp == -ENETUNREACH)
179                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180                 return tmp;
181         }
182
183         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184                 ip_rt_put(rt);
185                 return -ENETUNREACH;
186         }
187
188         if (!inet->opt || !inet->opt->srr)
189                 daddr = rt->rt_dst;
190
191         if (!inet->inet_saddr)
192                 inet->inet_saddr = rt->rt_src;
193         inet->inet_rcv_saddr = inet->inet_saddr;
194
195         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196                 /* Reset inherited state */
197                 tp->rx_opt.ts_recent       = 0;
198                 tp->rx_opt.ts_recent_stamp = 0;
199                 tp->write_seq              = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
204                 struct inet_peer *peer = rt_get_peer(rt);
205                 /*
 206          * VJ's idea. We save the last timestamp seen from
 207          * the destination in the peer table when entering the
 208          * TIME-WAIT state, and initialize rx_opt.ts_recent from it
 209          * when trying a new connection.
210                  */
211                 if (peer) {
212                         inet_peer_refcheck(peer);
213                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
214                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
215                                 tp->rx_opt.ts_recent = peer->tcp_ts;
216                         }
217                 }
218         }
219
220         inet->inet_dport = usin->sin_port;
221         inet->inet_daddr = daddr;
222
223         inet_csk(sk)->icsk_ext_hdr_len = 0;
224         if (inet->opt)
225                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
226
227         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
228
 229         /* Socket identity is still unknown (sport may be zero).
 230          * However we set state to SYN-SENT and, without releasing the socket
 231          * lock, select a source port, enter ourselves into the hash tables and
 232          * complete initialization after this.
233          */
234         tcp_set_state(sk, TCP_SYN_SENT);
235         err = inet_hash_connect(&tcp_death_row, sk);
236         if (err)
237                 goto failure;
238
239         err = ip_route_newports(&rt, IPPROTO_TCP,
240                                 orig_sport, orig_dport,
241                                 inet->inet_sport, inet->inet_dport, sk);
242         if (err)
243                 goto failure;
244
245         /* OK, now commit destination to socket.  */
246         sk->sk_gso_type = SKB_GSO_TCPV4;
247         sk_setup_caps(sk, &rt->dst);
248
249         if (!tp->write_seq)
250                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
251                                                            inet->inet_daddr,
252                                                            inet->inet_sport,
253                                                            usin->sin_port);
254
255         inet->inet_id = tp->write_seq ^ jiffies;
256
257         err = tcp_connect(sk);
258         rt = NULL;
259         if (err)
260                 goto failure;
261
262         return 0;
263
264 failure:
265         /*
266          * This unhashes the socket and releases the local port,
267          * if necessary.
268          */
269         tcp_set_state(sk, TCP_CLOSE);
270         ip_rt_put(rt);
271         sk->sk_route_caps = 0;
272         inet->inet_dport = 0;
273         return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
276
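tcp_v4_connect() is the IPv4 half of connect(2) on a SOCK_STREAM socket. A minimal userspace counterpart that exercises this path is sketched below; the destination 192.0.2.1:80 is a documentation address, error handling is abbreviated, and dial_example is a made-up name.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int dial_example(void)
{
        struct sockaddr_in dst = {
                .sin_family = AF_INET,
                .sin_port   = htons(80),
        };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);

        /* A blocking connect() drives tcp_v4_connect() above; route
         * errors such as -ENETUNREACH surface here as errno. */
        if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}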
277 /*
278  * This routine does path mtu discovery as defined in RFC1191.
279  */
280 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
281 {
282         struct dst_entry *dst;
283         struct inet_sock *inet = inet_sk(sk);
284
285         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 286          * sent out by Linux are always < 576 bytes so they should go through
287          * unfragmented).
288          */
289         if (sk->sk_state == TCP_LISTEN)
290                 return;
291
 292         /* We don't check in the dst entry whether pmtu discovery is forbidden
 293          * on this route. We just assume that no packet-too-big packets
 294          * are sent back when pmtu discovery is not active.
295          * There is a small race when the user changes this flag in the
296          * route, but I think that's acceptable.
297          */
298         if ((dst = __sk_dst_check(sk, 0)) == NULL)
299                 return;
300
301         dst->ops->update_pmtu(dst, mtu);
302
 303         /* Something is about to go wrong... Remember the soft error
 304          * in case this connection is not able to recover.
305          */
306         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
307                 sk->sk_err_soft = EMSGSIZE;
308
309         mtu = dst_mtu(dst);
310
311         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
312             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
313                 tcp_sync_mss(sk, mtu);
314
315                 /* Resend the TCP packet because it's
316                  * clear that the old packet has been
317                  * dropped. This is the new "fast" path mtu
318                  * discovery.
319                  */
320                 tcp_simple_retransmit(sk);
321         } /* else let the usual retransmit timer handle it */
322 }
323
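do_pmtu_discovery() reacts to an ICMP fragmentation-needed report by lowering the cached route MTU and re-syncing the MSS. From userspace, the per-socket behaviour it consults through inet->pmtudisc is controlled with the IP_MTU_DISCOVER socket option, and the discovered value can be read back with IP_MTU once a route is cached. A hedged sketch (read_path_mtu is a made-up helper):

#include <netinet/in.h>
#include <sys/socket.h>

static int read_path_mtu(int fd)
{
        int mode = IP_PMTUDISC_DO;      /* always set DF, do PMTU discovery */
        int mtu;
        socklen_t len = sizeof(mtu);

        if (setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &mode, sizeof(mode)) < 0)
                return -1;
        /* Meaningful only on a connected socket with a cached route. */
        if (getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len) < 0)
                return -1;
        return mtu;
}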
324 /*
325  * This routine is called by the ICMP module when it gets some
326  * sort of error condition.  If err < 0 then the socket should
327  * be closed and the error returned to the user.  If err > 0
328  * it's just the icmp type << 8 | icmp code.  After adjustment
329  * header points to the first 8 bytes of the tcp header.  We need
330  * to find the appropriate port.
331  *
332  * The locking strategy used here is very "optimistic". When
333  * someone else accesses the socket the ICMP is just dropped
334  * and for some paths there is no check at all.
335  * A more general error queue to queue errors for later handling
336  * is probably better.
337  *
338  */
339
340 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
341 {
342         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
343         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
344         struct inet_connection_sock *icsk;
345         struct tcp_sock *tp;
346         struct inet_sock *inet;
347         const int type = icmp_hdr(icmp_skb)->type;
348         const int code = icmp_hdr(icmp_skb)->code;
349         struct sock *sk;
350         struct sk_buff *skb;
351         __u32 seq;
352         __u32 remaining;
353         int err;
354         struct net *net = dev_net(icmp_skb->dev);
355
356         if (icmp_skb->len < (iph->ihl << 2) + 8) {
357                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
358                 return;
359         }
360
361         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
362                         iph->saddr, th->source, inet_iif(icmp_skb));
363         if (!sk) {
364                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
365                 return;
366         }
367         if (sk->sk_state == TCP_TIME_WAIT) {
368                 inet_twsk_put(inet_twsk(sk));
369                 return;
370         }
371
372         bh_lock_sock(sk);
373         /* If too many ICMPs get dropped on busy
374          * servers this needs to be solved differently.
375          */
376         if (sock_owned_by_user(sk))
377                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
378
379         if (sk->sk_state == TCP_CLOSE)
380                 goto out;
381
382         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
383                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
384                 goto out;
385         }
386
387         icsk = inet_csk(sk);
388         tp = tcp_sk(sk);
389         seq = ntohl(th->seq);
390         if (sk->sk_state != TCP_LISTEN &&
391             !between(seq, tp->snd_una, tp->snd_nxt)) {
392                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
393                 goto out;
394         }
395
396         switch (type) {
397         case ICMP_SOURCE_QUENCH:
398                 /* Just silently ignore these. */
399                 goto out;
400         case ICMP_PARAMETERPROB:
401                 err = EPROTO;
402                 break;
403         case ICMP_DEST_UNREACH:
404                 if (code > NR_ICMP_UNREACH)
405                         goto out;
406
407                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
408                         if (!sock_owned_by_user(sk))
409                                 do_pmtu_discovery(sk, iph, info);
410                         goto out;
411                 }
412
413                 err = icmp_err_convert[code].errno;
414                 /* check if icmp_skb allows revert of backoff
415                  * (see draft-zimmermann-tcp-lcd) */
416                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
417                         break;
418                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
419                     !icsk->icsk_backoff)
420                         break;
421
422                 if (sock_owned_by_user(sk))
423                         break;
424
425                 icsk->icsk_backoff--;
426                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
427                                          icsk->icsk_backoff;
428                 tcp_bound_rto(sk);
429
430                 skb = tcp_write_queue_head(sk);
431                 BUG_ON(!skb);
432
433                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
434                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
435
436                 if (remaining) {
437                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438                                                   remaining, TCP_RTO_MAX);
439                 } else {
440                         /* RTO revert clocked out retransmission.
441                          * Will retransmit now */
442                         tcp_retransmit_timer(sk);
443                 }
444
445                 break;
446         case ICMP_TIME_EXCEEDED:
447                 err = EHOSTUNREACH;
448                 break;
449         default:
450                 goto out;
451         }
452
453         switch (sk->sk_state) {
454                 struct request_sock *req, **prev;
455         case TCP_LISTEN:
456                 if (sock_owned_by_user(sk))
457                         goto out;
458
459                 req = inet_csk_search_req(sk, &prev, th->dest,
460                                           iph->daddr, iph->saddr);
461                 if (!req)
462                         goto out;
463
464                 /* ICMPs are not backlogged, hence we cannot get
465                    an established socket here.
466                  */
467                 WARN_ON(req->sk);
468
469                 if (seq != tcp_rsk(req)->snt_isn) {
470                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
471                         goto out;
472                 }
473
474                 /*
475                  * Still in SYN_RECV, just remove it silently.
476                  * There is no good way to pass the error to the newly
477                  * created socket, and POSIX does not want network
478                  * errors returned from accept().
479                  */
480                 inet_csk_reqsk_queue_drop(sk, req, prev);
481                 goto out;
482
483         case TCP_SYN_SENT:
484         case TCP_SYN_RECV:  /* Cannot happen.
 485                                It can happen, e.g., if SYNs crossed.
486                              */
487                 if (!sock_owned_by_user(sk)) {
488                         sk->sk_err = err;
489
490                         sk->sk_error_report(sk);
491
492                         tcp_done(sk);
493                 } else {
494                         sk->sk_err_soft = err;
495                 }
496                 goto out;
497         }
498
499         /* If we've already connected we will keep trying
500          * until we time out, or the user gives up.
501          *
 502          * rfc1122 4.2.3.9 allows us to consider as hard errors
 503          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 504          * but it is obsoleted by pmtu discovery).
 505          *
 506          * Note that in the modern internet, where routing is unreliable
 507          * and broken firewalls sit in every dark corner sending random
 508          * errors ordered by their masters, even these two messages finally lose
 509          * their original sense (even Linux sends invalid PORT_UNREACHs).
510          *
511          * Now we are in compliance with RFCs.
512          *                                                      --ANK (980905)
513          */
514
515         inet = inet_sk(sk);
516         if (!sock_owned_by_user(sk) && inet->recverr) {
517                 sk->sk_err = err;
518                 sk->sk_error_report(sk);
519         } else  { /* Only an error on timeout */
520                 sk->sk_err_soft = err;
521         }
522
523 out:
524         bh_unlock_sock(sk);
525         sock_put(sk);
526 }
527
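Note how the tail of tcp_v4_err() only promotes the ICMP-derived error to a hard error (sk->sk_err plus sk_error_report) when inet->recverr is set; otherwise it is parked in sk_err_soft and reported only if the connection eventually times out. inet->recverr corresponds to the IP_RECVERR socket option, sketched below (enable_recverr is a made-up helper):

#include <netinet/in.h>
#include <sys/socket.h>

static int enable_recverr(int fd)
{
        int on = 1;

        /* Ask for immediate reporting of ICMP errors on this socket. */
        return setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
}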
528 static void __tcp_v4_send_check(struct sk_buff *skb,
529                                 __be32 saddr, __be32 daddr)
530 {
531         struct tcphdr *th = tcp_hdr(skb);
532
533         if (skb->ip_summed == CHECKSUM_PARTIAL) {
534                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
535                 skb->csum_start = skb_transport_header(skb) - skb->head;
536                 skb->csum_offset = offsetof(struct tcphdr, check);
537         } else {
538                 th->check = tcp_v4_check(skb->len, saddr, daddr,
539                                          csum_partial(th,
540                                                       th->doff << 2,
541                                                       skb->csum));
542         }
543 }
544
545 /* This routine computes an IPv4 TCP checksum. */
546 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
547 {
548         struct inet_sock *inet = inet_sk(sk);
549
550         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
551 }
552 EXPORT_SYMBOL(tcp_v4_send_check);
553
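__tcp_v4_send_check() above either stashes only the pseudo-header complement for checksum offload (CHECKSUM_PARTIAL) or folds in the full segment. The pseudo-header itself is the usual RFC 793/1071 ones-complement sum over source address, destination address, the zero-padded protocol number and the TCP length. The sketch below restates that arithmetic in plain C; it assumes host-byte-order inputs and an even segment length, and is not the kernel's csum_tcpudp_nofold()/tcp_v4_check() implementation.

#include <stdint.h>

static uint16_t fold32(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

static uint16_t tcp4_pseudo_csum(uint32_t saddr, uint32_t daddr,
                                 uint16_t tcp_len,
                                 const uint16_t *seg, int words)
{
        uint32_t sum = 0;
        int i;

        sum += (saddr >> 16) + (saddr & 0xffff);        /* pseudo-header */
        sum += (daddr >> 16) + (daddr & 0xffff);
        sum += 6;                                       /* IPPROTO_TCP, zero-padded */
        sum += tcp_len;
        for (i = 0; i < words; i++)                     /* header + data, check field = 0 */
                sum += seg[i];
        return fold32(sum);
}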
554 int tcp_v4_gso_send_check(struct sk_buff *skb)
555 {
556         const struct iphdr *iph;
557         struct tcphdr *th;
558
559         if (!pskb_may_pull(skb, sizeof(*th)))
560                 return -EINVAL;
561
562         iph = ip_hdr(skb);
563         th = tcp_hdr(skb);
564
565         th->check = 0;
566         skb->ip_summed = CHECKSUM_PARTIAL;
567         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
568         return 0;
569 }
570
571 /*
572  *      This routine will send an RST to the other tcp.
573  *
574  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
575  *                    for reset.
 576  *      Answer: if a packet caused the RST, it is not for a socket
 577  *              existing in our system; if it is matched to a socket,
 578  *              it is just a duplicate segment or a bug in the other side's TCP.
 579  *              So we build the reply based only on the parameters
 580  *              that arrived with the segment.
581  *      Exception: precedence violation. We do not implement it in any case.
582  */
583
584 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
585 {
586         struct tcphdr *th = tcp_hdr(skb);
587         struct {
588                 struct tcphdr th;
589 #ifdef CONFIG_TCP_MD5SIG
590                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
591 #endif
592         } rep;
593         struct ip_reply_arg arg;
594 #ifdef CONFIG_TCP_MD5SIG
595         struct tcp_md5sig_key *key;
596 #endif
597         struct net *net;
598
599         /* Never send a reset in response to a reset. */
600         if (th->rst)
601                 return;
602
603         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
604                 return;
605
606         /* Swap the send and the receive. */
607         memset(&rep, 0, sizeof(rep));
608         rep.th.dest   = th->source;
609         rep.th.source = th->dest;
610         rep.th.doff   = sizeof(struct tcphdr) / 4;
611         rep.th.rst    = 1;
612
613         if (th->ack) {
614                 rep.th.seq = th->ack_seq;
615         } else {
616                 rep.th.ack = 1;
617                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
618                                        skb->len - (th->doff << 2));
619         }
620
621         memset(&arg, 0, sizeof(arg));
622         arg.iov[0].iov_base = (unsigned char *)&rep;
623         arg.iov[0].iov_len  = sizeof(rep.th);
624
625 #ifdef CONFIG_TCP_MD5SIG
626         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
627         if (key) {
628                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
629                                    (TCPOPT_NOP << 16) |
630                                    (TCPOPT_MD5SIG << 8) |
631                                    TCPOLEN_MD5SIG);
632                 /* Update length and the length the header thinks exists */
633                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
634                 rep.th.doff = arg.iov[0].iov_len / 4;
635
636                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
637                                      key, ip_hdr(skb)->saddr,
638                                      ip_hdr(skb)->daddr, &rep.th);
639         }
640 #endif
641         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
642                                       ip_hdr(skb)->saddr, /* XXX */
643                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
644         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
645         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
646
647         net = dev_net(skb_dst(skb)->dev);
648         ip_send_reply(net->ipv4.tcp_sock, skb,
649                       &arg, arg.iov[0].iov_len);
650
651         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
652         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
653 }
654
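A worked example of the non-ACK branch above, with assumed values: an incoming SYN with seq = 1000 and no payload yields ack_seq = 1000 + 1 (SYN) + 0 (FIN) + 0 (data) = 1001, while a bare 100-byte data segment with seq = 5000 yields ack_seq = 5000 + 0 + 0 + 100 = 5100. In both cases the RST acknowledges exactly what the peer transmitted; when the offending segment did carry an ACK, the RST instead reuses that ack_seq as its own seq.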
 655 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 656    outside of socket context, is certainly ugly. What can I do?
657  */
658
659 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
660                             u32 win, u32 ts, int oif,
661                             struct tcp_md5sig_key *key,
662                             int reply_flags)
663 {
664         struct tcphdr *th = tcp_hdr(skb);
665         struct {
666                 struct tcphdr th;
667                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
668 #ifdef CONFIG_TCP_MD5SIG
669                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
670 #endif
671                         ];
672         } rep;
673         struct ip_reply_arg arg;
674         struct net *net = dev_net(skb_dst(skb)->dev);
675
676         memset(&rep.th, 0, sizeof(struct tcphdr));
677         memset(&arg, 0, sizeof(arg));
678
679         arg.iov[0].iov_base = (unsigned char *)&rep;
680         arg.iov[0].iov_len  = sizeof(rep.th);
681         if (ts) {
682                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
683                                    (TCPOPT_TIMESTAMP << 8) |
684                                    TCPOLEN_TIMESTAMP);
685                 rep.opt[1] = htonl(tcp_time_stamp);
686                 rep.opt[2] = htonl(ts);
687                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
688         }
689
690         /* Swap the send and the receive. */
691         rep.th.dest    = th->source;
692         rep.th.source  = th->dest;
693         rep.th.doff    = arg.iov[0].iov_len / 4;
694         rep.th.seq     = htonl(seq);
695         rep.th.ack_seq = htonl(ack);
696         rep.th.ack     = 1;
697         rep.th.window  = htons(win);
698
699 #ifdef CONFIG_TCP_MD5SIG
700         if (key) {
701                 int offset = (ts) ? 3 : 0;
702
703                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
704                                           (TCPOPT_NOP << 16) |
705                                           (TCPOPT_MD5SIG << 8) |
706                                           TCPOLEN_MD5SIG);
707                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
708                 rep.th.doff = arg.iov[0].iov_len/4;
709
710                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
711                                     key, ip_hdr(skb)->saddr,
712                                     ip_hdr(skb)->daddr, &rep.th);
713         }
714 #endif
715         arg.flags = reply_flags;
716         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
717                                       ip_hdr(skb)->saddr, /* XXX */
718                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
719         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
720         if (oif)
721                 arg.bound_dev_if = oif;
722
723         ip_send_reply(net->ipv4.tcp_sock, skb,
724                       &arg, arg.iov[0].iov_len);
725
726         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
727 }
728
729 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
730 {
731         struct inet_timewait_sock *tw = inet_twsk(sk);
732         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
733
734         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
735                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
736                         tcptw->tw_ts_recent,
737                         tw->tw_bound_dev_if,
738                         tcp_twsk_md5_key(tcptw),
739                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
740                         );
741
742         inet_twsk_put(tw);
743 }
744
745 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
746                                   struct request_sock *req)
747 {
748         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
749                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
750                         req->ts_recent,
751                         0,
752                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
753                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
754 }
755
756 /*
757  *      Send a SYN-ACK after having received a SYN.
758  *      This still operates on a request_sock only, not on a big
759  *      socket.
760  */
761 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
762                               struct request_sock *req,
763                               struct request_values *rvp)
764 {
765         const struct inet_request_sock *ireq = inet_rsk(req);
766         int err = -1;
767         struct sk_buff * skb;
768
769         /* First, grab a route. */
770         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
771                 return -1;
772
773         skb = tcp_make_synack(sk, dst, req, rvp);
774
775         if (skb) {
776                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
777
778                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
779                                             ireq->rmt_addr,
780                                             ireq->opt);
781                 err = net_xmit_eval(err);
782         }
783
784         dst_release(dst);
785         return err;
786 }
787
788 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
789                               struct request_values *rvp)
790 {
791         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
792         return tcp_v4_send_synack(sk, NULL, req, rvp);
793 }
794
795 /*
796  *      IPv4 request_sock destructor.
797  */
798 static void tcp_v4_reqsk_destructor(struct request_sock *req)
799 {
800         kfree(inet_rsk(req)->opt);
801 }
802
803 static void syn_flood_warning(const struct sk_buff *skb)
804 {
805         const char *msg;
806
807 #ifdef CONFIG_SYN_COOKIES
808         if (sysctl_tcp_syncookies)
809                 msg = "Sending cookies";
810         else
811 #endif
812                 msg = "Dropping request";
813
814         pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
815                                 ntohs(tcp_hdr(skb)->dest), msg);
816 }
817
818 /*
819  * Save and compile IPv4 options into the request_sock if needed.
820  */
821 static struct ip_options *tcp_v4_save_options(struct sock *sk,
822                                               struct sk_buff *skb)
823 {
824         struct ip_options *opt = &(IPCB(skb)->opt);
825         struct ip_options *dopt = NULL;
826
827         if (opt && opt->optlen) {
828                 int opt_size = optlength(opt);
829                 dopt = kmalloc(opt_size, GFP_ATOMIC);
830                 if (dopt) {
831                         if (ip_options_echo(dopt, skb)) {
832                                 kfree(dopt);
833                                 dopt = NULL;
834                         }
835                 }
836         }
837         return dopt;
838 }
839
840 #ifdef CONFIG_TCP_MD5SIG
841 /*
842  * RFC2385 MD5 checksumming requires a mapping of
843  * IP address->MD5 Key.
844  * We need to maintain these in the sk structure.
845  */
846
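The per-destination keys described above are installed from userspace with the TCP_MD5SIG socket option, which is parsed below by tcp_v4_parse_md5_keys() and stored via tcp_v4_md5_do_add(). A hedged example (add_md5_peer is a made-up helper; the caller must keep keylen within TCP_MD5SIG_MAXKEYLEN):

#include <linux/tcp.h>          /* struct tcp_md5sig, TCP_MD5SIG */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int add_md5_peer(int fd, const struct sockaddr_in *peer,
                        const void *key, int keylen)
{
        struct tcp_md5sig md5 = { .tcpm_keylen = keylen };

        memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
        memcpy(md5.tcpm_key, key, keylen);
        return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}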
847 /* Find the Key structure for an address.  */
848 static struct tcp_md5sig_key *
849                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
850 {
851         struct tcp_sock *tp = tcp_sk(sk);
852         int i;
853
854         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
855                 return NULL;
856         for (i = 0; i < tp->md5sig_info->entries4; i++) {
857                 if (tp->md5sig_info->keys4[i].addr == addr)
858                         return &tp->md5sig_info->keys4[i].base;
859         }
860         return NULL;
861 }
862
863 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
864                                          struct sock *addr_sk)
865 {
866         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
867 }
868 EXPORT_SYMBOL(tcp_v4_md5_lookup);
869
870 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
871                                                       struct request_sock *req)
872 {
873         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
874 }
875
876 /* This can be called on a newly created socket, from other files */
877 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
878                       u8 *newkey, u8 newkeylen)
879 {
880         /* Add Key to the list */
881         struct tcp_md5sig_key *key;
882         struct tcp_sock *tp = tcp_sk(sk);
883         struct tcp4_md5sig_key *keys;
884
885         key = tcp_v4_md5_do_lookup(sk, addr);
886         if (key) {
887                 /* Pre-existing entry - just update that one. */
888                 kfree(key->key);
889                 key->key = newkey;
890                 key->keylen = newkeylen;
891         } else {
892                 struct tcp_md5sig_info *md5sig;
893
894                 if (!tp->md5sig_info) {
895                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
896                                                   GFP_ATOMIC);
897                         if (!tp->md5sig_info) {
898                                 kfree(newkey);
899                                 return -ENOMEM;
900                         }
901                         sk_nocaps_add(sk, NETIF_F_GSO_MASK);
902                 }
903                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
904                         kfree(newkey);
905                         return -ENOMEM;
906                 }
907                 md5sig = tp->md5sig_info;
908
909                 if (md5sig->alloced4 == md5sig->entries4) {
910                         keys = kmalloc((sizeof(*keys) *
911                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
912                         if (!keys) {
913                                 kfree(newkey);
914                                 tcp_free_md5sig_pool();
915                                 return -ENOMEM;
916                         }
917
918                         if (md5sig->entries4)
919                                 memcpy(keys, md5sig->keys4,
920                                        sizeof(*keys) * md5sig->entries4);
921
922                         /* Free old key list, and reference new one */
923                         kfree(md5sig->keys4);
924                         md5sig->keys4 = keys;
925                         md5sig->alloced4++;
926                 }
927                 md5sig->entries4++;
928                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
929                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
930                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
931         }
932         return 0;
933 }
934 EXPORT_SYMBOL(tcp_v4_md5_do_add);
935
936 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
937                                u8 *newkey, u8 newkeylen)
938 {
939         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
940                                  newkey, newkeylen);
941 }
942
943 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
944 {
945         struct tcp_sock *tp = tcp_sk(sk);
946         int i;
947
948         for (i = 0; i < tp->md5sig_info->entries4; i++) {
949                 if (tp->md5sig_info->keys4[i].addr == addr) {
950                         /* Free the key */
951                         kfree(tp->md5sig_info->keys4[i].base.key);
952                         tp->md5sig_info->entries4--;
953
954                         if (tp->md5sig_info->entries4 == 0) {
955                                 kfree(tp->md5sig_info->keys4);
956                                 tp->md5sig_info->keys4 = NULL;
957                                 tp->md5sig_info->alloced4 = 0;
958                         } else if (tp->md5sig_info->entries4 != i) {
959                                 /* Need to do some manipulation */
960                                 memmove(&tp->md5sig_info->keys4[i],
961                                         &tp->md5sig_info->keys4[i+1],
962                                         (tp->md5sig_info->entries4 - i) *
963                                          sizeof(struct tcp4_md5sig_key));
964                         }
965                         tcp_free_md5sig_pool();
966                         return 0;
967                 }
968         }
969         return -ENOENT;
970 }
971 EXPORT_SYMBOL(tcp_v4_md5_do_del);
972
973 static void tcp_v4_clear_md5_list(struct sock *sk)
974 {
975         struct tcp_sock *tp = tcp_sk(sk);
976
 977         /* Free each key, then the set of keys,
978          * the crypto element, and then decrement our
979          * hold on the last resort crypto.
980          */
981         if (tp->md5sig_info->entries4) {
982                 int i;
983                 for (i = 0; i < tp->md5sig_info->entries4; i++)
984                         kfree(tp->md5sig_info->keys4[i].base.key);
985                 tp->md5sig_info->entries4 = 0;
986                 tcp_free_md5sig_pool();
987         }
988         if (tp->md5sig_info->keys4) {
989                 kfree(tp->md5sig_info->keys4);
990                 tp->md5sig_info->keys4 = NULL;
991                 tp->md5sig_info->alloced4  = 0;
992         }
993 }
994
995 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
996                                  int optlen)
997 {
998         struct tcp_md5sig cmd;
999         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1000         u8 *newkey;
1001
1002         if (optlen < sizeof(cmd))
1003                 return -EINVAL;
1004
1005         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1006                 return -EFAULT;
1007
1008         if (sin->sin_family != AF_INET)
1009                 return -EINVAL;
1010
1011         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1012                 if (!tcp_sk(sk)->md5sig_info)
1013                         return -ENOENT;
1014                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1015         }
1016
1017         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1018                 return -EINVAL;
1019
1020         if (!tcp_sk(sk)->md5sig_info) {
1021                 struct tcp_sock *tp = tcp_sk(sk);
1022                 struct tcp_md5sig_info *p;
1023
1024                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1025                 if (!p)
1026                         return -EINVAL;
1027
1028                 tp->md5sig_info = p;
1029                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1030         }
1031
1032         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1033         if (!newkey)
1034                 return -ENOMEM;
1035         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1036                                  newkey, cmd.tcpm_keylen);
1037 }
1038
1039 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1040                                         __be32 daddr, __be32 saddr, int nbytes)
1041 {
1042         struct tcp4_pseudohdr *bp;
1043         struct scatterlist sg;
1044
1045         bp = &hp->md5_blk.ip4;
1046
1047         /*
1048          * 1. the TCP pseudo-header (in the order: source IP address,
1049          * destination IP address, zero-padded protocol number, and
1050          * segment length)
1051          */
1052         bp->saddr = saddr;
1053         bp->daddr = daddr;
1054         bp->pad = 0;
1055         bp->protocol = IPPROTO_TCP;
1056         bp->len = cpu_to_be16(nbytes);
1057
1058         sg_init_one(&sg, bp, sizeof(*bp));
1059         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1060 }
1061
1062 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1063                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1064 {
1065         struct tcp_md5sig_pool *hp;
1066         struct hash_desc *desc;
1067
1068         hp = tcp_get_md5sig_pool();
1069         if (!hp)
1070                 goto clear_hash_noput;
1071         desc = &hp->md5_desc;
1072
1073         if (crypto_hash_init(desc))
1074                 goto clear_hash;
1075         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1076                 goto clear_hash;
1077         if (tcp_md5_hash_header(hp, th))
1078                 goto clear_hash;
1079         if (tcp_md5_hash_key(hp, key))
1080                 goto clear_hash;
1081         if (crypto_hash_final(desc, md5_hash))
1082                 goto clear_hash;
1083
1084         tcp_put_md5sig_pool();
1085         return 0;
1086
1087 clear_hash:
1088         tcp_put_md5sig_pool();
1089 clear_hash_noput:
1090         memset(md5_hash, 0, 16);
1091         return 1;
1092 }
1093
1094 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1095                         struct sock *sk, struct request_sock *req,
1096                         struct sk_buff *skb)
1097 {
1098         struct tcp_md5sig_pool *hp;
1099         struct hash_desc *desc;
1100         struct tcphdr *th = tcp_hdr(skb);
1101         __be32 saddr, daddr;
1102
1103         if (sk) {
1104                 saddr = inet_sk(sk)->inet_saddr;
1105                 daddr = inet_sk(sk)->inet_daddr;
1106         } else if (req) {
1107                 saddr = inet_rsk(req)->loc_addr;
1108                 daddr = inet_rsk(req)->rmt_addr;
1109         } else {
1110                 const struct iphdr *iph = ip_hdr(skb);
1111                 saddr = iph->saddr;
1112                 daddr = iph->daddr;
1113         }
1114
1115         hp = tcp_get_md5sig_pool();
1116         if (!hp)
1117                 goto clear_hash_noput;
1118         desc = &hp->md5_desc;
1119
1120         if (crypto_hash_init(desc))
1121                 goto clear_hash;
1122
1123         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1124                 goto clear_hash;
1125         if (tcp_md5_hash_header(hp, th))
1126                 goto clear_hash;
1127         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1128                 goto clear_hash;
1129         if (tcp_md5_hash_key(hp, key))
1130                 goto clear_hash;
1131         if (crypto_hash_final(desc, md5_hash))
1132                 goto clear_hash;
1133
1134         tcp_put_md5sig_pool();
1135         return 0;
1136
1137 clear_hash:
1138         tcp_put_md5sig_pool();
1139 clear_hash_noput:
1140         memset(md5_hash, 0, 16);
1141         return 1;
1142 }
1143 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1144
1145 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1146 {
1147         /*
1148          * This gets called for each TCP segment that arrives
1149          * so we want to be efficient.
1150          * We have 3 drop cases:
1151          * o No MD5 hash and one expected.
1152          * o MD5 hash and we're not expecting one.
 1153          * o MD5 hash and it's wrong.
1154          */
1155         __u8 *hash_location = NULL;
1156         struct tcp_md5sig_key *hash_expected;
1157         const struct iphdr *iph = ip_hdr(skb);
1158         struct tcphdr *th = tcp_hdr(skb);
1159         int genhash;
1160         unsigned char newhash[16];
1161
1162         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1163         hash_location = tcp_parse_md5sig_option(th);
1164
1165         /* We've parsed the options - do we have a hash? */
1166         if (!hash_expected && !hash_location)
1167                 return 0;
1168
1169         if (hash_expected && !hash_location) {
1170                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1171                 return 1;
1172         }
1173
1174         if (!hash_expected && hash_location) {
1175                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1176                 return 1;
1177         }
1178
1179         /* Okay, so this is hash_expected and hash_location -
1180          * so we need to calculate the checksum.
1181          */
1182         genhash = tcp_v4_md5_hash_skb(newhash,
1183                                       hash_expected,
1184                                       NULL, NULL, skb);
1185
1186         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1187                 if (net_ratelimit()) {
1188                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1189                                &iph->saddr, ntohs(th->source),
1190                                &iph->daddr, ntohs(th->dest),
1191                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1192                 }
1193                 return 1;
1194         }
1195         return 0;
1196 }
1197
1198 #endif
1199
1200 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1201         .family         =       PF_INET,
1202         .obj_size       =       sizeof(struct tcp_request_sock),
1203         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1204         .send_ack       =       tcp_v4_reqsk_send_ack,
1205         .destructor     =       tcp_v4_reqsk_destructor,
1206         .send_reset     =       tcp_v4_send_reset,
1207         .syn_ack_timeout =      tcp_syn_ack_timeout,
1208 };
1209
1210 #ifdef CONFIG_TCP_MD5SIG
1211 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1212         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1213         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1214 };
1215 #endif
1216
1217 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1218 {
1219         struct tcp_extend_values tmp_ext;
1220         struct tcp_options_received tmp_opt;
1221         u8 *hash_location;
1222         struct request_sock *req;
1223         struct inet_request_sock *ireq;
1224         struct tcp_sock *tp = tcp_sk(sk);
1225         struct dst_entry *dst = NULL;
1226         __be32 saddr = ip_hdr(skb)->saddr;
1227         __be32 daddr = ip_hdr(skb)->daddr;
1228         __u32 isn = TCP_SKB_CB(skb)->when;
1229 #ifdef CONFIG_SYN_COOKIES
1230         int want_cookie = 0;
1231 #else
1232 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1233 #endif
1234
 1235         /* Never answer SYNs sent to broadcast or multicast */
1236         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1237                 goto drop;
1238
 1239         /* TW buckets are converted to open requests without
 1240          * limitations; they conserve resources and the peer is
 1241          * evidently a real one.
1242          */
1243         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1244                 if (net_ratelimit())
1245                         syn_flood_warning(skb);
1246 #ifdef CONFIG_SYN_COOKIES
1247                 if (sysctl_tcp_syncookies) {
1248                         want_cookie = 1;
1249                 } else
1250 #endif
1251                 goto drop;
1252         }
1253
1254         /* Accept backlog is full. If we have already queued enough
 1255          * warm entries in the syn queue, drop the request. That is better than
 1256          * clogging the syn queue with openreqs with exponentially increasing
 1257          * timeouts.
1258          */
1259         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1260                 goto drop;
1261
1262         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1263         if (!req)
1264                 goto drop;
1265
1266 #ifdef CONFIG_TCP_MD5SIG
1267         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1268 #endif
1269
1270         tcp_clear_options(&tmp_opt);
1271         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1272         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1273         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1274
1275         if (tmp_opt.cookie_plus > 0 &&
1276             tmp_opt.saw_tstamp &&
1277             !tp->rx_opt.cookie_out_never &&
1278             (sysctl_tcp_cookie_size > 0 ||
1279              (tp->cookie_values != NULL &&
1280               tp->cookie_values->cookie_desired > 0))) {
1281                 u8 *c;
1282                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1283                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1284
1285                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1286                         goto drop_and_release;
1287
1288                 /* Secret recipe starts with IP addresses */
1289                 *mess++ ^= (__force u32)daddr;
1290                 *mess++ ^= (__force u32)saddr;
1291
1292                 /* plus variable length Initiator Cookie */
1293                 c = (u8 *)mess;
1294                 while (l-- > 0)
1295                         *c++ ^= *hash_location++;
1296
1297 #ifdef CONFIG_SYN_COOKIES
1298                 want_cookie = 0;        /* not our kind of cookie */
1299 #endif
1300                 tmp_ext.cookie_out_never = 0; /* false */
1301                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1302         } else if (!tp->rx_opt.cookie_in_always) {
1303                 /* redundant indications, but ensure initialization. */
1304                 tmp_ext.cookie_out_never = 1; /* true */
1305                 tmp_ext.cookie_plus = 0;
1306         } else {
1307                 goto drop_and_release;
1308         }
1309         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1310
1311         if (want_cookie && !tmp_opt.saw_tstamp)
1312                 tcp_clear_options(&tmp_opt);
1313
1314         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1315         tcp_openreq_init(req, &tmp_opt, skb);
1316
1317         ireq = inet_rsk(req);
1318         ireq->loc_addr = daddr;
1319         ireq->rmt_addr = saddr;
1320         ireq->no_srccheck = inet_sk(sk)->transparent;
1321         ireq->opt = tcp_v4_save_options(sk, skb);
1322
1323         if (security_inet_conn_request(sk, skb, req))
1324                 goto drop_and_free;
1325
1326         if (!want_cookie || tmp_opt.tstamp_ok)
1327                 TCP_ECN_create_request(req, tcp_hdr(skb));
1328
1329         if (want_cookie) {
1330                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1331                 req->cookie_ts = tmp_opt.tstamp_ok;
1332         } else if (!isn) {
1333                 struct inet_peer *peer = NULL;
1334
1335                 /* VJ's idea. We save last timestamp seen
1336                  * from the destination in peer table, when entering
1337                  * state TIME-WAIT, and check against it before
1338                  * accepting new connection request.
1339                  *
1340                  * If "isn" is not zero, this request hit alive
1341                  * timewait bucket, so that all the necessary checks
1342                  * are made in the function processing timewait state.
1343                  */
1344                 if (tmp_opt.saw_tstamp &&
1345                     tcp_death_row.sysctl_tw_recycle &&
1346                     (dst = inet_csk_route_req(sk, req)) != NULL &&
1347                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1348                     peer->daddr.addr.a4 == saddr) {
1349                         inet_peer_refcheck(peer);
1350                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1351                             (s32)(peer->tcp_ts - req->ts_recent) >
1352                                                         TCP_PAWS_WINDOW) {
1353                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1354                                 goto drop_and_release;
1355                         }
1356                 }
1357                 /* Kill the following clause, if you dislike this way. */
1358                 else if (!sysctl_tcp_syncookies &&
1359                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1360                           (sysctl_max_syn_backlog >> 2)) &&
1361                          (!peer || !peer->tcp_ts_stamp) &&
1362                          (!dst || !dst_metric(dst, RTAX_RTT))) {
 1363                         /* Without syncookies the last quarter of the
 1364                          * backlog is filled with destinations
 1365                          * proven to be alive.
 1366                          * It means that we continue to communicate
 1367                          * with destinations already remembered
 1368                          * at the moment of the synflood.
1369                          */
1370                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1371                                        &saddr, ntohs(tcp_hdr(skb)->source));
1372                         goto drop_and_release;
1373                 }
1374
1375                 isn = tcp_v4_init_sequence(skb);
1376         }
1377         tcp_rsk(req)->snt_isn = isn;
1378
1379         if (tcp_v4_send_synack(sk, dst, req,
1380                                (struct request_values *)&tmp_ext) ||
1381             want_cookie)
1382                 goto drop_and_free;
1383
1384         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1385         return 0;
1386
1387 drop_and_release:
1388         dst_release(dst);
1389 drop_and_free:
1390         reqsk_free(req);
1391 drop:
1392         return 0;
1393 }
1394 EXPORT_SYMBOL(tcp_v4_conn_request);
1395
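Both "queue full" checks in tcp_v4_conn_request() map onto userspace-visible limits: the accept backlog tested by sk_acceptq_is_full() comes from the listen(2) backlog argument (clamped by net.core.somaxconn), while the SYN queue tested by inet_csk_reqsk_queue_is_full() is bounded by net.ipv4.tcp_max_syn_backlog. A minimal listener sketch (listen_example and port 8080 are placeholders):

#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int listen_example(void)
{
        struct sockaddr_in addr = {
                .sin_family = AF_INET,
                .sin_port   = htons(8080),
                .sin_addr   = { .s_addr = INADDR_ANY },
        };
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
                return -1;
        /* 128 is the accept backlog that sk_acceptq_is_full() enforces. */
        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
            listen(fd, 128) < 0) {
                close(fd);
                return -1;
        }
        return fd;
}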
1396
1397 /*
1398  * The three way handshake has completed - we got a valid synack -
1399  * now create the new socket.
1400  */
1401 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1402                                   struct request_sock *req,
1403                                   struct dst_entry *dst)
1404 {
1405         struct inet_request_sock *ireq;
1406         struct inet_sock *newinet;
1407         struct tcp_sock *newtp;
1408         struct sock *newsk;
1409 #ifdef CONFIG_TCP_MD5SIG
1410         struct tcp_md5sig_key *key;
1411 #endif
1412
1413         if (sk_acceptq_is_full(sk))
1414                 goto exit_overflow;
1415
1416         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1417                 goto exit;
1418
1419         newsk = tcp_create_openreq_child(sk, req, skb);
1420         if (!newsk)
1421                 goto exit_nonewsk;
1422
1423         newsk->sk_gso_type = SKB_GSO_TCPV4;
1424         sk_setup_caps(newsk, dst);
1425
1426         newtp                 = tcp_sk(newsk);
1427         newinet               = inet_sk(newsk);
1428         ireq                  = inet_rsk(req);
1429         newinet->inet_daddr   = ireq->rmt_addr;
1430         newinet->inet_rcv_saddr = ireq->loc_addr;
1431         newinet->inet_saddr           = ireq->loc_addr;
1432         newinet->opt          = ireq->opt;
1433         ireq->opt             = NULL;
1434         newinet->mc_index     = inet_iif(skb);
1435         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1436         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1437         if (newinet->opt)
1438                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1439         newinet->inet_id = newtp->write_seq ^ jiffies;
1440
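             /* The IP options length recorded in icsk_ext_hdr_len above is
              * accounted for in the MSS/MTU calculations that follow.
              */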
1441         tcp_mtup_init(newsk);
1442         tcp_sync_mss(newsk, dst_mtu(dst));
1443         newtp->advmss = dst_metric_advmss(dst);
1444         if (tcp_sk(sk)->rx_opt.user_mss &&
1445             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1446                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1447
1448         tcp_initialize_rcv_mss(newsk);
1449
1450 #ifdef CONFIG_TCP_MD5SIG
1451         /* Copy over the MD5 key from the original socket */
1452         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1453         if (key != NULL) {
1454                 /*
1455                  * We're using one, so create a matching key
1456                  * on the newsk structure. If we fail to get
1457                  * memory, then we end up not copying the key
1458                  * across. Shucks.
1459                  */
1460                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1461                 if (newkey != NULL)
1462                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1463                                           newkey, key->keylen);
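                     /* The MD5 signature covers the payload, so segmentation
                      * offload cannot be used on this socket.
                      */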
1464                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1465         }
1466 #endif
1467
1468         if (__inet_inherit_port(sk, newsk) < 0) {
1469                 sock_put(newsk);
1470                 goto exit;
1471         }
1472         __inet_hash_nolisten(newsk, NULL);
1473
1474         return newsk;
1475
1476 exit_overflow:
1477         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1478 exit_nonewsk:
1479         dst_release(dst);
1480 exit:
1481         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1482         return NULL;
1483 }
1484 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1485
1486 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1487 {
1488         struct tcphdr *th = tcp_hdr(skb);
1489         const struct iphdr *iph = ip_hdr(skb);
1490         struct sock *nsk;
1491         struct request_sock **prev;
1492         /* Find possible connection requests. */
1493         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1494                                                        iph->saddr, iph->daddr);
1495         if (req)
1496                 return tcp_check_req(sk, skb, req, prev);
1497
1498         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1499                         th->source, iph->daddr, th->dest, inet_iif(skb));
1500
1501         if (nsk) {
1502                 if (nsk->sk_state != TCP_TIME_WAIT) {
1503                         bh_lock_sock(nsk);
1504                         return nsk;
1505                 }
1506                 inet_twsk_put(inet_twsk(nsk));
1507                 return NULL;
1508         }
1509
1510 #ifdef CONFIG_SYN_COOKIES
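             /* A bare ACK (no SYN) reaching a listener may be the final ACK of
              * a syncookie handshake; try to reconstruct the request from the
              * cookie.
              */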
1511         if (!th->syn)
1512                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1513 #endif
1514         return sk;
1515 }
1516
1517 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1518 {
1519         const struct iphdr *iph = ip_hdr(skb);
1520
1521         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1522                 if (!tcp_v4_check(skb->len, iph->saddr,
1523                                   iph->daddr, skb->csum)) {
1524                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1525                         return 0;
1526                 }
1527         }
1528
1529         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1530                                        skb->len, IPPROTO_TCP, 0);
1531
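             /* Short packets are cheap to verify immediately; for longer ones
              * we only stash the pseudo-header sum here and let the full
              * checksum be completed later.  The 76-byte cut-off looks like a
              * heuristic threshold.
              */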
1532         if (skb->len <= 76) {
1533                 return __skb_checksum_complete(skb);
1534         }
1535         return 0;
1536 }
1537
1538
1539 /* The socket must have its spinlock held when we get
1540  * here.
1541  *
1542  * We have a potential double-lock case here, so even when
1543  * doing backlog processing we use the BH locking scheme.
1544  * This is because we cannot sleep with the original spinlock
1545  * held.
1546  */
1547 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1548 {
1549         struct sock *rsk;
1550 #ifdef CONFIG_TCP_MD5SIG
1551         /*
1552          * We really want to reject the packet as early as possible
1553          * if:
1554          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1555          *  o There is an MD5 option and we're not expecting one
1556          */
1557         if (tcp_v4_inbound_md5_hash(sk, skb))
1558                 goto discard;
1559 #endif
1560
1561         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1562                 sock_rps_save_rxhash(sk, skb->rxhash);
1563                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1564                         rsk = sk;
1565                         goto reset;
1566                 }
1567                 return 0;
1568         }
1569
1570         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1571                 goto csum_err;
1572
1573         if (sk->sk_state == TCP_LISTEN) {
1574                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1575                 if (!nsk)
1576                         goto discard;
1577
1578                 if (nsk != sk) {
1579                         if (tcp_child_process(sk, nsk, skb)) {
1580                                 rsk = nsk;
1581                                 goto reset;
1582                         }
1583                         return 0;
1584                 }
1585         } else
1586                 sock_rps_save_rxhash(sk, skb->rxhash);
1587
1588         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1589                 rsk = sk;
1590                 goto reset;
1591         }
1592         return 0;
1593
1594 reset:
1595         tcp_v4_send_reset(rsk, skb);
1596 discard:
1597         kfree_skb(skb);
1598         /* Be careful here. If this function gets more complicated and
1599          * gcc suffers from register pressure on the x86, sk (in %ebx)
1600          * might be destroyed here. The current version compiles correctly,
1601          * but you have been warned.
1602          */
1603         return 0;
1604
1605 csum_err:
1606         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1607         goto discard;
1608 }
1609 EXPORT_SYMBOL(tcp_v4_do_rcv);
1610
1611 /*
1612  *      From tcp_input.c
1613  */
1614
1615 int tcp_v4_rcv(struct sk_buff *skb)
1616 {
1617         const struct iphdr *iph;
1618         struct tcphdr *th;
1619         struct sock *sk;
1620         int ret;
1621         struct net *net = dev_net(skb->dev);
1622
1623         if (skb->pkt_type != PACKET_HOST)
1624                 goto discard_it;
1625
1626         /* Count it even if it's bad */
1627         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1628
1629         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1630                 goto discard_it;
1631
1632         th = tcp_hdr(skb);
1633
1634         if (th->doff < sizeof(struct tcphdr) / 4)
1635                 goto bad_packet;
1636         if (!pskb_may_pull(skb, th->doff * 4))
1637                 goto discard_it;
1638
1639         /* Packet length and doff are validated later by header
1640          * prediction, provided the th->doff == 0 case has been
1641          * eliminated above, so we can defer the remaining
1642          * checks until then. */
1643         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1644                 goto bad_packet;
1645
1646         th = tcp_hdr(skb);
1647         iph = ip_hdr(skb);
1648         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1649         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1650                                     skb->len - th->doff * 4);
1651         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1652         TCP_SKB_CB(skb)->when    = 0;
1653         TCP_SKB_CB(skb)->flags   = iph->tos;
1654         TCP_SKB_CB(skb)->sacked  = 0;
1655
1656         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1657         if (!sk)
1658                 goto no_tcp_socket;
1659
1660 process:
1661         if (sk->sk_state == TCP_TIME_WAIT)
1662                 goto do_time_wait;
1663
1664         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1665                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1666                 goto discard_and_relse;
1667         }
1668
1669         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1670                 goto discard_and_relse;
1671         nf_reset(skb);
1672
1673         if (sk_filter(sk, skb))
1674                 goto discard_and_relse;
1675
1676         skb->dev = NULL;
1677
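             /* Three delivery paths from here: process in softirq context when
              * the socket is not owned by a user task, queue to the prequeue
              * when a reader is waiting, or fall back to the socket backlog.
              */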
1678         bh_lock_sock_nested(sk);
1679         ret = 0;
1680         if (!sock_owned_by_user(sk)) {
1681 #ifdef CONFIG_NET_DMA
1682                 struct tcp_sock *tp = tcp_sk(sk);
1683                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1684                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1685                 if (tp->ucopy.dma_chan)
1686                         ret = tcp_v4_do_rcv(sk, skb);
1687                 else
1688 #endif
1689                 {
1690                         if (!tcp_prequeue(sk, skb))
1691                                 ret = tcp_v4_do_rcv(sk, skb);
1692                 }
1693         } else if (unlikely(sk_add_backlog(sk, skb))) {
1694                 bh_unlock_sock(sk);
1695                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1696                 goto discard_and_relse;
1697         }
1698         bh_unlock_sock(sk);
1699
1700         sock_put(sk);
1701
1702         return ret;
1703
1704 no_tcp_socket:
1705         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1706                 goto discard_it;
1707
1708         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1709 bad_packet:
1710                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1711         } else {
1712                 tcp_v4_send_reset(NULL, skb);
1713         }
1714
1715 discard_it:
1716         /* Discard frame. */
1717         kfree_skb(skb);
1718         return 0;
1719
1720 discard_and_relse:
1721         sock_put(sk);
1722         goto discard_it;
1723
1724 do_time_wait:
1725         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1726                 inet_twsk_put(inet_twsk(sk));
1727                 goto discard_it;
1728         }
1729
1730         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1731                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1732                 inet_twsk_put(inet_twsk(sk));
1733                 goto discard_it;
1734         }
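             /* tcp_timewait_state_process() decides what to do with a segment
              * hitting a TIME_WAIT socket: hand a new SYN to a listener
              * (TCP_TW_SYN), re-ACK (TCP_TW_ACK), reset (TCP_TW_RST) or
              * silently drop (TCP_TW_SUCCESS).
              */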
1735         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1736         case TCP_TW_SYN: {
1737                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1738                                                         &tcp_hashinfo,
1739                                                         iph->daddr, th->dest,
1740                                                         inet_iif(skb));
1741                 if (sk2) {
1742                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1743                         inet_twsk_put(inet_twsk(sk));
1744                         sk = sk2;
1745                         goto process;
1746                 }
1747                 /* Fall through to ACK */
1748         }
1749         case TCP_TW_ACK:
1750                 tcp_v4_timewait_ack(sk, skb);
1751                 break;
1752         case TCP_TW_RST:
1753                 goto no_tcp_socket;
1754         case TCP_TW_SUCCESS:;
1755         }
1756         goto discard_it;
1757 }
1758
1759 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1760 {
1761         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1762         struct inet_sock *inet = inet_sk(sk);
1763         struct inet_peer *peer;
1764
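             /* If the cached route still matches the destination, reuse the
              * peer bound to it (no extra reference for the caller to drop);
              * otherwise look the peer up directly, in which case the caller
              * must release it.
              */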
1765         if (!rt || rt->rt_dst != inet->inet_daddr) {
1766                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1767                 *release_it = true;
1768         } else {
1769                 if (!rt->peer)
1770                         rt_bind_peer(rt, 1);
1771                 peer = rt->peer;
1772                 *release_it = false;
1773         }
1774
1775         return peer;
1776 }
1777 EXPORT_SYMBOL(tcp_v4_get_peer);
1778
1779 void *tcp_v4_tw_get_peer(struct sock *sk)
1780 {
1781         struct inet_timewait_sock *tw = inet_twsk(sk);
1782
1783         return inet_getpeer_v4(tw->tw_daddr, 1);
1784 }
1785 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1786
1787 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1788         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1789         .twsk_unique    = tcp_twsk_unique,
1790         .twsk_destructor= tcp_twsk_destructor,
1791         .twsk_getpeer   = tcp_v4_tw_get_peer,
1792 };
1793
1794 const struct inet_connection_sock_af_ops ipv4_specific = {
1795         .queue_xmit        = ip_queue_xmit,
1796         .send_check        = tcp_v4_send_check,
1797         .rebuild_header    = inet_sk_rebuild_header,
1798         .conn_request      = tcp_v4_conn_request,
1799         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1800         .get_peer          = tcp_v4_get_peer,
1801         .net_header_len    = sizeof(struct iphdr),
1802         .setsockopt        = ip_setsockopt,
1803         .getsockopt        = ip_getsockopt,
1804         .addr2sockaddr     = inet_csk_addr2sockaddr,
1805         .sockaddr_len      = sizeof(struct sockaddr_in),
1806         .bind_conflict     = inet_csk_bind_conflict,
1807 #ifdef CONFIG_COMPAT
1808         .compat_setsockopt = compat_ip_setsockopt,
1809         .compat_getsockopt = compat_ip_getsockopt,
1810 #endif
1811 };
1812 EXPORT_SYMBOL(ipv4_specific);
1813
1814 #ifdef CONFIG_TCP_MD5SIG
1815 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1816         .md5_lookup             = tcp_v4_md5_lookup,
1817         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1818         .md5_add                = tcp_v4_md5_add_func,
1819         .md5_parse              = tcp_v4_parse_md5_keys,
1820 };
1821 #endif
1822
1823 /* NOTE: A lot of things are set to zero explicitly by the call to
1824  *       sk_alloc(), so they need not be done here.
1825  */
1826 static int tcp_v4_init_sock(struct sock *sk)
1827 {
1828         struct inet_connection_sock *icsk = inet_csk(sk);
1829         struct tcp_sock *tp = tcp_sk(sk);
1830
1831         skb_queue_head_init(&tp->out_of_order_queue);
1832         tcp_init_xmit_timers(sk);
1833         tcp_prequeue_init(tp);
1834
1835         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1836         tp->mdev = TCP_TIMEOUT_INIT;
1837
1838         /* So many TCP implementations out there (incorrectly) count the
1839          * initial SYN frame in their delayed-ACK and congestion control
1840          * algorithms that we must have the following bandaid to talk
1841          * efficiently to them.  -DaveM
1842          */
1843         tp->snd_cwnd = 2;
1844
1845         /* See draft-stevens-tcpca-spec-01 for discussion of the
1846          * initialization of these values.
1847          */
1848         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1849         tp->snd_cwnd_clamp = ~0;
1850         tp->mss_cache = TCP_MSS_DEFAULT;
1851
1852         tp->reordering = sysctl_tcp_reordering;
1853         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1854
1855         sk->sk_state = TCP_CLOSE;
1856
1857         sk->sk_write_space = sk_stream_write_space;
1858         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1859
1860         icsk->icsk_af_ops = &ipv4_specific;
1861         icsk->icsk_sync_mss = tcp_sync_mss;
1862 #ifdef CONFIG_TCP_MD5SIG
1863         tp->af_specific = &tcp_sock_ipv4_specific;
1864 #endif
1865
1866         /* TCP Cookie Transactions */
1867         if (sysctl_tcp_cookie_size > 0) {
1868                 /* Default, cookies without s_data_payload. */
1869                 tp->cookie_values =
1870                         kzalloc(sizeof(*tp->cookie_values),
1871                                 sk->sk_allocation);
1872                 if (tp->cookie_values != NULL)
1873                         kref_init(&tp->cookie_values->kref);
1874         }
1875         /* Presumed zeroed, in order of appearance:
1876          *      cookie_in_always, cookie_out_never,
1877          *      s_data_constant, s_data_in, s_data_out
1878          */
1879         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1880         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1881
1882         local_bh_disable();
1883         percpu_counter_inc(&tcp_sockets_allocated);
1884         local_bh_enable();
1885
1886         return 0;
1887 }
1888
1889 void tcp_v4_destroy_sock(struct sock *sk)
1890 {
1891         struct tcp_sock *tp = tcp_sk(sk);
1892
1893         tcp_clear_xmit_timers(sk);
1894
1895         tcp_cleanup_congestion_control(sk);
1896
1897         /* Clean up the write buffer. */
1898         tcp_write_queue_purge(sk);
1899
1900         /* Cleans up our, hopefully empty, out_of_order_queue. */
1901         __skb_queue_purge(&tp->out_of_order_queue);
1902
1903 #ifdef CONFIG_TCP_MD5SIG
1904         /* Clean up the MD5 key list, if any */
1905         if (tp->md5sig_info) {
1906                 tcp_v4_clear_md5_list(sk);
1907                 kfree(tp->md5sig_info);
1908                 tp->md5sig_info = NULL;
1909         }
1910 #endif
1911
1912 #ifdef CONFIG_NET_DMA
1913         /* Cleans up our sk_async_wait_queue */
1914         __skb_queue_purge(&sk->sk_async_wait_queue);
1915 #endif
1916
1917         /* Clean up the prequeue; it should already be empty. */
1918         __skb_queue_purge(&tp->ucopy.prequeue);
1919
1920         /* Clean up a referenced TCP bind bucket. */
1921         if (inet_csk(sk)->icsk_bind_hash)
1922                 inet_put_port(sk);
1923
1924         /*
1925          * If sendmsg cached page exists, toss it.
1926          */
1927         if (sk->sk_sndmsg_page) {
1928                 __free_page(sk->sk_sndmsg_page);
1929                 sk->sk_sndmsg_page = NULL;
1930         }
1931
1932         /* TCP Cookie Transactions */
1933         if (tp->cookie_values != NULL) {
1934                 kref_put(&tp->cookie_values->kref,
1935                          tcp_cookie_values_release);
1936                 tp->cookie_values = NULL;
1937         }
1938
1939         percpu_counter_dec(&tcp_sockets_allocated);
1940 }
1941 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1942
1943 #ifdef CONFIG_PROC_FS
1944 /* Proc filesystem TCP sock list dumping. */
1945
1946 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1947 {
1948         return hlist_nulls_empty(head) ? NULL :
1949                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1950 }
1951
1952 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1953 {
1954         return !is_a_nulls(tw->tw_node.next) ?
1955                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1956 }
1957
1958 /*
1959  * Get the next listener socket following cur.  If cur is NULL, get the
1960  * first socket starting from the bucket given in st->bucket; when
1961  * st->bucket is zero the very first socket in the hash table is returned.
1962  */
1963 static void *listening_get_next(struct seq_file *seq, void *cur)
1964 {
1965         struct inet_connection_sock *icsk;
1966         struct hlist_nulls_node *node;
1967         struct sock *sk = cur;
1968         struct inet_listen_hashbucket *ilb;
1969         struct tcp_iter_state *st = seq->private;
1970         struct net *net = seq_file_net(seq);
1971
1972         if (!sk) {
1973                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1974                 spin_lock_bh(&ilb->lock);
1975                 sk = sk_nulls_head(&ilb->head);
1976                 st->offset = 0;
1977                 goto get_sk;
1978         }
1979         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1980         ++st->num;
1981         ++st->offset;
1982
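             /* When the previous element was an open request, resume walking
              * the listener's SYN/request table; otherwise continue along the
              * listening hash chain, diving into each listener's request queue
              * as we go.
              */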
1983         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1984                 struct request_sock *req = cur;
1985
1986                 icsk = inet_csk(st->syn_wait_sk);
1987                 req = req->dl_next;
1988                 while (1) {
1989                         while (req) {
1990                                 if (req->rsk_ops->family == st->family) {
1991                                         cur = req;
1992                                         goto out;
1993                                 }
1994                                 req = req->dl_next;
1995                         }
1996                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1997                                 break;
1998 get_req:
1999                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2000                 }
2001                 sk        = sk_nulls_next(st->syn_wait_sk);
2002                 st->state = TCP_SEQ_STATE_LISTENING;
2003                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2004         } else {
2005                 icsk = inet_csk(sk);
2006                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2007                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2008                         goto start_req;
2009                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2010                 sk = sk_nulls_next(sk);
2011         }
2012 get_sk:
2013         sk_nulls_for_each_from(sk, node) {
2014                 if (!net_eq(sock_net(sk), net))
2015                         continue;
2016                 if (sk->sk_family == st->family) {
2017                         cur = sk;
2018                         goto out;
2019                 }
2020                 icsk = inet_csk(sk);
2021                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2022                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2023 start_req:
2024                         st->uid         = sock_i_uid(sk);
2025                         st->syn_wait_sk = sk;
2026                         st->state       = TCP_SEQ_STATE_OPENREQ;
2027                         st->sbucket     = 0;
2028                         goto get_req;
2029                 }
2030                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2031         }
2032         spin_unlock_bh(&ilb->lock);
2033         st->offset = 0;
2034         if (++st->bucket < INET_LHTABLE_SIZE) {
2035                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2036                 spin_lock_bh(&ilb->lock);
2037                 sk = sk_nulls_head(&ilb->head);
2038                 goto get_sk;
2039         }
2040         cur = NULL;
2041 out:
2042         return cur;
2043 }
2044
2045 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2046 {
2047         struct tcp_iter_state *st = seq->private;
2048         void *rc;
2049
2050         st->bucket = 0;
2051         st->offset = 0;
2052         rc = listening_get_next(seq, NULL);
2053
2054         while (rc && *pos) {
2055                 rc = listening_get_next(seq, rc);
2056                 --*pos;
2057         }
2058         return rc;
2059 }
2060
2061 static inline int empty_bucket(struct tcp_iter_state *st)
2062 {
2063         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2064                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2065 }
2066
2067 /*
2068  * Get the first established socket, starting from the bucket given in st->bucket.
2069  * If st->bucket is zero, the very first socket in the hash is returned.
2070  */
2071 static void *established_get_first(struct seq_file *seq)
2072 {
2073         struct tcp_iter_state *st = seq->private;
2074         struct net *net = seq_file_net(seq);
2075         void *rc = NULL;
2076
2077         st->offset = 0;
2078         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2079                 struct sock *sk;
2080                 struct hlist_nulls_node *node;
2081                 struct inet_timewait_sock *tw;
2082                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2083
2084                 /* Lockless fast path for the common case of empty buckets */
2085                 if (empty_bucket(st))
2086                         continue;
2087
2088                 spin_lock_bh(lock);
2089                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2090                         if (sk->sk_family != st->family ||
2091                             !net_eq(sock_net(sk), net)) {
2092                                 continue;
2093                         }
2094                         rc = sk;
2095                         goto out;
2096                 }
2097                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2098                 inet_twsk_for_each(tw, node,
2099                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2100                         if (tw->tw_family != st->family ||
2101                             !net_eq(twsk_net(tw), net)) {
2102                                 continue;
2103                         }
2104                         rc = tw;
2105                         goto out;
2106                 }
2107                 spin_unlock_bh(lock);
2108                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2109         }
2110 out:
2111         return rc;
2112 }
2113
2114 static void *established_get_next(struct seq_file *seq, void *cur)
2115 {
2116         struct sock *sk = cur;
2117         struct inet_timewait_sock *tw;
2118         struct hlist_nulls_node *node;
2119         struct tcp_iter_state *st = seq->private;
2120         struct net *net = seq_file_net(seq);
2121
2122         ++st->num;
2123         ++st->offset;
2124
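             /* Each ehash bucket is walked in two passes: established sockets
              * on the main chain first, then TIME_WAIT sockets on the twchain.
              */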
2125         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2126                 tw = cur;
2127                 tw = tw_next(tw);
2128 get_tw:
2129                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2130                         tw = tw_next(tw);
2131                 }
2132                 if (tw) {
2133                         cur = tw;
2134                         goto out;
2135                 }
2136                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2137                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2138
2139                 /* Look for the next non-empty bucket */
2140                 st->offset = 0;
2141                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2142                                 empty_bucket(st))
2143                         ;
2144                 if (st->bucket > tcp_hashinfo.ehash_mask)
2145                         return NULL;
2146
2147                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2148                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2149         } else
2150                 sk = sk_nulls_next(sk);
2151
2152         sk_nulls_for_each_from(sk, node) {
2153                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2154                         goto found;
2155         }
2156
2157         st->state = TCP_SEQ_STATE_TIME_WAIT;
2158         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2159         goto get_tw;
2160 found:
2161         cur = sk;
2162 out:
2163         return cur;
2164 }
2165
2166 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2167 {
2168         struct tcp_iter_state *st = seq->private;
2169         void *rc;
2170
2171         st->bucket = 0;
2172         rc = established_get_first(seq);
2173
2174         while (rc && pos) {
2175                 rc = established_get_next(seq, rc);
2176                 --pos;
2177         }
2178         return rc;
2179 }
2180
2181 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2182 {
2183         void *rc;
2184         struct tcp_iter_state *st = seq->private;
2185
2186         st->state = TCP_SEQ_STATE_LISTENING;
2187         rc        = listening_get_idx(seq, &pos);
2188
2189         if (!rc) {
2190                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2191                 rc        = established_get_idx(seq, pos);
2192         }
2193
2194         return rc;
2195 }
2196
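     /* Re-position the iterator to the element recorded in st->bucket and
      * st->offset, so a restarted read of the seq_file does not have to rescan
      * from the beginning.
      */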
2197 static void *tcp_seek_last_pos(struct seq_file *seq)
2198 {
2199         struct tcp_iter_state *st = seq->private;
2200         int offset = st->offset;
2201         int orig_num = st->num;
2202         void *rc = NULL;
2203
2204         switch (st->state) {
2205         case TCP_SEQ_STATE_OPENREQ:
2206         case TCP_SEQ_STATE_LISTENING:
2207                 if (st->bucket >= INET_LHTABLE_SIZE)
2208                         break;
2209                 st->state = TCP_SEQ_STATE_LISTENING;
2210                 rc = listening_get_next(seq, NULL);
2211                 while (offset-- && rc)
2212                         rc = listening_get_next(seq, rc);
2213                 if (rc)
2214                         break;
2215                 st->bucket = 0;
2216                 /* Fallthrough */
2217         case TCP_SEQ_STATE_ESTABLISHED:
2218         case TCP_SEQ_STATE_TIME_WAIT:
2219                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2220                 if (st->bucket > tcp_hashinfo.ehash_mask)
2221                         break;
2222                 rc = established_get_first(seq);
2223                 while (offset-- && rc)
2224                         rc = established_get_next(seq, rc);
2225         }
2226
2227         st->num = orig_num;
2228
2229         return rc;
2230 }
2231
2232 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2233 {
2234         struct tcp_iter_state *st = seq->private;
2235         void *rc;
2236
2237         if (*pos && *pos == st->last_pos) {
2238                 rc = tcp_seek_last_pos(seq);
2239                 if (rc)
2240                         goto out;
2241         }
2242
2243         st->state = TCP_SEQ_STATE_LISTENING;
2244         st->num = 0;
2245         st->bucket = 0;
2246         st->offset = 0;
2247         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2248
2249 out:
2250         st->last_pos = *pos;
2251         return rc;
2252 }
2253
2254 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2255 {
2256         struct tcp_iter_state *st = seq->private;
2257         void *rc = NULL;
2258
2259         if (v == SEQ_START_TOKEN) {
2260                 rc = tcp_get_idx(seq, 0);
2261                 goto out;
2262         }
2263
2264         switch (st->state) {
2265         case TCP_SEQ_STATE_OPENREQ:
2266         case TCP_SEQ_STATE_LISTENING:
2267                 rc = listening_get_next(seq, v);
2268                 if (!rc) {
2269                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2270                         st->bucket = 0;
2271                         st->offset = 0;
2272                         rc        = established_get_first(seq);
2273                 }
2274                 break;
2275         case TCP_SEQ_STATE_ESTABLISHED:
2276         case TCP_SEQ_STATE_TIME_WAIT:
2277                 rc = established_get_next(seq, v);
2278                 break;
2279         }
2280 out:
2281         ++*pos;
2282         st->last_pos = *pos;
2283         return rc;
2284 }
2285
2286 static void tcp_seq_stop(struct seq_file *seq, void *v)
2287 {
2288         struct tcp_iter_state *st = seq->private;
2289
2290         switch (st->state) {
2291         case TCP_SEQ_STATE_OPENREQ:
2292                 if (v) {
2293                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2294                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2295                 }
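                     /* Fall through: the listening-hash bucket lock is still
                      * held and must be released as well.
                      */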
2296         case TCP_SEQ_STATE_LISTENING:
2297                 if (v != SEQ_START_TOKEN)
2298                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2299                 break;
2300         case TCP_SEQ_STATE_TIME_WAIT:
2301         case TCP_SEQ_STATE_ESTABLISHED:
2302                 if (v)
2303                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2304                 break;
2305         }
2306 }
2307
2308 static int tcp_seq_open(struct inode *inode, struct file *file)
2309 {
2310         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2311         struct tcp_iter_state *s;
2312         int err;
2313
2314         err = seq_open_net(inode, file, &afinfo->seq_ops,
2315                           sizeof(struct tcp_iter_state));
2316         if (err < 0)
2317                 return err;
2318
2319         s = ((struct seq_file *)file->private_data)->private;
2320         s->family               = afinfo->family;
2321         s->last_pos             = 0;
2322         return 0;
2323 }
2324
2325 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2326 {
2327         int rc = 0;
2328         struct proc_dir_entry *p;
2329
2330         afinfo->seq_fops.open           = tcp_seq_open;
2331         afinfo->seq_fops.read           = seq_read;
2332         afinfo->seq_fops.llseek         = seq_lseek;
2333         afinfo->seq_fops.release        = seq_release_net;
2334
2335         afinfo->seq_ops.start           = tcp_seq_start;
2336         afinfo->seq_ops.next            = tcp_seq_next;
2337         afinfo->seq_ops.stop            = tcp_seq_stop;
2338
2339         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2340                              &afinfo->seq_fops, afinfo);
2341         if (!p)
2342                 rc = -ENOMEM;
2343         return rc;
2344 }
2345 EXPORT_SYMBOL(tcp_proc_register);
2346
2347 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2348 {
2349         proc_net_remove(net, afinfo->name);
2350 }
2351 EXPORT_SYMBOL(tcp_proc_unregister);
2352
2353 static void get_openreq4(struct sock *sk, struct request_sock *req,
2354                          struct seq_file *f, int i, int uid, int *len)
2355 {
2356         const struct inet_request_sock *ireq = inet_rsk(req);
2357         int ttd = req->expires - jiffies;
2358
2359         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2360                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2361                 i,
2362                 ireq->loc_addr,
2363                 ntohs(inet_sk(sk)->inet_sport),
2364                 ireq->rmt_addr,
2365                 ntohs(ireq->rmt_port),
2366                 TCP_SYN_RECV,
2367                 0, 0, /* could print option size, but that is af dependent. */
2368                 1,    /* timers active (only the expire timer) */
2369                 jiffies_to_clock_t(ttd),
2370                 req->retrans,
2371                 uid,
2372                 0,  /* non standard timer */
2373                 0, /* open_requests have no inode */
2374                 atomic_read(&sk->sk_refcnt),
2375                 req,
2376                 len);
2377 }
2378
2379 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2380 {
2381         int timer_active;
2382         unsigned long timer_expires;
2383         struct tcp_sock *tp = tcp_sk(sk);
2384         const struct inet_connection_sock *icsk = inet_csk(sk);
2385         struct inet_sock *inet = inet_sk(sk);
2386         __be32 dest = inet->inet_daddr;
2387         __be32 src = inet->inet_rcv_saddr;
2388         __u16 destp = ntohs(inet->inet_dport);
2389         __u16 srcp = ntohs(inet->inet_sport);
2390         int rx_queue;
2391
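             /* timer_active encodes which timer is pending for the "tm" column
              * of /proc/net/tcp: 1 retransmit, 2 another timer such as
              * keepalive (sk_timer), 4 zero-window probe, 0 none.
              */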
2392         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2393                 timer_active    = 1;
2394                 timer_expires   = icsk->icsk_timeout;
2395         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2396                 timer_active    = 4;
2397                 timer_expires   = icsk->icsk_timeout;
2398         } else if (timer_pending(&sk->sk_timer)) {
2399                 timer_active    = 2;
2400                 timer_expires   = sk->sk_timer.expires;
2401         } else {
2402                 timer_active    = 0;
2403                 timer_expires = jiffies;
2404         }
2405
2406         if (sk->sk_state == TCP_LISTEN)
2407                 rx_queue = sk->sk_ack_backlog;
2408         else
2409                 /*
2410                  * Because we don't lock the socket, we might find a transient negative value.
2411                  */
2412                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2413
2414         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2415                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2416                 i, src, srcp, dest, destp, sk->sk_state,
2417                 tp->write_seq - tp->snd_una,
2418                 rx_queue,
2419                 timer_active,
2420                 jiffies_to_clock_t(timer_expires - jiffies),
2421                 icsk->icsk_retransmits,
2422                 sock_i_uid(sk),
2423                 icsk->icsk_probes_out,
2424                 sock_i_ino(sk),
2425                 atomic_read(&sk->sk_refcnt), sk,
2426                 jiffies_to_clock_t(icsk->icsk_rto),
2427                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2428                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2429                 tp->snd_cwnd,
2430                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2431                 len);
2432 }
2433
2434 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2435                                struct seq_file *f, int i, int *len)
2436 {
2437         __be32 dest, src;
2438         __u16 destp, srcp;
2439         int ttd = tw->tw_ttd - jiffies;
2440
2441         if (ttd < 0)
2442                 ttd = 0;
2443
2444         dest  = tw->tw_daddr;
2445         src   = tw->tw_rcv_saddr;
2446         destp = ntohs(tw->tw_dport);
2447         srcp  = ntohs(tw->tw_sport);
2448
2449         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2450                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2451                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2452                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2453                 atomic_read(&tw->tw_refcnt), tw, len);
2454 }
2455
2456 #define TMPSZ 150
2457
2458 static int tcp4_seq_show(struct seq_file *seq, void *v)
2459 {
2460         struct tcp_iter_state *st;
2461         int len;
2462
2463         if (v == SEQ_START_TOKEN) {
2464                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2465                            "  sl  local_address rem_address   st tx_queue "
2466                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2467                            "inode");
2468                 goto out;
2469         }
2470         st = seq->private;
2471
2472         switch (st->state) {
2473         case TCP_SEQ_STATE_LISTENING:
2474         case TCP_SEQ_STATE_ESTABLISHED:
2475                 get_tcp4_sock(v, seq, st->num, &len);
2476                 break;
2477         case TCP_SEQ_STATE_OPENREQ:
2478                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2479                 break;
2480         case TCP_SEQ_STATE_TIME_WAIT:
2481                 get_timewait4_sock(v, seq, st->num, &len);
2482                 break;
2483         }
2484         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2485 out:
2486         return 0;
2487 }
2488
2489 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2490         .name           = "tcp",
2491         .family         = AF_INET,
2492         .seq_fops       = {
2493                 .owner          = THIS_MODULE,
2494         },
2495         .seq_ops        = {
2496                 .show           = tcp4_seq_show,
2497         },
2498 };
2499
2500 static int __net_init tcp4_proc_init_net(struct net *net)
2501 {
2502         return tcp_proc_register(net, &tcp4_seq_afinfo);
2503 }
2504
2505 static void __net_exit tcp4_proc_exit_net(struct net *net)
2506 {
2507         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2508 }
2509
2510 static struct pernet_operations tcp4_net_ops = {
2511         .init = tcp4_proc_init_net,
2512         .exit = tcp4_proc_exit_net,
2513 };
2514
2515 int __init tcp4_proc_init(void)
2516 {
2517         return register_pernet_subsys(&tcp4_net_ops);
2518 }
2519
2520 void tcp4_proc_exit(void)
2521 {
2522         unregister_pernet_subsys(&tcp4_net_ops);
2523 }
2524 #endif /* CONFIG_PROC_FS */
2525
2526 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2527 {
2528         struct iphdr *iph = skb_gro_network_header(skb);
2529
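             /* GRO only merges segments whose checksum is known to be good:
              * verify CHECKSUM_COMPLETE here, and refuse to aggregate (flush)
              * when no checksum information is available.
              */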
2530         switch (skb->ip_summed) {
2531         case CHECKSUM_COMPLETE:
2532                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2533                                   skb->csum)) {
2534                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2535                         break;
2536                 }
2537
2538                 /* fall through */
2539         case CHECKSUM_NONE:
2540                 NAPI_GRO_CB(skb)->flush = 1;
2541                 return NULL;
2542         }
2543
2544         return tcp_gro_receive(head, skb);
2545 }
2546
2547 int tcp4_gro_complete(struct sk_buff *skb)
2548 {
2549         struct iphdr *iph = ip_hdr(skb);
2550         struct tcphdr *th = tcp_hdr(skb);
2551
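             /* Set th->check to the pseudo-header checksum, as for a
              * CHECKSUM_PARTIAL packet, so the merged super-packet can later be
              * resegmented and checksummed by GSO.
              */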
2552         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2553                                   iph->saddr, iph->daddr, 0);
2554         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2555
2556         return tcp_gro_complete(skb);
2557 }
2558
2559 struct proto tcp_prot = {
2560         .name                   = "TCP",
2561         .owner                  = THIS_MODULE,
2562         .close                  = tcp_close,
2563         .connect                = tcp_v4_connect,
2564         .disconnect             = tcp_disconnect,
2565         .accept                 = inet_csk_accept,
2566         .ioctl                  = tcp_ioctl,
2567         .init                   = tcp_v4_init_sock,
2568         .destroy                = tcp_v4_destroy_sock,
2569         .shutdown               = tcp_shutdown,
2570         .setsockopt             = tcp_setsockopt,
2571         .getsockopt             = tcp_getsockopt,
2572         .recvmsg                = tcp_recvmsg,
2573         .sendmsg                = tcp_sendmsg,
2574         .sendpage               = tcp_sendpage,
2575         .backlog_rcv            = tcp_v4_do_rcv,
2576         .hash                   = inet_hash,
2577         .unhash                 = inet_unhash,
2578         .get_port               = inet_csk_get_port,
2579         .enter_memory_pressure  = tcp_enter_memory_pressure,
2580         .sockets_allocated      = &tcp_sockets_allocated,
2581         .orphan_count           = &tcp_orphan_count,
2582         .memory_allocated       = &tcp_memory_allocated,
2583         .memory_pressure        = &tcp_memory_pressure,
2584         .sysctl_mem             = sysctl_tcp_mem,
2585         .sysctl_wmem            = sysctl_tcp_wmem,
2586         .sysctl_rmem            = sysctl_tcp_rmem,
2587         .max_header             = MAX_TCP_HEADER,
2588         .obj_size               = sizeof(struct tcp_sock),
2589         .slab_flags             = SLAB_DESTROY_BY_RCU,
2590         .twsk_prot              = &tcp_timewait_sock_ops,
2591         .rsk_prot               = &tcp_request_sock_ops,
2592         .h.hashinfo             = &tcp_hashinfo,
2593         .no_autobind            = true,
2594 #ifdef CONFIG_COMPAT
2595         .compat_setsockopt      = compat_tcp_setsockopt,
2596         .compat_getsockopt      = compat_tcp_getsockopt,
2597 #endif
2598 };
2599 EXPORT_SYMBOL(tcp_prot);
2600
2601
2602 static int __net_init tcp_sk_init(struct net *net)
2603 {
2604         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2605                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2606 }
2607
2608 static void __net_exit tcp_sk_exit(struct net *net)
2609 {
2610         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2611 }
2612
2613 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2614 {
2615         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2616 }
2617
2618 static struct pernet_operations __net_initdata tcp_sk_ops = {
2619        .init       = tcp_sk_init,
2620        .exit       = tcp_sk_exit,
2621        .exit_batch = tcp_sk_exit_batch,
2622 };
2623
2624 void __init tcp_v4_init(void)
2625 {
2626         inet_hashinfo_init(&tcp_hashinfo);
2627         if (register_pernet_subsys(&tcp_sk_ops))
2628                 panic("Failed to create the TCP control socket.\n");
2629 }