net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
 44  *      Vitaly E. Lavrov        :       Transparent proxy revived after a
 45  *                                      year-long coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
 49  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
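/*
 * Both knobs above are runtime tunables exposed through sysctl. A minimal
 * sketch of toggling them from userspace (the usual net.ipv4 proc paths are
 * assumed here):
 *
 *	echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 *	echo 1 > /proc/sys/net/ipv4/tcp_low_latency
 */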
91
92
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100
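/* Derive the initial sequence number for a new connection from the address
 * and port pairs of the incoming segment, using the kernel's keyed hash in
 * secure_tcp_sequence_number().
 */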
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104                                           ip_hdr(skb)->saddr,
105                                           tcp_hdr(skb)->dest,
106                                           tcp_hdr(skb)->source);
107 }
108
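/* Decide whether a colliding TIME-WAIT socket may be reused for a new
 * connection. Returns 1 when reuse is safe, seeding write_seq and the
 * remembered timestamp from the timewait state, and holds a reference on
 * the timewait socket.
 */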
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112         struct tcp_sock *tp = tcp_sk(sk);
113
 114         /* With PAWS, it is safe from the viewpoint
 115            of data integrity. Even without PAWS it is safe provided sequence
 116            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 117
 118            Actually, the idea is close to VJ's: the timestamp cache is
 119            held not per host but per port pair, and the TW bucket is used
 120            as the state holder.
 121
 122            If the TW bucket has already been destroyed we fall back to
 123            VJ's scheme and use the initial timestamp retrieved from the
 124            peer table. */
125         if (tcptw->tw_ts_recent_stamp &&
126             (twp == NULL || (sysctl_tcp_tw_reuse &&
127                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
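                /* Start the new connection's sequence space beyond anything
                 * the old connection could have sent: tw_snd_nxt plus a
                 * maximal 65535-byte window (plus 2); a result of 0 is
                 * bumped to 1 below.
                 */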
128                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129                 if (tp->write_seq == 0)
130                         tp->write_seq = 1;
131                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
132                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133                 sock_hold(sktw);
134                 return 1;
135         }
136
137         return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140
141 /* This will initiate an outgoing connection. */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145         struct inet_sock *inet = inet_sk(sk);
146         struct tcp_sock *tp = tcp_sk(sk);
147         __be16 orig_sport, orig_dport;
148         __be32 daddr, nexthop;
149         struct flowi4 *fl4;
150         struct rtable *rt;
151         int err;
152         struct ip_options_rcu *inet_opt;
153
154         if (addr_len < sizeof(struct sockaddr_in))
155                 return -EINVAL;
156
157         if (usin->sin_family != AF_INET)
158                 return -EAFNOSUPPORT;
159
160         nexthop = daddr = usin->sin_addr.s_addr;
161         inet_opt = rcu_dereference_protected(inet->inet_opt,
162                                              sock_owned_by_user(sk));
163         if (inet_opt && inet_opt->opt.srr) {
164                 if (!daddr)
165                         return -EINVAL;
166                 nexthop = inet_opt->opt.faddr;
167         }
168
169         orig_sport = inet->inet_sport;
170         orig_dport = usin->sin_port;
171         fl4 = &inet->cork.fl.u.ip4;
172         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174                               IPPROTO_TCP,
175                               orig_sport, orig_dport, sk, true);
176         if (IS_ERR(rt)) {
177                 err = PTR_ERR(rt);
178                 if (err == -ENETUNREACH)
179                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180                 return err;
181         }
182
183         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184                 ip_rt_put(rt);
185                 return -ENETUNREACH;
186         }
187
188         if (!inet_opt || !inet_opt->opt.srr)
189                 daddr = fl4->daddr;
190
191         if (!inet->inet_saddr)
192                 inet->inet_saddr = fl4->saddr;
193         inet->inet_rcv_saddr = inet->inet_saddr;
194
195         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196                 /* Reset inherited state */
197                 tp->rx_opt.ts_recent       = 0;
198                 tp->rx_opt.ts_recent_stamp = 0;
199                 tp->write_seq              = 0;
200         }
201
202         if (tcp_death_row.sysctl_tw_recycle &&
203             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
204                 struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
 205                 /*
 206                  * VJ's idea. We save the last timestamp seen from
 207                  * the destination in the peer table when entering state
 208                  * TIME-WAIT, and initialize rx_opt.ts_recent from it
 209                  * when trying a new connection.
 210                  */
211                 if (peer) {
212                         inet_peer_refcheck(peer);
213                         if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
214                                 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
215                                 tp->rx_opt.ts_recent = peer->tcp_ts;
216                         }
217                 }
218         }
219
220         inet->inet_dport = usin->sin_port;
221         inet->inet_daddr = daddr;
222
223         inet_csk(sk)->icsk_ext_hdr_len = 0;
224         if (inet_opt)
225                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
226
227         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
228
229         /* Socket identity is still unknown (sport may be zero).
 230          * However we set the state to SYN-SENT and, without releasing the
 231          * socket lock, select a source port, enter ourselves into the hash tables and
232          * complete initialization after this.
233          */
234         tcp_set_state(sk, TCP_SYN_SENT);
235         err = inet_hash_connect(&tcp_death_row, sk);
236         if (err)
237                 goto failure;
238
239         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
240                                inet->inet_sport, inet->inet_dport, sk);
241         if (IS_ERR(rt)) {
242                 err = PTR_ERR(rt);
243                 rt = NULL;
244                 goto failure;
245         }
246         /* OK, now commit destination to socket.  */
247         sk->sk_gso_type = SKB_GSO_TCPV4;
248         sk_setup_caps(sk, &rt->dst);
249
250         if (!tp->write_seq)
251                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
252                                                            inet->inet_daddr,
253                                                            inet->inet_sport,
254                                                            usin->sin_port);
255
256         inet->inet_id = tp->write_seq ^ jiffies;
257
258         err = tcp_connect(sk);
259         rt = NULL;
260         if (err)
261                 goto failure;
262
263         return 0;
264
265 failure:
266         /*
267          * This unhashes the socket and releases the local port,
268          * if necessary.
269          */
270         tcp_set_state(sk, TCP_CLOSE);
271         ip_rt_put(rt);
272         sk->sk_route_caps = 0;
273         inet->inet_dport = 0;
274         return err;
275 }
276 EXPORT_SYMBOL(tcp_v4_connect);
277
278 /*
279  * This routine does path mtu discovery as defined in RFC1191.
280  */
281 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
282 {
283         struct dst_entry *dst;
284         struct inet_sock *inet = inet_sk(sk);
285
286         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 287          * sent out by Linux are always < 576 bytes, so they should go through
288          * unfragmented).
289          */
290         if (sk->sk_state == TCP_LISTEN)
291                 return;
292
 293         /* We don't check in the dst entry if pmtu discovery is forbidden
 294          * on this route. We just assume that no packet-too-big packets
 295          * are sent back when pmtu discovery is not active.
296          * There is a small race when the user changes this flag in the
297          * route, but I think that's acceptable.
298          */
299         if ((dst = __sk_dst_check(sk, 0)) == NULL)
300                 return;
301
302         dst->ops->update_pmtu(dst, mtu);
303
 304         /* Something is about to go wrong... Remember the soft error
 305          * in case this connection is not able to recover.
306          */
307         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
308                 sk->sk_err_soft = EMSGSIZE;
309
310         mtu = dst_mtu(dst);
311
312         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
313             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
314                 tcp_sync_mss(sk, mtu);
315
316                 /* Resend the TCP packet because it's
317                  * clear that the old packet has been
318                  * dropped. This is the new "fast" path mtu
319                  * discovery.
320                  */
321                 tcp_simple_retransmit(sk);
322         } /* else let the usual retransmit timer handle it */
323 }
324
325 /*
326  * This routine is called by the ICMP module when it gets some
327  * sort of error condition.  If err < 0 then the socket should
328  * be closed and the error returned to the user.  If err > 0
329  * it's just the icmp type << 8 | icmp code.  After adjustment
330  * header points to the first 8 bytes of the tcp header.  We need
331  * to find the appropriate port.
332  *
333  * The locking strategy used here is very "optimistic". When
334  * someone else accesses the socket the ICMP is just dropped
335  * and for some paths there is no check at all.
336  * A more general error queue to queue errors for later handling
337  * is probably better.
338  *
339  */
340
341 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
342 {
343         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
344         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
345         struct inet_connection_sock *icsk;
346         struct tcp_sock *tp;
347         struct inet_sock *inet;
348         const int type = icmp_hdr(icmp_skb)->type;
349         const int code = icmp_hdr(icmp_skb)->code;
350         struct sock *sk;
351         struct sk_buff *skb;
352         __u32 seq;
353         __u32 remaining;
354         int err;
355         struct net *net = dev_net(icmp_skb->dev);
356
357         if (icmp_skb->len < (iph->ihl << 2) + 8) {
358                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
359                 return;
360         }
361
362         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
363                         iph->saddr, th->source, inet_iif(icmp_skb));
364         if (!sk) {
365                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
366                 return;
367         }
368         if (sk->sk_state == TCP_TIME_WAIT) {
369                 inet_twsk_put(inet_twsk(sk));
370                 return;
371         }
372
373         bh_lock_sock(sk);
374         /* If too many ICMPs get dropped on busy
375          * servers this needs to be solved differently.
376          */
377         if (sock_owned_by_user(sk))
378                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
379
380         if (sk->sk_state == TCP_CLOSE)
381                 goto out;
382
383         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
384                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
385                 goto out;
386         }
387
388         icsk = inet_csk(sk);
389         tp = tcp_sk(sk);
390         seq = ntohl(th->seq);
391         if (sk->sk_state != TCP_LISTEN &&
392             !between(seq, tp->snd_una, tp->snd_nxt)) {
393                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
394                 goto out;
395         }
396
397         switch (type) {
398         case ICMP_SOURCE_QUENCH:
399                 /* Just silently ignore these. */
400                 goto out;
401         case ICMP_PARAMETERPROB:
402                 err = EPROTO;
403                 break;
404         case ICMP_DEST_UNREACH:
405                 if (code > NR_ICMP_UNREACH)
406                         goto out;
407
408                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
409                         if (!sock_owned_by_user(sk))
410                                 do_pmtu_discovery(sk, iph, info);
411                         goto out;
412                 }
413
414                 err = icmp_err_convert[code].errno;
415                 /* check if icmp_skb allows revert of backoff
416                  * (see draft-zimmermann-tcp-lcd) */
417                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
418                         break;
419                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420                     !icsk->icsk_backoff)
421                         break;
422
423                 if (sock_owned_by_user(sk))
424                         break;
425
426                 icsk->icsk_backoff--;
427                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
428                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
429                 tcp_bound_rto(sk);
430
431                 skb = tcp_write_queue_head(sk);
432                 BUG_ON(!skb);
433
434                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
435                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
436
437                 if (remaining) {
438                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439                                                   remaining, TCP_RTO_MAX);
440                 } else {
441                         /* RTO revert clocked out retransmission.
442                          * Will retransmit now */
443                         tcp_retransmit_timer(sk);
444                 }
445
446                 break;
447         case ICMP_TIME_EXCEEDED:
448                 err = EHOSTUNREACH;
449                 break;
450         default:
451                 goto out;
452         }
453
454         switch (sk->sk_state) {
455                 struct request_sock *req, **prev;
456         case TCP_LISTEN:
457                 if (sock_owned_by_user(sk))
458                         goto out;
459
460                 req = inet_csk_search_req(sk, &prev, th->dest,
461                                           iph->daddr, iph->saddr);
462                 if (!req)
463                         goto out;
464
465                 /* ICMPs are not backlogged, hence we cannot get
466                    an established socket here.
467                  */
468                 WARN_ON(req->sk);
469
470                 if (seq != tcp_rsk(req)->snt_isn) {
471                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
472                         goto out;
473                 }
474
475                 /*
476                  * Still in SYN_RECV, just remove it silently.
477                  * There is no good way to pass the error to the newly
478                  * created socket, and POSIX does not want network
479                  * errors returned from accept().
480                  */
481                 inet_csk_reqsk_queue_drop(sk, req, prev);
482                 goto out;
483
484         case TCP_SYN_SENT:
485         case TCP_SYN_RECV:  /* Cannot happen.
 486                                It can, for example, if SYNs crossed.
487                              */
488                 if (!sock_owned_by_user(sk)) {
489                         sk->sk_err = err;
490
491                         sk->sk_error_report(sk);
492
493                         tcp_done(sk);
494                 } else {
495                         sk->sk_err_soft = err;
496                 }
497                 goto out;
498         }
499
500         /* If we've already connected we will keep trying
501          * until we time out, or the user gives up.
502          *
 503          * rfc1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
 504          * to be considered hard errors (well, FRAG_FAILED too,
 505          * but it is obsoleted by pmtu discovery).
 506          *
 507          * Note that in the modern internet, where routing is unreliable
 508          * and broken firewalls sit in every dark corner sending random
 509          * errors ordered by their masters, even these two messages have lost
 510          * their original sense (even Linux sends invalid PORT_UNREACHs).
511          *
512          * Now we are in compliance with RFCs.
513          *                                                      --ANK (980905)
514          */
515
516         inet = inet_sk(sk);
517         if (!sock_owned_by_user(sk) && inet->recverr) {
518                 sk->sk_err = err;
519                 sk->sk_error_report(sk);
520         } else  { /* Only an error on timeout */
521                 sk->sk_err_soft = err;
522         }
523
524 out:
525         bh_unlock_sock(sk);
526         sock_put(sk);
527 }
528
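/* Fill in the TCP checksum field. For CHECKSUM_PARTIAL skbs only the
 * pseudo-header sum is stored and csum_start/csum_offset tell the device
 * (or later software) where to complete it; otherwise the checksum is
 * finished here from the header and the payload sum already accumulated
 * in skb->csum.
 */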
529 static void __tcp_v4_send_check(struct sk_buff *skb,
530                                 __be32 saddr, __be32 daddr)
531 {
532         struct tcphdr *th = tcp_hdr(skb);
533
534         if (skb->ip_summed == CHECKSUM_PARTIAL) {
535                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
536                 skb->csum_start = skb_transport_header(skb) - skb->head;
537                 skb->csum_offset = offsetof(struct tcphdr, check);
538         } else {
539                 th->check = tcp_v4_check(skb->len, saddr, daddr,
540                                          csum_partial(th,
541                                                       th->doff << 2,
542                                                       skb->csum));
543         }
544 }
545
546 /* This routine computes an IPv4 TCP checksum. */
547 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
548 {
549         const struct inet_sock *inet = inet_sk(sk);
550
551         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
552 }
553 EXPORT_SYMBOL(tcp_v4_send_check);
554
555 int tcp_v4_gso_send_check(struct sk_buff *skb)
556 {
557         const struct iphdr *iph;
558         struct tcphdr *th;
559
560         if (!pskb_may_pull(skb, sizeof(*th)))
561                 return -EINVAL;
562
563         iph = ip_hdr(skb);
564         th = tcp_hdr(skb);
565
566         th->check = 0;
567         skb->ip_summed = CHECKSUM_PARTIAL;
568         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
569         return 0;
570 }
571
572 /*
573  *      This routine will send an RST to the other tcp.
574  *
 575  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
 576  *                    for the reset?
 577  *      Answer: if a packet caused an RST, it is not for a socket
 578  *              existing in our system; if it is matched to a socket,
 579  *              it is just a duplicate segment or a bug in the other side's TCP.
 580  *              So we build the reply based only on the parameters that
 581  *              arrived with the segment.
582  *      Exception: precedence violation. We do not implement it in any case.
583  */
584
585 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
586 {
587         const struct tcphdr *th = tcp_hdr(skb);
588         struct {
589                 struct tcphdr th;
590 #ifdef CONFIG_TCP_MD5SIG
591                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
592 #endif
593         } rep;
594         struct ip_reply_arg arg;
595 #ifdef CONFIG_TCP_MD5SIG
596         struct tcp_md5sig_key *key;
597         const __u8 *hash_location = NULL;
598         unsigned char newhash[16];
599         int genhash;
600         struct sock *sk1 = NULL;
601 #endif
602         struct net *net;
603
604         /* Never send a reset in response to a reset. */
605         if (th->rst)
606                 return;
607
608         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
609                 return;
610
611         /* Swap the send and the receive. */
612         memset(&rep, 0, sizeof(rep));
613         rep.th.dest   = th->source;
614         rep.th.source = th->dest;
615         rep.th.doff   = sizeof(struct tcphdr) / 4;
616         rep.th.rst    = 1;
617
618         if (th->ack) {
619                 rep.th.seq = th->ack_seq;
620         } else {
621                 rep.th.ack = 1;
622                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
623                                        skb->len - (th->doff << 2));
624         }
625
626         memset(&arg, 0, sizeof(arg));
627         arg.iov[0].iov_base = (unsigned char *)&rep;
628         arg.iov[0].iov_len  = sizeof(rep.th);
629
630 #ifdef CONFIG_TCP_MD5SIG
631         hash_location = tcp_parse_md5sig_option(th);
632         if (!sk && hash_location) {
633                 /*
 634                  * active side is lost. Try to find the listening socket through
 635                  * the source port, and then find the md5 key through the listening socket.
 636                  * We do not lose security here:
 637                  * the incoming packet is checked against the md5 hash using the key we find,
 638                  * and no RST is generated if the md5 hash doesn't match.
639                  */
640                 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
641                                              &tcp_hashinfo, ip_hdr(skb)->daddr,
642                                              ntohs(th->source), inet_iif(skb));
643                 /* don't send rst if it can't find key */
644                 if (!sk1)
645                         return;
646                 rcu_read_lock();
647                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
648                                         &ip_hdr(skb)->saddr, AF_INET);
649                 if (!key)
650                         goto release_sk1;
651
652                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
653                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
654                         goto release_sk1;
655         } else {
656                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
657                                              &ip_hdr(skb)->saddr,
658                                              AF_INET) : NULL;
659         }
660
661         if (key) {
662                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
663                                    (TCPOPT_NOP << 16) |
664                                    (TCPOPT_MD5SIG << 8) |
665                                    TCPOLEN_MD5SIG);
666                 /* Update length and the length the header thinks exists */
667                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
668                 rep.th.doff = arg.iov[0].iov_len / 4;
669
670                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
671                                      key, ip_hdr(skb)->saddr,
672                                      ip_hdr(skb)->daddr, &rep.th);
673         }
674 #endif
675         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
676                                       ip_hdr(skb)->saddr, /* XXX */
677                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
678         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
679         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 680         /* When the socket is gone, all binding information is lost.
 681          * Routing might fail in this case. No choice here: if we chose to force
 682          * the input interface, we would misroute in case of an asymmetric route.
683          */
684         if (sk)
685                 arg.bound_dev_if = sk->sk_bound_dev_if;
686
687         net = dev_net(skb_dst(skb)->dev);
688         arg.tos = ip_hdr(skb)->tos;
689         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
690                       &arg, arg.iov[0].iov_len);
691
692         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
693         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
694
695 #ifdef CONFIG_TCP_MD5SIG
696 release_sk1:
697         if (sk1) {
698                 rcu_read_unlock();
699                 sock_put(sk1);
700         }
701 #endif
702 }
703
 704 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 705    outside socket context, is certainly ugly. What can I do?
706  */
707
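/* Build and send a bare ACK; used for the TIME-WAIT and SYN-RECV replies
 * below. The opt[] array leaves room for an aligned timestamp option and,
 * when CONFIG_TCP_MD5SIG is set, an aligned MD5 signature option.
 */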
708 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
709                             u32 win, u32 ts, int oif,
710                             struct tcp_md5sig_key *key,
711                             int reply_flags, u8 tos)
712 {
713         const struct tcphdr *th = tcp_hdr(skb);
714         struct {
715                 struct tcphdr th;
716                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
717 #ifdef CONFIG_TCP_MD5SIG
718                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
719 #endif
720                         ];
721         } rep;
722         struct ip_reply_arg arg;
723         struct net *net = dev_net(skb_dst(skb)->dev);
724
725         memset(&rep.th, 0, sizeof(struct tcphdr));
726         memset(&arg, 0, sizeof(arg));
727
728         arg.iov[0].iov_base = (unsigned char *)&rep;
729         arg.iov[0].iov_len  = sizeof(rep.th);
730         if (ts) {
731                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
732                                    (TCPOPT_TIMESTAMP << 8) |
733                                    TCPOLEN_TIMESTAMP);
734                 rep.opt[1] = htonl(tcp_time_stamp);
735                 rep.opt[2] = htonl(ts);
736                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
737         }
738
739         /* Swap the send and the receive. */
740         rep.th.dest    = th->source;
741         rep.th.source  = th->dest;
742         rep.th.doff    = arg.iov[0].iov_len / 4;
743         rep.th.seq     = htonl(seq);
744         rep.th.ack_seq = htonl(ack);
745         rep.th.ack     = 1;
746         rep.th.window  = htons(win);
747
748 #ifdef CONFIG_TCP_MD5SIG
749         if (key) {
750                 int offset = (ts) ? 3 : 0;
751
752                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
753                                           (TCPOPT_NOP << 16) |
754                                           (TCPOPT_MD5SIG << 8) |
755                                           TCPOLEN_MD5SIG);
756                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
757                 rep.th.doff = arg.iov[0].iov_len/4;
758
759                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
760                                     key, ip_hdr(skb)->saddr,
761                                     ip_hdr(skb)->daddr, &rep.th);
762         }
763 #endif
764         arg.flags = reply_flags;
765         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
766                                       ip_hdr(skb)->saddr, /* XXX */
767                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
768         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
769         if (oif)
770                 arg.bound_dev_if = oif;
771         arg.tos = tos;
772         ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
773                       &arg, arg.iov[0].iov_len);
774
775         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
776 }
777
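/* Answer a segment that hit a TIME-WAIT socket with a bare ACK carrying the
 * timewait state's snd_nxt/rcv_nxt, the scaled receive window and the
 * remembered timestamp.
 */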
778 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
779 {
780         struct inet_timewait_sock *tw = inet_twsk(sk);
781         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
782
783         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
784                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
785                         tcptw->tw_ts_recent,
786                         tw->tw_bound_dev_if,
787                         tcp_twsk_md5_key(tcptw),
788                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
789                         tw->tw_tos
790                         );
791
792         inet_twsk_put(tw);
793 }
794
795 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
796                                   struct request_sock *req)
797 {
798         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
799                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
800                         req->ts_recent,
801                         0,
802                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
803                                           AF_INET),
804                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
805                         ip_hdr(skb)->tos);
806 }
807
808 /*
809  *      Send a SYN-ACK after having received a SYN.
810  *      This still operates on a request_sock only, not on a big
811  *      socket.
812  */
813 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
814                               struct request_sock *req,
815                               struct request_values *rvp)
816 {
817         const struct inet_request_sock *ireq = inet_rsk(req);
818         struct flowi4 fl4;
819         int err = -1;
820         struct sk_buff * skb;
821
822         /* First, grab a route. */
823         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
824                 return -1;
825
826         skb = tcp_make_synack(sk, dst, req, rvp);
827
828         if (skb) {
829                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
830
831                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
832                                             ireq->rmt_addr,
833                                             ireq->opt);
834                 err = net_xmit_eval(err);
835         }
836
837         dst_release(dst);
838         return err;
839 }
840
841 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
842                               struct request_values *rvp)
843 {
844         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
845         return tcp_v4_send_synack(sk, NULL, req, rvp);
846 }
847
848 /*
849  *      IPv4 request_sock destructor.
850  */
851 static void tcp_v4_reqsk_destructor(struct request_sock *req)
852 {
853         kfree(inet_rsk(req)->opt);
854 }
855
856 /*
857  * Return 1 if a syncookie should be sent
858  */
859 int tcp_syn_flood_action(struct sock *sk,
860                          const struct sk_buff *skb,
861                          const char *proto)
862 {
863         const char *msg = "Dropping request";
864         int want_cookie = 0;
865         struct listen_sock *lopt;
866
867
868
869 #ifdef CONFIG_SYN_COOKIES
870         if (sysctl_tcp_syncookies) {
871                 msg = "Sending cookies";
872                 want_cookie = 1;
873                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
874         } else
875 #endif
876                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
877
878         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
879         if (!lopt->synflood_warned) {
880                 lopt->synflood_warned = 1;
881                 pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
882                         proto, ntohs(tcp_hdr(skb)->dest), msg);
883         }
884         return want_cookie;
885 }
886 EXPORT_SYMBOL(tcp_syn_flood_action);
887
888 /*
889  * Save and compile IPv4 options into the request_sock if needed.
890  */
891 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
892                                                   struct sk_buff *skb)
893 {
894         const struct ip_options *opt = &(IPCB(skb)->opt);
895         struct ip_options_rcu *dopt = NULL;
896
897         if (opt && opt->optlen) {
898                 int opt_size = sizeof(*dopt) + opt->optlen;
899
900                 dopt = kmalloc(opt_size, GFP_ATOMIC);
901                 if (dopt) {
902                         if (ip_options_echo(&dopt->opt, skb)) {
903                                 kfree(dopt);
904                                 dopt = NULL;
905                         }
906                 }
907         }
908         return dopt;
909 }
910
911 #ifdef CONFIG_TCP_MD5SIG
912 /*
913  * RFC2385 MD5 checksumming requires a mapping of
914  * IP address->MD5 Key.
915  * We need to maintain these in the sk structure.
916  */
917
918 /* Find the Key structure for an address.  */
919 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
920                                          const union tcp_md5_addr *addr,
921                                          int family)
922 {
923         struct tcp_sock *tp = tcp_sk(sk);
924         struct tcp_md5sig_key *key;
925         struct hlist_node *pos;
926         unsigned int size = sizeof(struct in_addr);
927         struct tcp_md5sig_info *md5sig;
928
929         /* caller either holds rcu_read_lock() or socket lock */
930         md5sig = rcu_dereference_check(tp->md5sig_info,
931                                        sock_owned_by_user(sk) ||
932                                        lockdep_is_held(&sk->sk_lock.slock));
933         if (!md5sig)
934                 return NULL;
935 #if IS_ENABLED(CONFIG_IPV6)
936         if (family == AF_INET6)
937                 size = sizeof(struct in6_addr);
938 #endif
939         hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
940                 if (key->family != family)
941                         continue;
942                 if (!memcmp(&key->addr, addr, size))
943                         return key;
944         }
945         return NULL;
946 }
947 EXPORT_SYMBOL(tcp_md5_do_lookup);
948
949 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
950                                          struct sock *addr_sk)
951 {
952         union tcp_md5_addr *addr;
953
954         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
955         return tcp_md5_do_lookup(sk, addr, AF_INET);
956 }
957 EXPORT_SYMBOL(tcp_v4_md5_lookup);
958
959 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
960                                                       struct request_sock *req)
961 {
962         union tcp_md5_addr *addr;
963
964         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
965         return tcp_md5_do_lookup(sk, addr, AF_INET);
966 }
967
968 /* This can be called on a newly created socket, from other files */
969 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
970                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
971 {
972         /* Add Key to the list */
973         struct tcp_md5sig_key *key;
974         struct tcp_sock *tp = tcp_sk(sk);
975         struct tcp_md5sig_info *md5sig;
976
977         key = tcp_md5_do_lookup(sk, addr, family);
978         if (key) {
979                 /* Pre-existing entry - just update that one. */
980                 memcpy(key->key, newkey, newkeylen);
981                 key->keylen = newkeylen;
982                 return 0;
983         }
984
985         md5sig = rcu_dereference_protected(tp->md5sig_info,
986                                            sock_owned_by_user(sk));
987         if (!md5sig) {
988                 md5sig = kmalloc(sizeof(*md5sig), gfp);
989                 if (!md5sig)
990                         return -ENOMEM;
991
992                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
993                 INIT_HLIST_HEAD(&md5sig->head);
994                 rcu_assign_pointer(tp->md5sig_info, md5sig);
995         }
996
997         key = sock_kmalloc(sk, sizeof(*key), gfp);
998         if (!key)
999                 return -ENOMEM;
1000         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1001                 sock_kfree_s(sk, key, sizeof(*key));
1002                 return -ENOMEM;
1003         }
1004
1005         memcpy(key->key, newkey, newkeylen);
1006         key->keylen = newkeylen;
1007         key->family = family;
1008         memcpy(&key->addr, addr,
1009                (family == AF_INET6) ? sizeof(struct in6_addr) :
1010                                       sizeof(struct in_addr));
1011         hlist_add_head_rcu(&key->node, &md5sig->head);
1012         return 0;
1013 }
1014 EXPORT_SYMBOL(tcp_md5_do_add);
1015
1016 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1017 {
1018         struct tcp_sock *tp = tcp_sk(sk);
1019         struct tcp_md5sig_key *key;
1020         struct tcp_md5sig_info *md5sig;
1021
1022         key = tcp_md5_do_lookup(sk, addr, family);
1023         if (!key)
1024                 return -ENOENT;
1025         hlist_del_rcu(&key->node);
1026         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1027         kfree_rcu(key, rcu);
1028         md5sig = rcu_dereference_protected(tp->md5sig_info,
1029                                            sock_owned_by_user(sk));
1030         if (hlist_empty(&md5sig->head))
1031                 tcp_free_md5sig_pool();
1032         return 0;
1033 }
1034 EXPORT_SYMBOL(tcp_md5_do_del);
1035
1036 void tcp_clear_md5_list(struct sock *sk)
1037 {
1038         struct tcp_sock *tp = tcp_sk(sk);
1039         struct tcp_md5sig_key *key;
1040         struct hlist_node *pos, *n;
1041         struct tcp_md5sig_info *md5sig;
1042
1043         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1044
1045         if (!hlist_empty(&md5sig->head))
1046                 tcp_free_md5sig_pool();
1047         hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1048                 hlist_del_rcu(&key->node);
1049                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1050                 kfree_rcu(key, rcu);
1051         }
1052 }
1053
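/*
 * Userspace reaches the handler below through setsockopt(TCP_MD5SIG).
 * A minimal sketch of the caller side, assuming the struct tcp_md5sig
 * layout from <linux/tcp.h> (error handling omitted):
 *
 *	struct tcp_md5sig md5 = { 0 };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = peer_addr;
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */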
1054 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1055                                  int optlen)
1056 {
1057         struct tcp_md5sig cmd;
1058         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1059
1060         if (optlen < sizeof(cmd))
1061                 return -EINVAL;
1062
1063         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1064                 return -EFAULT;
1065
1066         if (sin->sin_family != AF_INET)
1067                 return -EINVAL;
1068
1069         if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1070                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1071                                       AF_INET);
1072
1073         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1074                 return -EINVAL;
1075
1076         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1077                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1078                               GFP_KERNEL);
1079 }
1080
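/* Feed the TCP pseudo-header (source address, destination address, zero pad,
 * protocol and segment length) into the running MD5 digest, as RFC 2385
 * requires before the TCP header and payload are hashed.
 */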
1081 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1082                                         __be32 daddr, __be32 saddr, int nbytes)
1083 {
1084         struct tcp4_pseudohdr *bp;
1085         struct scatterlist sg;
1086
1087         bp = &hp->md5_blk.ip4;
1088
1089         /*
1090          * 1. the TCP pseudo-header (in the order: source IP address,
1091          * destination IP address, zero-padded protocol number, and
1092          * segment length)
1093          */
1094         bp->saddr = saddr;
1095         bp->daddr = daddr;
1096         bp->pad = 0;
1097         bp->protocol = IPPROTO_TCP;
1098         bp->len = cpu_to_be16(nbytes);
1099
1100         sg_init_one(&sg, bp, sizeof(*bp));
1101         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1102 }
1103
1104 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1105                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1106 {
1107         struct tcp_md5sig_pool *hp;
1108         struct hash_desc *desc;
1109
1110         hp = tcp_get_md5sig_pool();
1111         if (!hp)
1112                 goto clear_hash_noput;
1113         desc = &hp->md5_desc;
1114
1115         if (crypto_hash_init(desc))
1116                 goto clear_hash;
1117         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1118                 goto clear_hash;
1119         if (tcp_md5_hash_header(hp, th))
1120                 goto clear_hash;
1121         if (tcp_md5_hash_key(hp, key))
1122                 goto clear_hash;
1123         if (crypto_hash_final(desc, md5_hash))
1124                 goto clear_hash;
1125
1126         tcp_put_md5sig_pool();
1127         return 0;
1128
1129 clear_hash:
1130         tcp_put_md5sig_pool();
1131 clear_hash_noput:
1132         memset(md5_hash, 0, 16);
1133         return 1;
1134 }
1135
1136 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1137                         const struct sock *sk, const struct request_sock *req,
1138                         const struct sk_buff *skb)
1139 {
1140         struct tcp_md5sig_pool *hp;
1141         struct hash_desc *desc;
1142         const struct tcphdr *th = tcp_hdr(skb);
1143         __be32 saddr, daddr;
1144
1145         if (sk) {
1146                 saddr = inet_sk(sk)->inet_saddr;
1147                 daddr = inet_sk(sk)->inet_daddr;
1148         } else if (req) {
1149                 saddr = inet_rsk(req)->loc_addr;
1150                 daddr = inet_rsk(req)->rmt_addr;
1151         } else {
1152                 const struct iphdr *iph = ip_hdr(skb);
1153                 saddr = iph->saddr;
1154                 daddr = iph->daddr;
1155         }
1156
1157         hp = tcp_get_md5sig_pool();
1158         if (!hp)
1159                 goto clear_hash_noput;
1160         desc = &hp->md5_desc;
1161
1162         if (crypto_hash_init(desc))
1163                 goto clear_hash;
1164
1165         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1166                 goto clear_hash;
1167         if (tcp_md5_hash_header(hp, th))
1168                 goto clear_hash;
1169         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1170                 goto clear_hash;
1171         if (tcp_md5_hash_key(hp, key))
1172                 goto clear_hash;
1173         if (crypto_hash_final(desc, md5_hash))
1174                 goto clear_hash;
1175
1176         tcp_put_md5sig_pool();
1177         return 0;
1178
1179 clear_hash:
1180         tcp_put_md5sig_pool();
1181 clear_hash_noput:
1182         memset(md5_hash, 0, 16);
1183         return 1;
1184 }
1185 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1186
1187 static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1188 {
1189         /*
1190          * This gets called for each TCP segment that arrives
1191          * so we want to be efficient.
1192          * We have 3 drop cases:
1193          * o No MD5 hash and one expected.
1194          * o MD5 hash and we're not expecting one.
 1195          * o MD5 hash and it's wrong.
1196          */
1197         const __u8 *hash_location = NULL;
1198         struct tcp_md5sig_key *hash_expected;
1199         const struct iphdr *iph = ip_hdr(skb);
1200         const struct tcphdr *th = tcp_hdr(skb);
1201         int genhash;
1202         unsigned char newhash[16];
1203
1204         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1205                                           AF_INET);
1206         hash_location = tcp_parse_md5sig_option(th);
1207
1208         /* We've parsed the options - do we have a hash? */
1209         if (!hash_expected && !hash_location)
1210                 return 0;
1211
1212         if (hash_expected && !hash_location) {
1213                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1214                 return 1;
1215         }
1216
1217         if (!hash_expected && hash_location) {
1218                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1219                 return 1;
1220         }
1221
 1222         /* Okay, so we have both hash_expected and hash_location -
 1223          * we need to calculate the hash and compare.
1224          */
1225         genhash = tcp_v4_md5_hash_skb(newhash,
1226                                       hash_expected,
1227                                       NULL, NULL, skb);
1228
1229         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1230                 if (net_ratelimit()) {
1231                         pr_info("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1232                                 &iph->saddr, ntohs(th->source),
1233                                 &iph->daddr, ntohs(th->dest),
1234                                 genhash ? " tcp_v4_calc_md5_hash failed" : "");
1235                 }
1236                 return 1;
1237         }
1238         return 0;
1239 }
1240
1241 #endif
1242
1243 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1244         .family         =       PF_INET,
1245         .obj_size       =       sizeof(struct tcp_request_sock),
1246         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1247         .send_ack       =       tcp_v4_reqsk_send_ack,
1248         .destructor     =       tcp_v4_reqsk_destructor,
1249         .send_reset     =       tcp_v4_send_reset,
1250         .syn_ack_timeout =      tcp_syn_ack_timeout,
1251 };
1252
1253 #ifdef CONFIG_TCP_MD5SIG
1254 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1255         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1256         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1257 };
1258 #endif
1259
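/* Handle an incoming SYN on a listening socket: allocate a request_sock,
 * parse the options, decide between a real entry in the SYN queue and a
 * syncookie, and answer with a SYN-ACK.
 */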
1260 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1261 {
1262         struct tcp_extend_values tmp_ext;
1263         struct tcp_options_received tmp_opt;
1264         const u8 *hash_location;
1265         struct request_sock *req;
1266         struct inet_request_sock *ireq;
1267         struct tcp_sock *tp = tcp_sk(sk);
1268         struct dst_entry *dst = NULL;
1269         __be32 saddr = ip_hdr(skb)->saddr;
1270         __be32 daddr = ip_hdr(skb)->daddr;
1271         __u32 isn = TCP_SKB_CB(skb)->when;
1272         int want_cookie = 0;
1273
 1274         /* Never answer SYNs sent to broadcast or multicast */
1275         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1276                 goto drop;
1277
 1278         /* TW buckets are converted to open requests without
 1279          * limitation; they conserve resources and the peer is
 1280          * evidently a real one.
1281          */
1282         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1283                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1284                 if (!want_cookie)
1285                         goto drop;
1286         }
1287
 1288         /* Accept backlog is full. If we have already queued enough
 1289          * warm entries in the syn queue, drop the request. It is better than
 1290          * clogging the syn queue with openreqs with exponentially increasing
 1291          * timeouts.
1292          */
1293         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1294                 goto drop;
1295
1296         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1297         if (!req)
1298                 goto drop;
1299
1300 #ifdef CONFIG_TCP_MD5SIG
1301         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1302 #endif
1303
1304         tcp_clear_options(&tmp_opt);
1305         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1306         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1307         tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1308
1309         if (tmp_opt.cookie_plus > 0 &&
1310             tmp_opt.saw_tstamp &&
1311             !tp->rx_opt.cookie_out_never &&
1312             (sysctl_tcp_cookie_size > 0 ||
1313              (tp->cookie_values != NULL &&
1314               tp->cookie_values->cookie_desired > 0))) {
1315                 u8 *c;
1316                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1317                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1318
1319                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1320                         goto drop_and_release;
1321
1322                 /* Secret recipe starts with IP addresses */
1323                 *mess++ ^= (__force u32)daddr;
1324                 *mess++ ^= (__force u32)saddr;
1325
1326                 /* plus variable length Initiator Cookie */
1327                 c = (u8 *)mess;
1328                 while (l-- > 0)
1329                         *c++ ^= *hash_location++;
1330
1331                 want_cookie = 0;        /* not our kind of cookie */
1332                 tmp_ext.cookie_out_never = 0; /* false */
1333                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1334         } else if (!tp->rx_opt.cookie_in_always) {
1335                 /* redundant indications, but ensure initialization. */
1336                 tmp_ext.cookie_out_never = 1; /* true */
1337                 tmp_ext.cookie_plus = 0;
1338         } else {
1339                 goto drop_and_release;
1340         }
1341         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1342
1343         if (want_cookie && !tmp_opt.saw_tstamp)
1344                 tcp_clear_options(&tmp_opt);
1345
1346         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1347         tcp_openreq_init(req, &tmp_opt, skb);
1348
1349         ireq = inet_rsk(req);
1350         ireq->loc_addr = daddr;
1351         ireq->rmt_addr = saddr;
1352         ireq->no_srccheck = inet_sk(sk)->transparent;
1353         ireq->opt = tcp_v4_save_options(sk, skb);
1354
1355         if (security_inet_conn_request(sk, skb, req))
1356                 goto drop_and_free;
1357
1358         if (!want_cookie || tmp_opt.tstamp_ok)
1359                 TCP_ECN_create_request(req, tcp_hdr(skb));
1360
1361         if (want_cookie) {
1362                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1363                 req->cookie_ts = tmp_opt.tstamp_ok;
1364         } else if (!isn) {
1365                 struct inet_peer *peer = NULL;
1366                 struct flowi4 fl4;
1367
 1368                 /* VJ's idea. We save the last timestamp seen
 1369                  * from the destination in the peer table when entering
 1370                  * state TIME-WAIT, and check against it before
 1371                  * accepting a new connection request.
 1372                  *
 1373                  * If "isn" is not zero, this request hit an alive
 1374                  * timewait bucket, so all the necessary checks
 1375                  * are made in the function processing the timewait state.
1376                  */
1377                 if (tmp_opt.saw_tstamp &&
1378                     tcp_death_row.sysctl_tw_recycle &&
1379                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1380                     fl4.daddr == saddr &&
1381                     (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1382                         inet_peer_refcheck(peer);
1383                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1384                             (s32)(peer->tcp_ts - req->ts_recent) >
1385                                                         TCP_PAWS_WINDOW) {
1386                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1387                                 goto drop_and_release;
1388                         }
1389                 }
1390                 /* Kill the following clause, if you dislike this way. */
1391                 else if (!sysctl_tcp_syncookies &&
1392                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1393                           (sysctl_max_syn_backlog >> 2)) &&
1394                          (!peer || !peer->tcp_ts_stamp) &&
1395                          (!dst || !dst_metric(dst, RTAX_RTT))) {
 1396                         /* Without syncookies the last quarter of the
 1397                          * backlog is filled with destinations
 1398                          * proven to be alive.
 1399                          * It means that we continue to communicate
 1400                          * with destinations already remembered
 1401                          * at the moment of the synflood.
1402                          */
1403                         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1404                                        &saddr, ntohs(tcp_hdr(skb)->source));
1405                         goto drop_and_release;
1406                 }
1407
1408                 isn = tcp_v4_init_sequence(skb);
1409         }
1410         tcp_rsk(req)->snt_isn = isn;
1411         tcp_rsk(req)->snt_synack = tcp_time_stamp;
1412
1413         if (tcp_v4_send_synack(sk, dst, req,
1414                                (struct request_values *)&tmp_ext) ||
1415             want_cookie)
1416                 goto drop_and_free;
1417
1418         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1419         return 0;
1420
1421 drop_and_release:
1422         dst_release(dst);
1423 drop_and_free:
1424         reqsk_free(req);
1425 drop:
1426         return 0;
1427 }
1428 EXPORT_SYMBOL(tcp_v4_conn_request);
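
/*
 * Rough summary of the ISN selection above (a sketch only; the code is
 * authoritative):
 *
 *   want_cookie                        -> isn = cookie_v4_init_sequence()
 *   isn == 0 and tw_recycle active     -> PAWS-check the cached peer
 *                                         timestamp, drop on apparent replay
 *   no syncookies, SYN backlog > 3/4
 *   full, peer/route not proven alive  -> drop the request
 *   otherwise                          -> isn = tcp_v4_init_sequence(skb)
 */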
1429
1430
1431 /*
1432  * The three-way handshake has completed - we got a valid final ACK -
1433  * now create the new socket.
1434  */
1435 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1436                                   struct request_sock *req,
1437                                   struct dst_entry *dst)
1438 {
1439         struct inet_request_sock *ireq;
1440         struct inet_sock *newinet;
1441         struct tcp_sock *newtp;
1442         struct sock *newsk;
1443 #ifdef CONFIG_TCP_MD5SIG
1444         struct tcp_md5sig_key *key;
1445 #endif
1446         struct ip_options_rcu *inet_opt;
1447
1448         if (sk_acceptq_is_full(sk))
1449                 goto exit_overflow;
1450
1451         newsk = tcp_create_openreq_child(sk, req, skb);
1452         if (!newsk)
1453                 goto exit_nonewsk;
1454
1455         newsk->sk_gso_type = SKB_GSO_TCPV4;
1456
1457         newtp                 = tcp_sk(newsk);
1458         newinet               = inet_sk(newsk);
1459         ireq                  = inet_rsk(req);
1460         newinet->inet_daddr   = ireq->rmt_addr;
1461         newinet->inet_rcv_saddr = ireq->loc_addr;
1462         newinet->inet_saddr           = ireq->loc_addr;
1463         inet_opt              = ireq->opt;
1464         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1465         ireq->opt             = NULL;
1466         newinet->mc_index     = inet_iif(skb);
1467         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1468         newinet->rcv_tos      = ip_hdr(skb)->tos;
1469         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1470         if (inet_opt)
1471                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1472         newinet->inet_id = newtp->write_seq ^ jiffies;
1473
1474         if (!dst) {
1475                 dst = inet_csk_route_child_sock(sk, newsk, req);
1476                 if (!dst)
1477                         goto put_and_exit;
1478         } else {
1479                 /* syncookie case : see end of cookie_v4_check() */
1480         }
1481         sk_setup_caps(newsk, dst);
1482
1483         tcp_mtup_init(newsk);
1484         tcp_sync_mss(newsk, dst_mtu(dst));
1485         newtp->advmss = dst_metric_advmss(dst);
1486         if (tcp_sk(sk)->rx_opt.user_mss &&
1487             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1488                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1489
1490         tcp_initialize_rcv_mss(newsk);
1491         if (tcp_rsk(req)->snt_synack)
1492                 tcp_valid_rtt_meas(newsk,
1493                     tcp_time_stamp - tcp_rsk(req)->snt_synack);
1494         newtp->total_retrans = req->retrans;
1495
1496 #ifdef CONFIG_TCP_MD5SIG
1497         /* Copy over the MD5 key from the original socket */
1498         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1499                                 AF_INET);
1500         if (key != NULL) {
1501                 /*
1502                  * We're using one, so create a matching key
1503                  * on the newsk structure. If we fail to get
1504                  * memory, then we end up not copying the key
1505                  * across. Shucks.
1506                  */
1507                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1508                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1509                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1510         }
1511 #endif
1512
1513         if (__inet_inherit_port(sk, newsk) < 0)
1514                 goto put_and_exit;
1515         __inet_hash_nolisten(newsk, NULL);
1516
1517         return newsk;
1518
1519 exit_overflow:
1520         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1521 exit_nonewsk:
1522         dst_release(dst);
1523 exit:
1524         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1525         return NULL;
1526 put_and_exit:
1527         inet_csk_prepare_forced_close(newsk);
1528         tcp_done(newsk);
1529         goto exit;
1530 }
1531 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
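
/*
 * In outline, tcp_v4_syn_recv_sock() above does the following (a rough
 * summary, not a substitute for the code):
 *   1. refuse if the accept queue is full;
 *   2. clone the listener into a child via tcp_create_openreq_child();
 *   3. copy addresses and IP options from the request sock into the child;
 *   4. route the child (unless a cached dst was passed in, e.g. syncookies);
 *   5. set up MTU probing, MSS and, if configured, copy the MD5 key;
 *   6. inherit the bound port and hash the child into the established table.
 */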
1532
1533 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1534 {
1535         struct tcphdr *th = tcp_hdr(skb);
1536         const struct iphdr *iph = ip_hdr(skb);
1537         struct sock *nsk;
1538         struct request_sock **prev;
1539         /* Find possible connection requests. */
1540         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1541                                                        iph->saddr, iph->daddr);
1542         if (req)
1543                 return tcp_check_req(sk, skb, req, prev);
1544
1545         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1546                         th->source, iph->daddr, th->dest, inet_iif(skb));
1547
1548         if (nsk) {
1549                 if (nsk->sk_state != TCP_TIME_WAIT) {
1550                         bh_lock_sock(nsk);
1551                         return nsk;
1552                 }
1553                 inet_twsk_put(inet_twsk(nsk));
1554                 return NULL;
1555         }
1556
1557 #ifdef CONFIG_SYN_COOKIES
1558         if (!th->syn)
1559                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1560 #endif
1561         return sk;
1562 }
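
/*
 * Lookup order above, roughly: a pending request sock (SYN_RECV) first,
 * then the established/timewait hash, and finally - for a bare ACK with
 * syncookies enabled - cookie_v4_check() may reconstruct the child socket.
 */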
1563
1564 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1565 {
1566         const struct iphdr *iph = ip_hdr(skb);
1567
1568         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1569                 if (!tcp_v4_check(skb->len, iph->saddr,
1570                                   iph->daddr, skb->csum)) {
1571                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1572                         return 0;
1573                 }
1574         }
1575
1576         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1577                                        skb->len, IPPROTO_TCP, 0);
1578
1579         if (skb->len <= 76) {
1580                 return __skb_checksum_complete(skb);
1581         }
1582         return 0;
1583 }
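
/*
 * The code above relies on the standard TCP pseudo-header checksum, i.e.
 * informally:
 *
 *   checksum = fold(saddr + daddr + IPPROTO_TCP + tcp_len + sum(TCP segment))
 *
 * csum_tcpudp_nofold() seeds skb->csum with the pseudo-header part so the
 * payload can be folded in later (e.g. during the copy to user space),
 * while short packets (<= 76 bytes) are verified immediately via
 * __skb_checksum_complete().
 */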
1584
1585
1586 /* The socket must have its spinlock held when we get
1587  * here.
1588  *
1589  * We have a potential double-lock case here, so even when
1590  * doing backlog processing we use the BH locking scheme.
1591  * This is because we cannot sleep with the original spinlock
1592  * held.
1593  */
1594 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1595 {
1596         struct sock *rsk;
1597 #ifdef CONFIG_TCP_MD5SIG
1598         /*
1599          * We really want to reject the packet as early as possible
1600          * if:
1601          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1602          *  o There is an MD5 option and we're not expecting one
1603          */
1604         if (tcp_v4_inbound_md5_hash(sk, skb))
1605                 goto discard;
1606 #endif
1607
1608         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1609                 sock_rps_save_rxhash(sk, skb);
1610                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1611                         rsk = sk;
1612                         goto reset;
1613                 }
1614                 return 0;
1615         }
1616
1617         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1618                 goto csum_err;
1619
1620         if (sk->sk_state == TCP_LISTEN) {
1621                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1622                 if (!nsk)
1623                         goto discard;
1624
1625                 if (nsk != sk) {
1626                         sock_rps_save_rxhash(nsk, skb);
1627                         if (tcp_child_process(sk, nsk, skb)) {
1628                                 rsk = nsk;
1629                                 goto reset;
1630                         }
1631                         return 0;
1632                 }
1633         } else
1634                 sock_rps_save_rxhash(sk, skb);
1635
1636         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1637                 rsk = sk;
1638                 goto reset;
1639         }
1640         return 0;
1641
1642 reset:
1643         tcp_v4_send_reset(rsk, skb);
1644 discard:
1645         kfree_skb(skb);
1646         /* Be careful here. If this function gets more complicated and
1647          * gcc suffers from register pressure on the x86, sk (in %ebx)
1648          * might be destroyed here. This current version compiles correctly,
1649          * but you have been warned.
1650          */
1651         return 0;
1652
1653 csum_err:
1654         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1655         goto discard;
1656 }
1657 EXPORT_SYMBOL(tcp_v4_do_rcv);
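
/*
 * Dispatch performed by tcp_v4_do_rcv() above, in outline:
 *
 *   ESTABLISHED  -> tcp_rcv_established()  (header-prediction fast path)
 *   LISTEN       -> tcp_v4_hnd_req(), then tcp_child_process() for a child
 *   other states -> tcp_rcv_state_process()
 *
 * A non-zero return from any of these makes us send a RST and drop the skb.
 */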
1658
1659 /*
1660  *      From tcp_input.c
1661  */
1662
1663 int tcp_v4_rcv(struct sk_buff *skb)
1664 {
1665         const struct iphdr *iph;
1666         const struct tcphdr *th;
1667         struct sock *sk;
1668         int ret;
1669         struct net *net = dev_net(skb->dev);
1670
1671         if (skb->pkt_type != PACKET_HOST)
1672                 goto discard_it;
1673
1674         /* Count it even if it's bad */
1675         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1676
1677         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1678                 goto discard_it;
1679
1680         th = tcp_hdr(skb);
1681
1682         if (th->doff < sizeof(struct tcphdr) / 4)
1683                 goto bad_packet;
1684         if (!pskb_may_pull(skb, th->doff * 4))
1685                 goto discard_it;
1686
1687         /* An explanation is required here, I think.
1688          * Packet length and doff are validated by header prediction,
1689          * provided the case of th->doff == 0 has been eliminated.
1690          * So, we defer the checks. */
1691         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1692                 goto bad_packet;
1693
1694         th = tcp_hdr(skb);
1695         iph = ip_hdr(skb);
1696         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1697         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1698                                     skb->len - th->doff * 4);
1699         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1700         TCP_SKB_CB(skb)->when    = 0;
1701         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1702         TCP_SKB_CB(skb)->sacked  = 0;
1703
1704         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1705         if (!sk)
1706                 goto no_tcp_socket;
1707
1708 process:
1709         if (sk->sk_state == TCP_TIME_WAIT)
1710                 goto do_time_wait;
1711
1712         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1713                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1714                 goto discard_and_relse;
1715         }
1716
1717         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1718                 goto discard_and_relse;
1719         nf_reset(skb);
1720
1721         if (sk_filter(sk, skb))
1722                 goto discard_and_relse;
1723
1724         skb->dev = NULL;
1725
1726         bh_lock_sock_nested(sk);
1727         ret = 0;
1728         if (!sock_owned_by_user(sk)) {
1729 #ifdef CONFIG_NET_DMA
1730                 struct tcp_sock *tp = tcp_sk(sk);
1731                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1732                         tp->ucopy.dma_chan = net_dma_find_channel();
1733                 if (tp->ucopy.dma_chan)
1734                         ret = tcp_v4_do_rcv(sk, skb);
1735                 else
1736 #endif
1737                 {
1738                         if (!tcp_prequeue(sk, skb))
1739                                 ret = tcp_v4_do_rcv(sk, skb);
1740                 }
1741         } else if (unlikely(sk_add_backlog(sk, skb))) {
1742                 bh_unlock_sock(sk);
1743                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1744                 goto discard_and_relse;
1745         }
1746         bh_unlock_sock(sk);
1747
1748         sock_put(sk);
1749
1750         return ret;
1751
1752 no_tcp_socket:
1753         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1754                 goto discard_it;
1755
1756         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1757 bad_packet:
1758                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1759         } else {
1760                 tcp_v4_send_reset(NULL, skb);
1761         }
1762
1763 discard_it:
1764         /* Discard frame. */
1765         kfree_skb(skb);
1766         return 0;
1767
1768 discard_and_relse:
1769         sock_put(sk);
1770         goto discard_it;
1771
1772 do_time_wait:
1773         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1774                 inet_twsk_put(inet_twsk(sk));
1775                 goto discard_it;
1776         }
1777
1778         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1779                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1780                 inet_twsk_put(inet_twsk(sk));
1781                 goto discard_it;
1782         }
1783         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1784         case TCP_TW_SYN: {
1785                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1786                                                         &tcp_hashinfo,
1787                                                         iph->daddr, th->dest,
1788                                                         inet_iif(skb));
1789                 if (sk2) {
1790                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1791                         inet_twsk_put(inet_twsk(sk));
1792                         sk = sk2;
1793                         goto process;
1794                 }
1795                 /* Fall through to ACK */
1796         }
1797         case TCP_TW_ACK:
1798                 tcp_v4_timewait_ack(sk, skb);
1799                 break;
1800         case TCP_TW_RST:
1801                 goto no_tcp_socket;
1802         case TCP_TW_SUCCESS:;
1803         }
1804         goto discard_it;
1805 }
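
/*
 * Delivery policy used in tcp_v4_rcv() above, roughly: if the socket is not
 * owned by a process context, the segment is either queued on the prequeue
 * (for a sleeping reader) or handled immediately by tcp_v4_do_rcv(); if the
 * socket is owned, the segment goes onto the backlog and is processed when
 * the owner releases the socket (tcp_v4_do_rcv is also the backlog_rcv
 * handler in tcp_prot below).
 */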
1806
1807 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1808 {
1809         struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1810         struct inet_sock *inet = inet_sk(sk);
1811         struct inet_peer *peer;
1812
1813         if (!rt ||
1814             inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1815                 peer = inet_getpeer_v4(inet->inet_daddr, 1);
1816                 *release_it = true;
1817         } else {
1818                 if (!rt->peer)
1819                         rt_bind_peer(rt, inet->inet_daddr, 1);
1820                 peer = rt->peer;
1821                 *release_it = false;
1822         }
1823
1824         return peer;
1825 }
1826 EXPORT_SYMBOL(tcp_v4_get_peer);
1827
1828 void *tcp_v4_tw_get_peer(struct sock *sk)
1829 {
1830         const struct inet_timewait_sock *tw = inet_twsk(sk);
1831
1832         return inet_getpeer_v4(tw->tw_daddr, 1);
1833 }
1834 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1835
1836 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1837         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1838         .twsk_unique    = tcp_twsk_unique,
1839         .twsk_destructor= tcp_twsk_destructor,
1840         .twsk_getpeer   = tcp_v4_tw_get_peer,
1841 };
1842
1843 const struct inet_connection_sock_af_ops ipv4_specific = {
1844         .queue_xmit        = ip_queue_xmit,
1845         .send_check        = tcp_v4_send_check,
1846         .rebuild_header    = inet_sk_rebuild_header,
1847         .conn_request      = tcp_v4_conn_request,
1848         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1849         .get_peer          = tcp_v4_get_peer,
1850         .net_header_len    = sizeof(struct iphdr),
1851         .setsockopt        = ip_setsockopt,
1852         .getsockopt        = ip_getsockopt,
1853         .addr2sockaddr     = inet_csk_addr2sockaddr,
1854         .sockaddr_len      = sizeof(struct sockaddr_in),
1855         .bind_conflict     = inet_csk_bind_conflict,
1856 #ifdef CONFIG_COMPAT
1857         .compat_setsockopt = compat_ip_setsockopt,
1858         .compat_getsockopt = compat_ip_getsockopt,
1859 #endif
1860 };
1861 EXPORT_SYMBOL(ipv4_specific);
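
/*
 * These ops are reached through inet_csk(sk)->icsk_af_ops, which is how the
 * address-family-independent TCP code (tcp_output.c, tcp_input.c) invokes
 * e.g. the IPv4 queue_xmit or conn_request handler without knowing the
 * family it is running over.
 */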
1862
1863 #ifdef CONFIG_TCP_MD5SIG
1864 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1865         .md5_lookup             = tcp_v4_md5_lookup,
1866         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1867         .md5_parse              = tcp_v4_parse_md5_keys,
1868 };
1869 #endif
1870
1871 /* NOTE: A lot of things are set to zero explicitly by the call to
1872  *       sk_alloc(), so they need not be done here.
1873  */
1874 static int tcp_v4_init_sock(struct sock *sk)
1875 {
1876         struct inet_connection_sock *icsk = inet_csk(sk);
1877         struct tcp_sock *tp = tcp_sk(sk);
1878
1879         skb_queue_head_init(&tp->out_of_order_queue);
1880         tcp_init_xmit_timers(sk);
1881         tcp_prequeue_init(tp);
1882
1883         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1884         tp->mdev = TCP_TIMEOUT_INIT;
1885
1886         /* So many TCP implementations out there (incorrectly) count the
1887          * initial SYN frame in their delayed-ACK and congestion control
1888          * algorithms that we must have the following bandaid to talk
1889          * efficiently to them.  -DaveM
1890          */
1891         tp->snd_cwnd = TCP_INIT_CWND;
1892
1893         /* See draft-stevens-tcpca-spec-01 for discussion of the
1894          * initialization of these values.
1895          */
1896         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1897         tp->snd_cwnd_clamp = ~0;
1898         tp->mss_cache = TCP_MSS_DEFAULT;
1899
1900         tp->reordering = sysctl_tcp_reordering;
1901         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1902
1903         sk->sk_state = TCP_CLOSE;
1904
1905         sk->sk_write_space = sk_stream_write_space;
1906         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1907
1908         icsk->icsk_af_ops = &ipv4_specific;
1909         icsk->icsk_sync_mss = tcp_sync_mss;
1910 #ifdef CONFIG_TCP_MD5SIG
1911         tp->af_specific = &tcp_sock_ipv4_specific;
1912 #endif
1913
1914         /* TCP Cookie Transactions */
1915         if (sysctl_tcp_cookie_size > 0) {
1916                 /* Default, cookies without s_data_payload. */
1917                 tp->cookie_values =
1918                         kzalloc(sizeof(*tp->cookie_values),
1919                                 sk->sk_allocation);
1920                 if (tp->cookie_values != NULL)
1921                         kref_init(&tp->cookie_values->kref);
1922         }
1923         /* Presumed zeroed, in order of appearance:
1924          *      cookie_in_always, cookie_out_never,
1925          *      s_data_constant, s_data_in, s_data_out
1926          */
1927         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1928         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1929
1930         local_bh_disable();
1931         sock_update_memcg(sk);
1932         sk_sockets_allocated_inc(sk);
1933         local_bh_enable();
1934
1935         return 0;
1936 }
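
/*
 * For orientation only: with the defaults of this era the values set above
 * come to roughly RTO = TCP_TIMEOUT_INIT (1 second, per RFC 6298),
 * snd_cwnd = TCP_INIT_CWND (10 segments) and mss_cache = TCP_MSS_DEFAULT
 * (536 bytes) until real path information is learned; the actual constants
 * live in include/net/tcp.h.
 */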
1937
1938 void tcp_v4_destroy_sock(struct sock *sk)
1939 {
1940         struct tcp_sock *tp = tcp_sk(sk);
1941
1942         tcp_clear_xmit_timers(sk);
1943
1944         tcp_cleanup_congestion_control(sk);
1945
1946         /* Clean up the write buffer. */
1947         tcp_write_queue_purge(sk);
1948
1949         /* Cleans up our, hopefully empty, out_of_order_queue. */
1950         __skb_queue_purge(&tp->out_of_order_queue);
1951
1952 #ifdef CONFIG_TCP_MD5SIG
1953         /* Clean up the MD5 key list, if any */
1954         if (tp->md5sig_info) {
1955                 tcp_clear_md5_list(sk);
1956                 kfree_rcu(tp->md5sig_info, rcu);
1957                 tp->md5sig_info = NULL;
1958         }
1959 #endif
1960
1961 #ifdef CONFIG_NET_DMA
1962         /* Cleans up our sk_async_wait_queue */
1963         __skb_queue_purge(&sk->sk_async_wait_queue);
1964 #endif
1965
1966         /* Clean up the prequeue; it really should be empty already. */
1967         __skb_queue_purge(&tp->ucopy.prequeue);
1968
1969         /* Clean up a referenced TCP bind bucket. */
1970         if (inet_csk(sk)->icsk_bind_hash)
1971                 inet_put_port(sk);
1972
1973         /*
1974          * If sendmsg cached page exists, toss it.
1975          */
1976         if (sk->sk_sndmsg_page) {
1977                 __free_page(sk->sk_sndmsg_page);
1978                 sk->sk_sndmsg_page = NULL;
1979         }
1980
1981         /* TCP Cookie Transactions */
1982         if (tp->cookie_values != NULL) {
1983                 kref_put(&tp->cookie_values->kref,
1984                          tcp_cookie_values_release);
1985                 tp->cookie_values = NULL;
1986         }
1987
1988         sk_sockets_allocated_dec(sk);
1989         sock_release_memcg(sk);
1990 }
1991 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1992
1993 #ifdef CONFIG_PROC_FS
1994 /* Proc filesystem TCP sock list dumping. */
1995
1996 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1997 {
1998         return hlist_nulls_empty(head) ? NULL :
1999                 list_entry(head->first, struct inet_timewait_sock, tw_node);
2000 }
2001
2002 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2003 {
2004         return !is_a_nulls(tw->tw_node.next) ?
2005                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2006 }
2007
2008 /*
2009  * Get the next listener socket following cur.  If cur is NULL, get the first
2010  * socket starting from the bucket given in st->bucket; when st->bucket is zero the
2011  * very first socket in the hash table is returned.
2012  */
2013 static void *listening_get_next(struct seq_file *seq, void *cur)
2014 {
2015         struct inet_connection_sock *icsk;
2016         struct hlist_nulls_node *node;
2017         struct sock *sk = cur;
2018         struct inet_listen_hashbucket *ilb;
2019         struct tcp_iter_state *st = seq->private;
2020         struct net *net = seq_file_net(seq);
2021
2022         if (!sk) {
2023                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2024                 spin_lock_bh(&ilb->lock);
2025                 sk = sk_nulls_head(&ilb->head);
2026                 st->offset = 0;
2027                 goto get_sk;
2028         }
2029         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2030         ++st->num;
2031         ++st->offset;
2032
2033         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2034                 struct request_sock *req = cur;
2035
2036                 icsk = inet_csk(st->syn_wait_sk);
2037                 req = req->dl_next;
2038                 while (1) {
2039                         while (req) {
2040                                 if (req->rsk_ops->family == st->family) {
2041                                         cur = req;
2042                                         goto out;
2043                                 }
2044                                 req = req->dl_next;
2045                         }
2046                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2047                                 break;
2048 get_req:
2049                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2050                 }
2051                 sk        = sk_nulls_next(st->syn_wait_sk);
2052                 st->state = TCP_SEQ_STATE_LISTENING;
2053                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2054         } else {
2055                 icsk = inet_csk(sk);
2056                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2057                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2058                         goto start_req;
2059                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2060                 sk = sk_nulls_next(sk);
2061         }
2062 get_sk:
2063         sk_nulls_for_each_from(sk, node) {
2064                 if (!net_eq(sock_net(sk), net))
2065                         continue;
2066                 if (sk->sk_family == st->family) {
2067                         cur = sk;
2068                         goto out;
2069                 }
2070                 icsk = inet_csk(sk);
2071                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2072                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2073 start_req:
2074                         st->uid         = sock_i_uid(sk);
2075                         st->syn_wait_sk = sk;
2076                         st->state       = TCP_SEQ_STATE_OPENREQ;
2077                         st->sbucket     = 0;
2078                         goto get_req;
2079                 }
2080                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2081         }
2082         spin_unlock_bh(&ilb->lock);
2083         st->offset = 0;
2084         if (++st->bucket < INET_LHTABLE_SIZE) {
2085                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2086                 spin_lock_bh(&ilb->lock);
2087                 sk = sk_nulls_head(&ilb->head);
2088                 goto get_sk;
2089         }
2090         cur = NULL;
2091 out:
2092         return cur;
2093 }
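
/*
 * The iteration above is three-level, informally: for each listening hash
 * bucket, walk the listening sockets; for each listening socket with
 * pending connection requests, also walk its SYN table (the
 * TCP_SEQ_STATE_OPENREQ state) before moving on to the next socket/bucket.
 */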
2094
2095 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2096 {
2097         struct tcp_iter_state *st = seq->private;
2098         void *rc;
2099
2100         st->bucket = 0;
2101         st->offset = 0;
2102         rc = listening_get_next(seq, NULL);
2103
2104         while (rc && *pos) {
2105                 rc = listening_get_next(seq, rc);
2106                 --*pos;
2107         }
2108         return rc;
2109 }
2110
2111 static inline int empty_bucket(struct tcp_iter_state *st)
2112 {
2113         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2114                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2115 }
2116
2117 /*
2118  * Get the first established socket, starting from the bucket given in st->bucket.
2119  * If st->bucket is zero, the very first socket in the hash is returned.
2120  */
2121 static void *established_get_first(struct seq_file *seq)
2122 {
2123         struct tcp_iter_state *st = seq->private;
2124         struct net *net = seq_file_net(seq);
2125         void *rc = NULL;
2126
2127         st->offset = 0;
2128         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2129                 struct sock *sk;
2130                 struct hlist_nulls_node *node;
2131                 struct inet_timewait_sock *tw;
2132                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2133
2134                 /* Lockless fast path for the common case of empty buckets */
2135                 if (empty_bucket(st))
2136                         continue;
2137
2138                 spin_lock_bh(lock);
2139                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2140                         if (sk->sk_family != st->family ||
2141                             !net_eq(sock_net(sk), net)) {
2142                                 continue;
2143                         }
2144                         rc = sk;
2145                         goto out;
2146                 }
2147                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2148                 inet_twsk_for_each(tw, node,
2149                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2150                         if (tw->tw_family != st->family ||
2151                             !net_eq(twsk_net(tw), net)) {
2152                                 continue;
2153                         }
2154                         rc = tw;
2155                         goto out;
2156                 }
2157                 spin_unlock_bh(lock);
2158                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2159         }
2160 out:
2161         return rc;
2162 }
2163
2164 static void *established_get_next(struct seq_file *seq, void *cur)
2165 {
2166         struct sock *sk = cur;
2167         struct inet_timewait_sock *tw;
2168         struct hlist_nulls_node *node;
2169         struct tcp_iter_state *st = seq->private;
2170         struct net *net = seq_file_net(seq);
2171
2172         ++st->num;
2173         ++st->offset;
2174
2175         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2176                 tw = cur;
2177                 tw = tw_next(tw);
2178 get_tw:
2179                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2180                         tw = tw_next(tw);
2181                 }
2182                 if (tw) {
2183                         cur = tw;
2184                         goto out;
2185                 }
2186                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2187                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2188
2189                 /* Look for the next non-empty bucket */
2190                 st->offset = 0;
2191                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2192                                 empty_bucket(st))
2193                         ;
2194                 if (st->bucket > tcp_hashinfo.ehash_mask)
2195                         return NULL;
2196
2197                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2198                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2199         } else
2200                 sk = sk_nulls_next(sk);
2201
2202         sk_nulls_for_each_from(sk, node) {
2203                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2204                         goto found;
2205         }
2206
2207         st->state = TCP_SEQ_STATE_TIME_WAIT;
2208         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2209         goto get_tw;
2210 found:
2211         cur = sk;
2212 out:
2213         return cur;
2214 }
2215
2216 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2217 {
2218         struct tcp_iter_state *st = seq->private;
2219         void *rc;
2220
2221         st->bucket = 0;
2222         rc = established_get_first(seq);
2223
2224         while (rc && pos) {
2225                 rc = established_get_next(seq, rc);
2226                 --pos;
2227         }
2228         return rc;
2229 }
2230
2231 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2232 {
2233         void *rc;
2234         struct tcp_iter_state *st = seq->private;
2235
2236         st->state = TCP_SEQ_STATE_LISTENING;
2237         rc        = listening_get_idx(seq, &pos);
2238
2239         if (!rc) {
2240                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2241                 rc        = established_get_idx(seq, pos);
2242         }
2243
2244         return rc;
2245 }
2246
2247 static void *tcp_seek_last_pos(struct seq_file *seq)
2248 {
2249         struct tcp_iter_state *st = seq->private;
2250         int offset = st->offset;
2251         int orig_num = st->num;
2252         void *rc = NULL;
2253
2254         switch (st->state) {
2255         case TCP_SEQ_STATE_OPENREQ:
2256         case TCP_SEQ_STATE_LISTENING:
2257                 if (st->bucket >= INET_LHTABLE_SIZE)
2258                         break;
2259                 st->state = TCP_SEQ_STATE_LISTENING;
2260                 rc = listening_get_next(seq, NULL);
2261                 while (offset-- && rc)
2262                         rc = listening_get_next(seq, rc);
2263                 if (rc)
2264                         break;
2265                 st->bucket = 0;
2266                 /* Fallthrough */
2267         case TCP_SEQ_STATE_ESTABLISHED:
2268         case TCP_SEQ_STATE_TIME_WAIT:
2269                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2270                 if (st->bucket > tcp_hashinfo.ehash_mask)
2271                         break;
2272                 rc = established_get_first(seq);
2273                 while (offset-- && rc)
2274                         rc = established_get_next(seq, rc);
2275         }
2276
2277         st->num = orig_num;
2278
2279         return rc;
2280 }
2281
2282 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2283 {
2284         struct tcp_iter_state *st = seq->private;
2285         void *rc;
2286
2287         if (*pos && *pos == st->last_pos) {
2288                 rc = tcp_seek_last_pos(seq);
2289                 if (rc)
2290                         goto out;
2291         }
2292
2293         st->state = TCP_SEQ_STATE_LISTENING;
2294         st->num = 0;
2295         st->bucket = 0;
2296         st->offset = 0;
2297         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2298
2299 out:
2300         st->last_pos = *pos;
2301         return rc;
2302 }
2303
2304 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2305 {
2306         struct tcp_iter_state *st = seq->private;
2307         void *rc = NULL;
2308
2309         if (v == SEQ_START_TOKEN) {
2310                 rc = tcp_get_idx(seq, 0);
2311                 goto out;
2312         }
2313
2314         switch (st->state) {
2315         case TCP_SEQ_STATE_OPENREQ:
2316         case TCP_SEQ_STATE_LISTENING:
2317                 rc = listening_get_next(seq, v);
2318                 if (!rc) {
2319                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2320                         st->bucket = 0;
2321                         st->offset = 0;
2322                         rc        = established_get_first(seq);
2323                 }
2324                 break;
2325         case TCP_SEQ_STATE_ESTABLISHED:
2326         case TCP_SEQ_STATE_TIME_WAIT:
2327                 rc = established_get_next(seq, v);
2328                 break;
2329         }
2330 out:
2331         ++*pos;
2332         st->last_pos = *pos;
2333         return rc;
2334 }
2335
2336 static void tcp_seq_stop(struct seq_file *seq, void *v)
2337 {
2338         struct tcp_iter_state *st = seq->private;
2339
2340         switch (st->state) {
2341         case TCP_SEQ_STATE_OPENREQ:
2342                 if (v) {
2343                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2344                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2345                 }
2346         case TCP_SEQ_STATE_LISTENING:
2347                 if (v != SEQ_START_TOKEN)
2348                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2349                 break;
2350         case TCP_SEQ_STATE_TIME_WAIT:
2351         case TCP_SEQ_STATE_ESTABLISHED:
2352                 if (v)
2353                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2354                 break;
2355         }
2356 }
2357
2358 int tcp_seq_open(struct inode *inode, struct file *file)
2359 {
2360         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2361         struct tcp_iter_state *s;
2362         int err;
2363
2364         err = seq_open_net(inode, file, &afinfo->seq_ops,
2365                           sizeof(struct tcp_iter_state));
2366         if (err < 0)
2367                 return err;
2368
2369         s = ((struct seq_file *)file->private_data)->private;
2370         s->family               = afinfo->family;
2371         s->last_pos             = 0;
2372         return 0;
2373 }
2374 EXPORT_SYMBOL(tcp_seq_open);
2375
2376 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2377 {
2378         int rc = 0;
2379         struct proc_dir_entry *p;
2380
2381         afinfo->seq_ops.start           = tcp_seq_start;
2382         afinfo->seq_ops.next            = tcp_seq_next;
2383         afinfo->seq_ops.stop            = tcp_seq_stop;
2384
2385         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2386                              afinfo->seq_fops, afinfo);
2387         if (!p)
2388                 rc = -ENOMEM;
2389         return rc;
2390 }
2391 EXPORT_SYMBOL(tcp_proc_register);
2392
2393 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2394 {
2395         proc_net_remove(net, afinfo->name);
2396 }
2397 EXPORT_SYMBOL(tcp_proc_unregister);
2398
2399 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2400                          struct seq_file *f, int i, int uid, int *len)
2401 {
2402         const struct inet_request_sock *ireq = inet_rsk(req);
2403         int ttd = req->expires - jiffies;
2404
2405         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2406                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2407                 i,
2408                 ireq->loc_addr,
2409                 ntohs(inet_sk(sk)->inet_sport),
2410                 ireq->rmt_addr,
2411                 ntohs(ireq->rmt_port),
2412                 TCP_SYN_RECV,
2413                 0, 0, /* could print option size, but that is af dependent. */
2414                 1,    /* timers active (only the expire timer) */
2415                 jiffies_to_clock_t(ttd),
2416                 req->retrans,
2417                 uid,
2418                 0,  /* non standard timer */
2419                 0, /* open_requests have no inode */
2420                 atomic_read(&sk->sk_refcnt),
2421                 req,
2422                 len);
2423 }
2424
2425 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2426 {
2427         int timer_active;
2428         unsigned long timer_expires;
2429         const struct tcp_sock *tp = tcp_sk(sk);
2430         const struct inet_connection_sock *icsk = inet_csk(sk);
2431         const struct inet_sock *inet = inet_sk(sk);
2432         __be32 dest = inet->inet_daddr;
2433         __be32 src = inet->inet_rcv_saddr;
2434         __u16 destp = ntohs(inet->inet_dport);
2435         __u16 srcp = ntohs(inet->inet_sport);
2436         int rx_queue;
2437
2438         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2439                 timer_active    = 1;
2440                 timer_expires   = icsk->icsk_timeout;
2441         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2442                 timer_active    = 4;
2443                 timer_expires   = icsk->icsk_timeout;
2444         } else if (timer_pending(&sk->sk_timer)) {
2445                 timer_active    = 2;
2446                 timer_expires   = sk->sk_timer.expires;
2447         } else {
2448                 timer_active    = 0;
2449                 timer_expires = jiffies;
2450         }
2451
2452         if (sk->sk_state == TCP_LISTEN)
2453                 rx_queue = sk->sk_ack_backlog;
2454         else
2455                 /*
2456                  * because we don't lock the socket, we might find a transient negative value
2457                  */
2458                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2459
2460         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2461                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2462                 i, src, srcp, dest, destp, sk->sk_state,
2463                 tp->write_seq - tp->snd_una,
2464                 rx_queue,
2465                 timer_active,
2466                 jiffies_to_clock_t(timer_expires - jiffies),
2467                 icsk->icsk_retransmits,
2468                 sock_i_uid(sk),
2469                 icsk->icsk_probes_out,
2470                 sock_i_ino(sk),
2471                 atomic_read(&sk->sk_refcnt), sk,
2472                 jiffies_to_clock_t(icsk->icsk_rto),
2473                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2474                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2475                 tp->snd_cwnd,
2476                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2477                 len);
2478 }
2479
2480 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2481                                struct seq_file *f, int i, int *len)
2482 {
2483         __be32 dest, src;
2484         __u16 destp, srcp;
2485         int ttd = tw->tw_ttd - jiffies;
2486
2487         if (ttd < 0)
2488                 ttd = 0;
2489
2490         dest  = tw->tw_daddr;
2491         src   = tw->tw_rcv_saddr;
2492         destp = ntohs(tw->tw_dport);
2493         srcp  = ntohs(tw->tw_sport);
2494
2495         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2496                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2497                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2498                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2499                 atomic_read(&tw->tw_refcnt), tw, len);
2500 }
2501
2502 #define TMPSZ 150
2503
2504 static int tcp4_seq_show(struct seq_file *seq, void *v)
2505 {
2506         struct tcp_iter_state *st;
2507         int len;
2508
2509         if (v == SEQ_START_TOKEN) {
2510                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2511                            "  sl  local_address rem_address   st tx_queue "
2512                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2513                            "inode");
2514                 goto out;
2515         }
2516         st = seq->private;
2517
2518         switch (st->state) {
2519         case TCP_SEQ_STATE_LISTENING:
2520         case TCP_SEQ_STATE_ESTABLISHED:
2521                 get_tcp4_sock(v, seq, st->num, &len);
2522                 break;
2523         case TCP_SEQ_STATE_OPENREQ:
2524                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2525                 break;
2526         case TCP_SEQ_STATE_TIME_WAIT:
2527                 get_timewait4_sock(v, seq, st->num, &len);
2528                 break;
2529         }
2530         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2531 out:
2532         return 0;
2533 }
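
/*
 * For reference, a /proc/net/tcp line emitted above looks roughly like
 * (illustrative values only):
 *
 *   0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000  0 0 4711 ...
 *
 * i.e. addresses and ports as raw hex (0100007F:0016 reads as 127.0.0.1:22
 * on a little-endian host), then state (0A = TCP_LISTEN), tx/rx queue
 * sizes, timer info, retransmit count, uid, timeout and inode, padded out
 * to TMPSZ columns.
 */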
2534
2535 static const struct file_operations tcp_afinfo_seq_fops = {
2536         .owner   = THIS_MODULE,
2537         .open    = tcp_seq_open,
2538         .read    = seq_read,
2539         .llseek  = seq_lseek,
2540         .release = seq_release_net
2541 };
2542
2543 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2544         .name           = "tcp",
2545         .family         = AF_INET,
2546         .seq_fops       = &tcp_afinfo_seq_fops,
2547         .seq_ops        = {
2548                 .show           = tcp4_seq_show,
2549         },
2550 };
2551
2552 static int __net_init tcp4_proc_init_net(struct net *net)
2553 {
2554         return tcp_proc_register(net, &tcp4_seq_afinfo);
2555 }
2556
2557 static void __net_exit tcp4_proc_exit_net(struct net *net)
2558 {
2559         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2560 }
2561
2562 static struct pernet_operations tcp4_net_ops = {
2563         .init = tcp4_proc_init_net,
2564         .exit = tcp4_proc_exit_net,
2565 };
2566
2567 int __init tcp4_proc_init(void)
2568 {
2569         return register_pernet_subsys(&tcp4_net_ops);
2570 }
2571
2572 void tcp4_proc_exit(void)
2573 {
2574         unregister_pernet_subsys(&tcp4_net_ops);
2575 }
2576 #endif /* CONFIG_PROC_FS */
2577
2578 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2579 {
2580         const struct iphdr *iph = skb_gro_network_header(skb);
2581
2582         switch (skb->ip_summed) {
2583         case CHECKSUM_COMPLETE:
2584                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2585                                   skb->csum)) {
2586                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2587                         break;
2588                 }
2589
2590                 /* fall through */
2591         case CHECKSUM_NONE:
2592                 NAPI_GRO_CB(skb)->flush = 1;
2593                 return NULL;
2594         }
2595
2596         return tcp_gro_receive(head, skb);
2597 }
2598
2599 int tcp4_gro_complete(struct sk_buff *skb)
2600 {
2601         const struct iphdr *iph = ip_hdr(skb);
2602         struct tcphdr *th = tcp_hdr(skb);
2603
2604         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2605                                   iph->saddr, iph->daddr, 0);
2606         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2607
2608         return tcp_gro_complete(skb);
2609 }
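
/*
 * Note on the GRO completion above, roughly speaking: the merged
 * super-packet is handed up with th->check holding only the pseudo-header
 * sum and gso_type filled in, so checksumming and re-segmentation of the
 * aggregate can be finished later, following the usual GSO/TSO convention.
 */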
2610
2611 struct proto tcp_prot = {
2612         .name                   = "TCP",
2613         .owner                  = THIS_MODULE,
2614         .close                  = tcp_close,
2615         .connect                = tcp_v4_connect,
2616         .disconnect             = tcp_disconnect,
2617         .accept                 = inet_csk_accept,
2618         .ioctl                  = tcp_ioctl,
2619         .init                   = tcp_v4_init_sock,
2620         .destroy                = tcp_v4_destroy_sock,
2621         .shutdown               = tcp_shutdown,
2622         .setsockopt             = tcp_setsockopt,
2623         .getsockopt             = tcp_getsockopt,
2624         .recvmsg                = tcp_recvmsg,
2625         .sendmsg                = tcp_sendmsg,
2626         .sendpage               = tcp_sendpage,
2627         .backlog_rcv            = tcp_v4_do_rcv,
2628         .hash                   = inet_hash,
2629         .unhash                 = inet_unhash,
2630         .get_port               = inet_csk_get_port,
2631         .enter_memory_pressure  = tcp_enter_memory_pressure,
2632         .sockets_allocated      = &tcp_sockets_allocated,
2633         .orphan_count           = &tcp_orphan_count,
2634         .memory_allocated       = &tcp_memory_allocated,
2635         .memory_pressure        = &tcp_memory_pressure,
2636         .sysctl_wmem            = sysctl_tcp_wmem,
2637         .sysctl_rmem            = sysctl_tcp_rmem,
2638         .max_header             = MAX_TCP_HEADER,
2639         .obj_size               = sizeof(struct tcp_sock),
2640         .slab_flags             = SLAB_DESTROY_BY_RCU,
2641         .twsk_prot              = &tcp_timewait_sock_ops,
2642         .rsk_prot               = &tcp_request_sock_ops,
2643         .h.hashinfo             = &tcp_hashinfo,
2644         .no_autobind            = true,
2645 #ifdef CONFIG_COMPAT
2646         .compat_setsockopt      = compat_tcp_setsockopt,
2647         .compat_getsockopt      = compat_tcp_getsockopt,
2648 #endif
2649 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2650         .init_cgroup            = tcp_init_cgroup,
2651         .destroy_cgroup         = tcp_destroy_cgroup,
2652         .proto_cgroup           = tcp_proto_cgroup,
2653 #endif
2654 };
2655 EXPORT_SYMBOL(tcp_prot);
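
/*
 * tcp_prot is wired into the socket layer elsewhere (inet_init() in
 * net/ipv4/af_inet.c registers it via proto_register() and the inetsw
 * tables), at which point the callbacks above become the implementation
 * behind SOCK_STREAM/IPPROTO_TCP sockets.
 */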
2656
2657 static int __net_init tcp_sk_init(struct net *net)
2658 {
2659         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2660                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2661 }
2662
2663 static void __net_exit tcp_sk_exit(struct net *net)
2664 {
2665         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2666 }
2667
2668 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2669 {
2670         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2671 }
2672
2673 static struct pernet_operations __net_initdata tcp_sk_ops = {
2674        .init       = tcp_sk_init,
2675        .exit       = tcp_sk_exit,
2676        .exit_batch = tcp_sk_exit_batch,
2677 };
2678
2679 void __init tcp_v4_init(void)
2680 {
2681         inet_hashinfo_init(&tcp_hashinfo);
2682         if (register_pernet_subsys(&tcp_sk_ops))
2683                 panic("Failed to create the TCP control socket.\n");
2684 }