[linux-3.10.git] net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  *
52  * Copyright (c) 2013-2014, NVIDIA CORPORATION.  All rights reserved.
53  */
54
55 #define pr_fmt(fmt) "TCP: " fmt
56
57 #include <linux/bottom_half.h>
58 #include <linux/types.h>
59 #include <linux/fcntl.h>
60 #include <linux/module.h>
61 #include <linux/random.h>
62 #include <linux/cache.h>
63 #include <linux/jhash.h>
64 #include <linux/init.h>
65 #include <linux/times.h>
66 #include <linux/slab.h>
67
68 #include <net/net_namespace.h>
69 #include <net/icmp.h>
70 #include <net/inet_hashtables.h>
71 #include <net/tcp.h>
72 #include <net/transp_v6.h>
73 #include <net/ipv6.h>
74 #include <net/inet_common.h>
75 #include <net/timewait_sock.h>
76 #include <net/xfrm.h>
77 #include <net/netdma.h>
78 #include <net/secure_seq.h>
79 #include <net/tcp_memcontrol.h>
80
81 #include <linux/inet.h>
82 #include <linux/ipv6.h>
83 #include <linux/stddef.h>
84 #include <linux/proc_fs.h>
85 #include <linux/seq_file.h>
86
87 #include <linux/crypto.h>
88 #include <linux/scatterlist.h>
89
90 int sysctl_tcp_tw_reuse __read_mostly;
91 int sysctl_tcp_low_latency __read_mostly;
92 EXPORT_SYMBOL(sysctl_tcp_low_latency);
93
94
95 #ifdef CONFIG_TCP_MD5SIG
96 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
97                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
98 #endif
99
100 struct inet_hashinfo tcp_hashinfo;
101 EXPORT_SYMBOL(tcp_hashinfo);
102
103 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
104 {
105         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
106                                           ip_hdr(skb)->saddr,
107                                           tcp_hdr(skb)->dest,
108                                           tcp_hdr(skb)->source);
109 }
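/*
 * Background note (descriptive, for orientation only): the initial sequence
 * number above is derived from the connection 4-tuple (local and remote
 * address/port) mixed with a boot-time secret and a coarse clock component
 * inside secure_tcp_sequence_number() (see net/core/secure_seq.c), in the
 * spirit of RFC 6528, so ISNs are hard for off-path attackers to predict
 * while still advancing over time for a given 4-tuple.
 */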
110
111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112 {
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         /* With PAWS, it is safe from the viewpoint
117            of data integrity. Even without PAWS it is safe provided sequence
118            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
119
120            Actually, the idea is close to VJ's, except that the timestamp cache
121            is held not per host but per port pair, and the TW bucket is used as
122            the state holder.
123
124            If the TW bucket has already been destroyed we fall back to VJ's
125            scheme and use the initial timestamp retrieved from the peer table.
126          */
127         if (tcptw->tw_ts_recent_stamp &&
128             (twp == NULL || (sysctl_tcp_tw_reuse &&
129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131                 if (tp->write_seq == 0)
132                         tp->write_seq = 1;
133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135                 sock_hold(sktw);
136                 return 1;
137         }
138
139         return 0;
140 }
141 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
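/*
 * Illustrative note: sysctl_tcp_tw_reuse above is the knob consulted by
 * tcp_twsk_unique(); it is exposed to userspace as net.ipv4.tcp_tw_reuse
 * (i.e. /proc/sys/net/ipv4/tcp_tw_reuse). A sketch of enabling it, assuming
 * an administrative shell:
 *
 *     # sysctl -w net.ipv4.tcp_tw_reuse=1
 *
 * With it set, an outgoing connect() may reuse a port pair still in
 * TIME-WAIT once the timestamp check above shows the last segment from the
 * old incarnation is at least about one second old.
 */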
142
143 /* This will initiate an outgoing connection. */
144 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
145 {
146         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
147         struct inet_sock *inet = inet_sk(sk);
148         struct tcp_sock *tp = tcp_sk(sk);
149         __be16 orig_sport, orig_dport;
150         __be32 daddr, nexthop;
151         struct flowi4 *fl4;
152         struct rtable *rt;
153         int err;
154         struct ip_options_rcu *inet_opt;
155
156         if (addr_len < sizeof(struct sockaddr_in))
157                 return -EINVAL;
158
159         if (usin->sin_family != AF_INET)
160                 return -EAFNOSUPPORT;
161
162         nexthop = daddr = usin->sin_addr.s_addr;
163         inet_opt = rcu_dereference_protected(inet->inet_opt,
164                                              sock_owned_by_user(sk));
165         if (inet_opt && inet_opt->opt.srr) {
166                 if (!daddr)
167                         return -EINVAL;
168                 nexthop = inet_opt->opt.faddr;
169         }
170
171         orig_sport = inet->inet_sport;
172         orig_dport = usin->sin_port;
173         fl4 = &inet->cork.fl.u.ip4;
174         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
175                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
176                               IPPROTO_TCP,
177                               orig_sport, orig_dport, sk, true);
178         if (IS_ERR(rt)) {
179                 err = PTR_ERR(rt);
180                 if (err == -ENETUNREACH)
181                         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
182                 return err;
183         }
184
185         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
186                 ip_rt_put(rt);
187                 return -ENETUNREACH;
188         }
189
190         if (!inet_opt || !inet_opt->opt.srr)
191                 daddr = fl4->daddr;
192
193         if (!inet->inet_saddr)
194                 inet->inet_saddr = fl4->saddr;
195         inet->inet_rcv_saddr = inet->inet_saddr;
196
197         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
198                 /* Reset inherited state */
199                 tp->rx_opt.ts_recent       = 0;
200                 tp->rx_opt.ts_recent_stamp = 0;
201                 if (likely(!tp->repair))
202                         tp->write_seq      = 0;
203         }
204
205         if (tcp_death_row.sysctl_tw_recycle &&
206             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
207                 tcp_fetch_timewait_stamp(sk, &rt->dst);
208
209         inet->inet_dport = usin->sin_port;
210         inet->inet_daddr = daddr;
211
212         inet_csk(sk)->icsk_ext_hdr_len = 0;
213         if (inet_opt)
214                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
215
216         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
217
218         /* Socket identity is still unknown (sport may be zero).
219          * However we set the state to SYN-SENT and, without releasing the
220          * socket lock, select a source port, enter ourselves into the hash
221          * tables and complete initialization after this.
222          */
223         tcp_set_state(sk, TCP_SYN_SENT);
224         err = inet_hash_connect(&tcp_death_row, sk);
225         if (err)
226                 goto failure;
227
228         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
229                                inet->inet_sport, inet->inet_dport, sk);
230         if (IS_ERR(rt)) {
231                 err = PTR_ERR(rt);
232                 rt = NULL;
233                 goto failure;
234         }
235         /* OK, now commit destination to socket.  */
236         sk->sk_gso_type = SKB_GSO_TCPV4;
237         sk_setup_caps(sk, &rt->dst);
238
239         if (!tp->write_seq && likely(!tp->repair))
240                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
241                                                            inet->inet_daddr,
242                                                            inet->inet_sport,
243                                                            usin->sin_port);
244
245         inet->inet_id = tp->write_seq ^ jiffies;
246
247         err = tcp_connect(sk);
248
249         rt = NULL;
250         if (err)
251                 goto failure;
252
253         return 0;
254
255 failure:
256         /*
257          * This unhashes the socket and releases the local port,
258          * if necessary.
259          */
260         tcp_set_state(sk, TCP_CLOSE);
261         ip_rt_put(rt);
262         sk->sk_route_caps = 0;
263         inet->inet_dport = 0;
264         return err;
265 }
266 EXPORT_SYMBOL(tcp_v4_connect);
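/*
 * Illustrative sketch (typical userspace usage, for reference only):
 * tcp_v4_connect() is reached from connect(2) on an AF_INET stream socket,
 * roughly:
 *
 *     int fd = socket(AF_INET, SOCK_STREAM, 0);
 *     struct sockaddr_in dst = {
 *             .sin_family = AF_INET,
 *             .sin_port   = htons(80),
 *     };
 *     inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *     connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The -EINVAL and -EAFNOSUPPORT checks at the top of the function correspond
 * to a short addr_len or a non-AF_INET sin_family in that sockaddr.
 */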
267
268 /*
269  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
270  * It can be called through tcp_release_cb() if socket was owned by user
271  * at the time tcp_v4_err() was called to handle ICMP message.
272  */
273 void tcp_v4_mtu_reduced(struct sock *sk)
274 {
275         struct dst_entry *dst;
276         struct inet_sock *inet = inet_sk(sk);
277         u32 mtu = tcp_sk(sk)->mtu_info;
278
279         dst = inet_csk_update_pmtu(sk, mtu);
280         if (!dst)
281                 return;
282
283         /* Something is about to go wrong... Remember the soft error
284          * in case this connection is not able to recover.
285          */
286         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
287                 sk->sk_err_soft = EMSGSIZE;
288
289         mtu = dst_mtu(dst);
290
291         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
292             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
293                 tcp_sync_mss(sk, mtu);
294
295                 /* Resend the TCP packet because it's
296                  * clear that the old packet has been
297                  * dropped. This is the new "fast" path mtu
298                  * discovery.
299                  */
300                 tcp_simple_retransmit(sk);
301         } /* else let the usual retransmit timer handle it */
302 }
303 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
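/*
 * Illustrative note: inet->pmtudisc checked above reflects the socket's
 * IP_MTU_DISCOVER setting; with IP_PMTUDISC_DONT the socket does not shrink
 * its MSS in response to the ICMP-learned path MTU. A sketch of selecting
 * that mode from userspace:
 *
 *     int val = IP_PMTUDISC_DONT;
 *     setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 */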
304
305 static void do_redirect(struct sk_buff *skb, struct sock *sk)
306 {
307         struct dst_entry *dst = __sk_dst_check(sk, 0);
308
309         if (dst)
310                 dst->ops->redirect(dst, sk, skb);
311 }
312
313 /*
314  * This routine is called by the ICMP module when it gets some
315  * sort of error condition.  If err < 0 then the socket should
316  * be closed and the error returned to the user.  If err > 0
317  * it's just the icmp type << 8 | icmp code.  After adjustment
318  * header points to the first 8 bytes of the tcp header.  We need
319  * to find the appropriate port.
320  *
321  * The locking strategy used here is very "optimistic". When
322  * someone else accesses the socket the ICMP is just dropped
323  * and for some paths there is no check at all.
324  * A more general error queue to queue errors for later handling
325  * is probably better.
326  *
327  */
328
329 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
330 {
331         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
332         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
333         struct inet_connection_sock *icsk;
334         struct tcp_sock *tp;
335         struct inet_sock *inet;
336         const int type = icmp_hdr(icmp_skb)->type;
337         const int code = icmp_hdr(icmp_skb)->code;
338         struct sock *sk;
339         struct sk_buff *skb;
340         struct request_sock *req;
341         __u32 seq;
342         __u32 remaining;
343         int err;
344         struct net *net = dev_net(icmp_skb->dev);
345
346         if (icmp_skb->len < (iph->ihl << 2) + 8) {
347                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
348                 return;
349         }
350
351         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
352                         iph->saddr, th->source, inet_iif(icmp_skb));
353         if (!sk) {
354                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
355                 return;
356         }
357         if (sk->sk_state == TCP_TIME_WAIT) {
358                 inet_twsk_put(inet_twsk(sk));
359                 return;
360         }
361
362         bh_lock_sock(sk);
363         /* If too many ICMPs get dropped on busy
364          * servers this needs to be solved differently.
365          * We do take care of the PMTU discovery (RFC1191) special case:
366          * we can receive locally generated ICMP messages while the socket is held.
367          */
368         if (sock_owned_by_user(sk)) {
369                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
370                         NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
371         }
372         if (sk->sk_state == TCP_CLOSE)
373                 goto out;
374
375         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
376                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
377                 goto out;
378         }
379
380         icsk = inet_csk(sk);
381         tp = tcp_sk(sk);
382         req = tp->fastopen_rsk;
383         seq = ntohl(th->seq);
384         if (sk->sk_state != TCP_LISTEN &&
385             !between(seq, tp->snd_una, tp->snd_nxt) &&
386             (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
387                 /* For a Fast Open socket, allow seq to be snt_isn. */
388                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
389                 goto out;
390         }
391
392         switch (type) {
393         case ICMP_REDIRECT:
394                 do_redirect(icmp_skb, sk);
395                 goto out;
396         case ICMP_SOURCE_QUENCH:
397                 /* Just silently ignore these. */
398                 goto out;
399         case ICMP_PARAMETERPROB:
400                 err = EPROTO;
401                 break;
402         case ICMP_DEST_UNREACH:
403                 if (code > NR_ICMP_UNREACH)
404                         goto out;
405
406                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
407                         /* We are not interested in TCP_LISTEN and open_requests
408                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
409                          * they should go through unfragmented).
410                          */
411                         if (sk->sk_state == TCP_LISTEN)
412                                 goto out;
413
414                         tp->mtu_info = info;
415                         if (!sock_owned_by_user(sk)) {
416                                 tcp_v4_mtu_reduced(sk);
417                         } else {
418                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
419                                         sock_hold(sk);
420                         }
421                         goto out;
422                 }
423
424                 err = icmp_err_convert[code].errno;
425                 /* check if icmp_skb allows revert of backoff
426                  * (see draft-zimmermann-tcp-lcd) */
427                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
428                         break;
429                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
430                     !icsk->icsk_backoff)
431                         break;
432
433                 /* XXX (TFO) - revisit the following logic for TFO */
434
435                 if (sock_owned_by_user(sk))
436                         break;
437
438                 icsk->icsk_backoff--;
439                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
440                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
441                 tcp_bound_rto(sk);
442
443                 skb = tcp_write_queue_head(sk);
444                 BUG_ON(!skb);
445
446                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
447                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
448
449                 if (remaining) {
450                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
451                                                   remaining, TCP_RTO_MAX);
452                 } else {
453                         /* RTO revert clocked out retransmission.
454                          * Will retransmit now */
455                         tcp_retransmit_timer(sk);
456                 }
457
458                 break;
459         case ICMP_TIME_EXCEEDED:
460                 err = EHOSTUNREACH;
461                 break;
462         default:
463                 goto out;
464         }
465
466         /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
467          * than following the TCP_SYN_RECV case and closing the socket,
468          * we ignore the ICMP error and keep trying like a fully established
469          * socket. Is this the right thing to do?
470          */
471         if (req && req->sk == NULL)
472                 goto out;
473
474         switch (sk->sk_state) {
475                 struct request_sock *req, **prev;
476         case TCP_LISTEN:
477                 if (sock_owned_by_user(sk))
478                         goto out;
479
480                 req = inet_csk_search_req(sk, &prev, th->dest,
481                                           iph->daddr, iph->saddr);
482                 if (!req)
483                         goto out;
484
485                 /* ICMPs are not backlogged, hence we cannot get
486                    an established socket here.
487                  */
488                 WARN_ON(req->sk);
489
490                 if (seq != tcp_rsk(req)->snt_isn) {
491                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
492                         goto out;
493                 }
494
495                 /*
496                  * Still in SYN_RECV, just remove it silently.
497                  * There is no good way to pass the error to the newly
498                  * created socket, and POSIX does not want network
499                  * errors returned from accept().
500                  */
501                 inet_csk_reqsk_queue_drop(sk, req, prev);
502                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
503                 goto out;
504
505         case TCP_SYN_SENT:
506         case TCP_SYN_RECV:  /* Normally cannot happen.
507                                It can, e.g., if SYNs crossed,
508                                or with Fast Open.
509                              */
510                 if (!sock_owned_by_user(sk)) {
511                         sk->sk_err = err;
512
513                         sk->sk_error_report(sk);
514
515                         tcp_done(sk);
516                 } else {
517                         sk->sk_err_soft = err;
518                 }
519                 goto out;
520         }
521
522         /* If we've already connected we will keep trying
523          * until we time out, or the user gives up.
524          *
525          * rfc1122 4.2.3.9 allows us to consider as hard errors
526          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
527          * but it is obsoleted by pmtu discovery).
528          *
529          * Note that in the modern internet, where routing is unreliable
530          * and broken firewalls sit in every dark corner, sending random
531          * errors ordered by their masters, even these two messages finally
532          * lose their original sense (even Linux sends invalid PORT_UNREACHs).
533          *
534          * Now we are in compliance with RFCs.
535          *                                                      --ANK (980905)
536          */
537
538         inet = inet_sk(sk);
539         if (!sock_owned_by_user(sk) && inet->recverr) {
540                 sk->sk_err = err;
541                 sk->sk_error_report(sk);
542         } else  { /* Only an error on timeout */
543                 sk->sk_err_soft = err;
544         }
545
546 out:
547         bh_unlock_sock(sk);
548         sock_put(sk);
549 }
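/*
 * Illustrative note: inet->recverr tested above is controlled by the
 * IP_RECVERR socket option; with it enabled, ICMP-derived errors are raised
 * on the socket immediately rather than surfacing only as soft errors on
 * timeout. A sketch of enabling it from userspace:
 *
 *     int on = 1;
 *     setsockopt(fd, IPPROTO_IP, IP_RECVERR, &on, sizeof(on));
 */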
550
551 static void __tcp_v4_send_check(struct sk_buff *skb,
552                                 __be32 saddr, __be32 daddr)
553 {
554         struct tcphdr *th = tcp_hdr(skb);
555
556         if (skb->ip_summed == CHECKSUM_PARTIAL) {
557                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
558                 skb->csum_start = skb_transport_header(skb) - skb->head;
559                 skb->csum_offset = offsetof(struct tcphdr, check);
560         } else {
561                 th->check = tcp_v4_check(skb->len, saddr, daddr,
562                                          csum_partial(th,
563                                                       th->doff << 2,
564                                                       skb->csum));
565         }
566 }
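/*
 * Descriptive note on the two branches above: with CHECKSUM_PARTIAL only the
 * pseudo-header sum is stored in th->check, and csum_start/csum_offset tell
 * the NIC (or the software fallback) where to finish the checksum over the
 * TCP header and payload; otherwise the full checksum is computed here in
 * software, folding in the payload csum already accumulated in the skb.
 */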
567
568 /* This routine computes an IPv4 TCP checksum. */
569 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
570 {
571         const struct inet_sock *inet = inet_sk(sk);
572
573         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
574 }
575 EXPORT_SYMBOL(tcp_v4_send_check);
576
577 int tcp_v4_gso_send_check(struct sk_buff *skb)
578 {
579         const struct iphdr *iph;
580         struct tcphdr *th;
581
582         if (!pskb_may_pull(skb, sizeof(*th)))
583                 return -EINVAL;
584
585         iph = ip_hdr(skb);
586         th = tcp_hdr(skb);
587
588         th->check = 0;
589         skb->ip_summed = CHECKSUM_PARTIAL;
590         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
591         return 0;
592 }
593
594 /*
595  *      This routine will send an RST to the other tcp.
596  *
597  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
598  *                    for the reset.
599  *      Answer: if a packet caused the RST, it is not for a socket
600  *              existing in our system; if it did match a socket,
601  *              it is just a duplicate segment or a bug in the other side's TCP.
602  *              So we build the reply based only on the parameters that
603  *              arrived with the segment.
604  *      Exception: precedence violation. We do not implement it in any case.
605  */
606
607 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
608 {
609         const struct tcphdr *th = tcp_hdr(skb);
610         struct {
611                 struct tcphdr th;
612 #ifdef CONFIG_TCP_MD5SIG
613                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
614 #endif
615         } rep;
616         struct ip_reply_arg arg;
617 #ifdef CONFIG_TCP_MD5SIG
618         struct tcp_md5sig_key *key;
619         const __u8 *hash_location = NULL;
620         unsigned char newhash[16];
621         int genhash;
622         struct sock *sk1 = NULL;
623 #endif
624         struct net *net;
625
626         /* Never send a reset in response to a reset. */
627         if (th->rst)
628                 return;
629
630         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
631                 return;
632
633         /* Swap the send and the receive. */
634         memset(&rep, 0, sizeof(rep));
635         rep.th.dest   = th->source;
636         rep.th.source = th->dest;
637         rep.th.doff   = sizeof(struct tcphdr) / 4;
638         rep.th.rst    = 1;
639
640         if (th->ack) {
641                 rep.th.seq = th->ack_seq;
642         } else {
643                 rep.th.ack = 1;
644                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
645                                        skb->len - (th->doff << 2));
646         }
647
648         memset(&arg, 0, sizeof(arg));
649         arg.iov[0].iov_base = (unsigned char *)&rep;
650         arg.iov[0].iov_len  = sizeof(rep.th);
651
652 #ifdef CONFIG_TCP_MD5SIG
653         hash_location = tcp_parse_md5sig_option(th);
654         if (!sk && hash_location) {
655                 /*
656                  * The active side is lost. Try to find the listening socket
657                  * through the source port, and then find the md5 key through
658                  * the listening socket. We do not loosen security here:
659                  * the incoming packet is checked against the md5 hash of the
660                  * key we find, and no RST is generated if the hash doesn't match.
661                  */
662                 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
663                                              &tcp_hashinfo, ip_hdr(skb)->saddr,
664                                              th->source, ip_hdr(skb)->daddr,
665                                              ntohs(th->source), inet_iif(skb));
666                 /* don't send rst if it can't find key */
667                 if (!sk1)
668                         return;
669                 rcu_read_lock();
670                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
671                                         &ip_hdr(skb)->saddr, AF_INET);
672                 if (!key)
673                         goto release_sk1;
674
675                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
676                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
677                         goto release_sk1;
678         } else {
679                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
680                                              &ip_hdr(skb)->saddr,
681                                              AF_INET) : NULL;
682         }
683
684         if (key) {
685                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
686                                    (TCPOPT_NOP << 16) |
687                                    (TCPOPT_MD5SIG << 8) |
688                                    TCPOLEN_MD5SIG);
689                 /* Update length and the length the header thinks exists */
690                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
691                 rep.th.doff = arg.iov[0].iov_len / 4;
692
693                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
694                                      key, ip_hdr(skb)->saddr,
695                                      ip_hdr(skb)->daddr, &rep.th);
696         }
697 #endif
698         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
699                                       ip_hdr(skb)->saddr, /* XXX */
700                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
701         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
702         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
703         /* When the socket is gone, all binding information is lost.
704          * Routing might fail in this case. There is no choice here: if we force
705          * the input interface, we will misroute in case of an asymmetric route.
706          */
707         if (sk)
708                 arg.bound_dev_if = sk->sk_bound_dev_if;
709
710         net = dev_net(skb_dst(skb)->dev);
711         arg.tos = ip_hdr(skb)->tos;
712         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
713                               skb, ip_hdr(skb)->saddr,
714                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
715
716         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
717         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
718
719 #ifdef CONFIG_TCP_MD5SIG
720 release_sk1:
721         if (sk1) {
722                 rcu_read_unlock();
723                 sock_put(sk1);
724         }
725 #endif
726 }
727
728 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
729    outside of socket context, is certainly ugly. What can I do?
730  */
731
732 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
733                             u32 win, u32 tsval, u32 tsecr, int oif,
734                             struct tcp_md5sig_key *key,
735                             int reply_flags, u8 tos)
736 {
737         const struct tcphdr *th = tcp_hdr(skb);
738         struct {
739                 struct tcphdr th;
740                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
741 #ifdef CONFIG_TCP_MD5SIG
742                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
743 #endif
744                         ];
745         } rep;
746         struct ip_reply_arg arg;
747         struct net *net = dev_net(skb_dst(skb)->dev);
748
749         memset(&rep.th, 0, sizeof(struct tcphdr));
750         memset(&arg, 0, sizeof(arg));
751
752         arg.iov[0].iov_base = (unsigned char *)&rep;
753         arg.iov[0].iov_len  = sizeof(rep.th);
754         if (tsecr) {
755                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
756                                    (TCPOPT_TIMESTAMP << 8) |
757                                    TCPOLEN_TIMESTAMP);
758                 rep.opt[1] = htonl(tsval);
759                 rep.opt[2] = htonl(tsecr);
760                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
761         }
762
763         /* Swap the send and the receive. */
764         rep.th.dest    = th->source;
765         rep.th.source  = th->dest;
766         rep.th.doff    = arg.iov[0].iov_len / 4;
767         rep.th.seq     = htonl(seq);
768         rep.th.ack_seq = htonl(ack);
769         rep.th.ack     = 1;
770         rep.th.window  = htons(win);
771
772 #ifdef CONFIG_TCP_MD5SIG
773         if (key) {
774                 int offset = (tsecr) ? 3 : 0;
775
776                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
777                                           (TCPOPT_NOP << 16) |
778                                           (TCPOPT_MD5SIG << 8) |
779                                           TCPOLEN_MD5SIG);
780                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
781                 rep.th.doff = arg.iov[0].iov_len/4;
782
783                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
784                                     key, ip_hdr(skb)->saddr,
785                                     ip_hdr(skb)->daddr, &rep.th);
786         }
787 #endif
788         arg.flags = reply_flags;
789         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
790                                       ip_hdr(skb)->saddr, /* XXX */
791                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
792         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
793         if (oif)
794                 arg.bound_dev_if = oif;
795         arg.tos = tos;
796         ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
797                               skb, ip_hdr(skb)->saddr,
798                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
799
800         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
801 }
802
803 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
804 {
805         struct inet_timewait_sock *tw = inet_twsk(sk);
806         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
807
808         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
809                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
810                         tcp_time_stamp + tcptw->tw_ts_offset,
811                         tcptw->tw_ts_recent,
812                         tw->tw_bound_dev_if,
813                         tcp_twsk_md5_key(tcptw),
814                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
815                         tw->tw_tos
816                         );
817
818         inet_twsk_put(tw);
819 }
820
821 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
822                                   struct request_sock *req)
823 {
824         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
825          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
826          */
827         tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
828                         tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
829                         tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
830                         tcp_time_stamp,
831                         req->ts_recent,
832                         0,
833                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
834                                           AF_INET),
835                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
836                         ip_hdr(skb)->tos);
837 }
838
839 /*
840  *      Send a SYN-ACK after having received a SYN.
841  *      This still operates on a request_sock only, not on a big
842  *      socket.
843  */
844 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
845                               struct request_sock *req,
846                               u16 queue_mapping,
847                               bool nocache)
848 {
849         const struct inet_request_sock *ireq = inet_rsk(req);
850         struct flowi4 fl4;
851         int err = -1;
852         struct sk_buff *skb;
853
854         /* First, grab a route. */
855         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
856                 return -1;
857
858         skb = tcp_make_synack(sk, dst, req, NULL);
859
860         if (skb) {
861                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
862
863                 skb_set_queue_mapping(skb, queue_mapping);
864                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
865                                             ireq->rmt_addr,
866                                             ireq->opt);
867                 err = net_xmit_eval(err);
868                 if (!tcp_rsk(req)->snt_synack && !err)
869                         tcp_rsk(req)->snt_synack = tcp_time_stamp;
870         }
871
872         return err;
873 }
874
875 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
876 {
877         int res = tcp_v4_send_synack(sk, NULL, req, 0, false);
878
879         if (!res)
880                 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
881         return res;
882 }
883
884 /*
885  *      IPv4 request_sock destructor.
886  */
887 static void tcp_v4_reqsk_destructor(struct request_sock *req)
888 {
889         kfree(inet_rsk(req)->opt);
890 }
891
892 /*
893  * Return true if a syncookie should be sent
894  */
895 bool tcp_syn_flood_action(struct sock *sk,
896                          const struct sk_buff *skb,
897                          const char *proto)
898 {
899         const char *msg = "Dropping request";
900         bool want_cookie = false;
901         struct listen_sock *lopt;
902
905 #ifdef CONFIG_SYN_COOKIES
906         if (sysctl_tcp_syncookies) {
907                 msg = "Sending cookies";
908                 want_cookie = true;
909                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
910         } else
911 #endif
912                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
913
914         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
915         if (!lopt->synflood_warned) {
916                 lopt->synflood_warned = 1;
917                 pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
918                         proto, ntohs(tcp_hdr(skb)->dest), msg);
919         }
920         return want_cookie;
921 }
922 EXPORT_SYMBOL(tcp_syn_flood_action);
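/*
 * Illustrative note: sysctl_tcp_syncookies checked above corresponds to
 * net.ipv4.tcp_syncookies; when it is non-zero, a listener whose SYN queue
 * overflows answers with SYN cookies instead of silently dropping the SYN.
 * A sketch of enabling it system-wide:
 *
 *     # sysctl -w net.ipv4.tcp_syncookies=1
 */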
923
924 /*
925  * Save and compile IPv4 options into the request_sock if needed.
926  */
927 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
928 {
929         const struct ip_options *opt = &(IPCB(skb)->opt);
930         struct ip_options_rcu *dopt = NULL;
931
932         if (opt && opt->optlen) {
933                 int opt_size = sizeof(*dopt) + opt->optlen;
934
935                 dopt = kmalloc(opt_size, GFP_ATOMIC);
936                 if (dopt) {
937                         if (ip_options_echo(&dopt->opt, skb)) {
938                                 kfree(dopt);
939                                 dopt = NULL;
940                         }
941                 }
942         }
943         return dopt;
944 }
945
946 #ifdef CONFIG_TCP_MD5SIG
947 /*
948  * RFC2385 MD5 checksumming requires a mapping of
949  * IP address->MD5 Key.
950  * We need to maintain these in the sk structure.
951  */
952
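/*
 * Illustrative sketch (typical userspace usage, for reference only): an MD5
 * key is normally installed with setsockopt(TCP_MD5SIG) before connect() or
 * on a listening socket; with "peer" a struct sockaddr_in holding the remote
 * address, roughly:
 *
 *     struct tcp_md5sig md5 = { };
 *     memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *     md5.tcpm_keylen = strlen(secret);
 *     memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *     setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * tcp_v4_parse_md5_keys() further down is the kernel-side handler for that
 * option, and tcp_md5_do_add()/tcp_md5_do_del() maintain the per-socket list.
 */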
953 /* Find the Key structure for an address.  */
954 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
955                                          const union tcp_md5_addr *addr,
956                                          int family)
957 {
958         struct tcp_sock *tp = tcp_sk(sk);
959         struct tcp_md5sig_key *key;
960         unsigned int size = sizeof(struct in_addr);
961         struct tcp_md5sig_info *md5sig;
962
963         /* caller either holds rcu_read_lock() or socket lock */
964         md5sig = rcu_dereference_check(tp->md5sig_info,
965                                        sock_owned_by_user(sk) ||
966                                        lockdep_is_held(&sk->sk_lock.slock));
967         if (!md5sig)
968                 return NULL;
969 #if IS_ENABLED(CONFIG_IPV6)
970         if (family == AF_INET6)
971                 size = sizeof(struct in6_addr);
972 #endif
973         hlist_for_each_entry_rcu(key, &md5sig->head, node) {
974                 if (key->family != family)
975                         continue;
976                 if (!memcmp(&key->addr, addr, size))
977                         return key;
978         }
979         return NULL;
980 }
981 EXPORT_SYMBOL(tcp_md5_do_lookup);
982
983 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
984                                          struct sock *addr_sk)
985 {
986         union tcp_md5_addr *addr;
987
988         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
989         return tcp_md5_do_lookup(sk, addr, AF_INET);
990 }
991 EXPORT_SYMBOL(tcp_v4_md5_lookup);
992
993 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
994                                                       struct request_sock *req)
995 {
996         union tcp_md5_addr *addr;
997
998         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
999         return tcp_md5_do_lookup(sk, addr, AF_INET);
1000 }
1001
1002 /* This can be called on a newly created socket, from other files */
1003 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1004                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1005 {
1006         /* Add Key to the list */
1007         struct tcp_md5sig_key *key;
1008         struct tcp_sock *tp = tcp_sk(sk);
1009         struct tcp_md5sig_info *md5sig;
1010
1011         key = tcp_md5_do_lookup(sk, addr, family);
1012         if (key) {
1013                 /* Pre-existing entry - just update that one. */
1014                 memcpy(key->key, newkey, newkeylen);
1015                 key->keylen = newkeylen;
1016                 return 0;
1017         }
1018
1019         md5sig = rcu_dereference_protected(tp->md5sig_info,
1020                                            sock_owned_by_user(sk) ||
1021                                            lockdep_is_held(&sk->sk_lock.slock));
1022         if (!md5sig) {
1023                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1024                 if (!md5sig)
1025                         return -ENOMEM;
1026
1027                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1028                 INIT_HLIST_HEAD(&md5sig->head);
1029                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1030         }
1031
1032         key = sock_kmalloc(sk, sizeof(*key), gfp);
1033         if (!key)
1034                 return -ENOMEM;
1035         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1036                 sock_kfree_s(sk, key, sizeof(*key));
1037                 return -ENOMEM;
1038         }
1039
1040         memcpy(key->key, newkey, newkeylen);
1041         key->keylen = newkeylen;
1042         key->family = family;
1043         memcpy(&key->addr, addr,
1044                (family == AF_INET6) ? sizeof(struct in6_addr) :
1045                                       sizeof(struct in_addr));
1046         hlist_add_head_rcu(&key->node, &md5sig->head);
1047         return 0;
1048 }
1049 EXPORT_SYMBOL(tcp_md5_do_add);
1050
1051 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1052 {
1053         struct tcp_sock *tp = tcp_sk(sk);
1054         struct tcp_md5sig_key *key;
1055         struct tcp_md5sig_info *md5sig;
1056
1057         key = tcp_md5_do_lookup(sk, addr, family);
1058         if (!key)
1059                 return -ENOENT;
1060         hlist_del_rcu(&key->node);
1061         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1062         kfree_rcu(key, rcu);
1063         md5sig = rcu_dereference_protected(tp->md5sig_info,
1064                                            sock_owned_by_user(sk));
1065         if (hlist_empty(&md5sig->head))
1066                 tcp_free_md5sig_pool();
1067         return 0;
1068 }
1069 EXPORT_SYMBOL(tcp_md5_do_del);
1070
1071 static void tcp_clear_md5_list(struct sock *sk)
1072 {
1073         struct tcp_sock *tp = tcp_sk(sk);
1074         struct tcp_md5sig_key *key;
1075         struct hlist_node *n;
1076         struct tcp_md5sig_info *md5sig;
1077
1078         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1079
1080         if (!hlist_empty(&md5sig->head))
1081                 tcp_free_md5sig_pool();
1082         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1083                 hlist_del_rcu(&key->node);
1084                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1085                 kfree_rcu(key, rcu);
1086         }
1087 }
1088
1089 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1090                                  int optlen)
1091 {
1092         struct tcp_md5sig cmd;
1093         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1094
1095         if (optlen < sizeof(cmd))
1096                 return -EINVAL;
1097
1098         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1099                 return -EFAULT;
1100
1101         if (sin->sin_family != AF_INET)
1102                 return -EINVAL;
1103
1104         if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1105                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1106                                       AF_INET);
1107
1108         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1109                 return -EINVAL;
1110
1111         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1112                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1113                               GFP_KERNEL);
1114 }
1115
1116 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1117                                         __be32 daddr, __be32 saddr, int nbytes)
1118 {
1119         struct tcp4_pseudohdr *bp;
1120         struct scatterlist sg;
1121
1122         bp = &hp->md5_blk.ip4;
1123
1124         /*
1125          * 1. the TCP pseudo-header (in the order: source IP address,
1126          * destination IP address, zero-padded protocol number, and
1127          * segment length)
1128          */
1129         bp->saddr = saddr;
1130         bp->daddr = daddr;
1131         bp->pad = 0;
1132         bp->protocol = IPPROTO_TCP;
1133         bp->len = cpu_to_be16(nbytes);
1134
1135         sg_init_one(&sg, bp, sizeof(*bp));
1136         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1137 }
1138
1139 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1140                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1141 {
1142         struct tcp_md5sig_pool *hp;
1143         struct hash_desc *desc;
1144
1145         hp = tcp_get_md5sig_pool();
1146         if (!hp)
1147                 goto clear_hash_noput;
1148         desc = &hp->md5_desc;
1149
1150         if (crypto_hash_init(desc))
1151                 goto clear_hash;
1152         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1153                 goto clear_hash;
1154         if (tcp_md5_hash_header(hp, th))
1155                 goto clear_hash;
1156         if (tcp_md5_hash_key(hp, key))
1157                 goto clear_hash;
1158         if (crypto_hash_final(desc, md5_hash))
1159                 goto clear_hash;
1160
1161         tcp_put_md5sig_pool();
1162         return 0;
1163
1164 clear_hash:
1165         tcp_put_md5sig_pool();
1166 clear_hash_noput:
1167         memset(md5_hash, 0, 16);
1168         return 1;
1169 }
1170
1171 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1172                         const struct sock *sk, const struct request_sock *req,
1173                         const struct sk_buff *skb)
1174 {
1175         struct tcp_md5sig_pool *hp;
1176         struct hash_desc *desc;
1177         const struct tcphdr *th = tcp_hdr(skb);
1178         __be32 saddr, daddr;
1179
1180         if (sk) {
1181                 saddr = inet_sk(sk)->inet_saddr;
1182                 daddr = inet_sk(sk)->inet_daddr;
1183         } else if (req) {
1184                 saddr = inet_rsk(req)->loc_addr;
1185                 daddr = inet_rsk(req)->rmt_addr;
1186         } else {
1187                 const struct iphdr *iph = ip_hdr(skb);
1188                 saddr = iph->saddr;
1189                 daddr = iph->daddr;
1190         }
1191
1192         hp = tcp_get_md5sig_pool();
1193         if (!hp)
1194                 goto clear_hash_noput;
1195         desc = &hp->md5_desc;
1196
1197         if (crypto_hash_init(desc))
1198                 goto clear_hash;
1199
1200         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1201                 goto clear_hash;
1202         if (tcp_md5_hash_header(hp, th))
1203                 goto clear_hash;
1204         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1205                 goto clear_hash;
1206         if (tcp_md5_hash_key(hp, key))
1207                 goto clear_hash;
1208         if (crypto_hash_final(desc, md5_hash))
1209                 goto clear_hash;
1210
1211         tcp_put_md5sig_pool();
1212         return 0;
1213
1214 clear_hash:
1215         tcp_put_md5sig_pool();
1216 clear_hash_noput:
1217         memset(md5_hash, 0, 16);
1218         return 1;
1219 }
1220 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
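/*
 * Descriptive note: per RFC 2385 the digest computed above covers, in order,
 * the TCP pseudo-header, the fixed TCP header with the checksum field zeroed
 * (options are excluded), the segment payload, and finally the key itself;
 * tcp_md5_hash_header() and tcp_md5_hash_skb_data() in tcp.c implement the
 * middle two steps.
 */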
1221
1222 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1223 {
1224         /*
1225          * This gets called for each TCP segment that arrives
1226          * so we want to be efficient.
1227          * We have 3 drop cases:
1228          * o No MD5 hash and one expected.
1229          * o MD5 hash and we're not expecting one.
1230          * o MD5 hash and it's wrong.
1231          */
1232         const __u8 *hash_location = NULL;
1233         struct tcp_md5sig_key *hash_expected;
1234         const struct iphdr *iph = ip_hdr(skb);
1235         const struct tcphdr *th = tcp_hdr(skb);
1236         int genhash;
1237         unsigned char newhash[16];
1238
1239         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1240                                           AF_INET);
1241         hash_location = tcp_parse_md5sig_option(th);
1242
1243         /* We've parsed the options - do we have a hash? */
1244         if (!hash_expected && !hash_location)
1245                 return false;
1246
1247         if (hash_expected && !hash_location) {
1248                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1249                 return true;
1250         }
1251
1252         if (!hash_expected && hash_location) {
1253                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1254                 return true;
1255         }
1256
1257         /* Okay, so this is hash_expected and hash_location -
1258          * so we need to calculate the checksum.
1259          */
1260         genhash = tcp_v4_md5_hash_skb(newhash,
1261                                       hash_expected,
1262                                       NULL, NULL, skb);
1263
1264         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1265                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1266                                      &iph->saddr, ntohs(th->source),
1267                                      &iph->daddr, ntohs(th->dest),
1268                                      genhash ? " tcp_v4_calc_md5_hash failed"
1269                                      : "");
1270                 return true;
1271         }
1272         return false;
1273 }
1274
1275 #endif
1276
1277 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1278         .family         =       PF_INET,
1279         .obj_size       =       sizeof(struct tcp_request_sock),
1280         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1281         .send_ack       =       tcp_v4_reqsk_send_ack,
1282         .destructor     =       tcp_v4_reqsk_destructor,
1283         .send_reset     =       tcp_v4_send_reset,
1284         .syn_ack_timeout =      tcp_syn_ack_timeout,
1285 };
1286
1287 #ifdef CONFIG_TCP_MD5SIG
1288 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1289         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1290         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1291 };
1292 #endif
1293
1294 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1295                                struct request_sock *req,
1296                                struct tcp_fastopen_cookie *foc,
1297                                struct tcp_fastopen_cookie *valid_foc)
1298 {
1299         bool skip_cookie = false;
1300         struct fastopen_queue *fastopenq;
1301
1302         if (likely(!fastopen_cookie_present(foc))) {
1303                 /* See include/net/tcp.h for the meaning of these knobs */
1304                 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1305                     ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1306                     (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1307                         skip_cookie = true; /* no cookie to validate */
1308                 else
1309                         return false;
1310         }
1311         fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1312         /* A FO option is present; bump the counter. */
1313         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1314
1315         /* Make sure the listener has enabled fastopen, and we don't
1316          * exceed the max # of pending TFO requests allowed before trying
1317          * to validate the cookie, in order to avoid burning CPU cycles
1318          * unnecessarily.
1319          *
1320          * XXX (TFO) - The implication of checking the max_qlen before
1321          * processing a cookie request is that clients can't differentiate
1322          * between qlen overflow causing Fast Open to be disabled
1323          * temporarily vs a server not supporting Fast Open at all.
1324          */
1325         if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1326             fastopenq == NULL || fastopenq->max_qlen == 0)
1327                 return false;
1328
1329         if (fastopenq->qlen >= fastopenq->max_qlen) {
1330                 struct request_sock *req1;
1331                 spin_lock(&fastopenq->lock);
1332                 req1 = fastopenq->rskq_rst_head;
1333                 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1334                         spin_unlock(&fastopenq->lock);
1335                         NET_INC_STATS_BH(sock_net(sk),
1336                             LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1337                         /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1338                         foc->len = -1;
1339                         return false;
1340                 }
1341                 fastopenq->rskq_rst_head = req1->dl_next;
1342                 fastopenq->qlen--;
1343                 spin_unlock(&fastopenq->lock);
1344                 reqsk_free(req1);
1345         }
1346         if (skip_cookie) {
1347                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1348                 return true;
1349         }
1350         if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1351                 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1352                         tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1353                         if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1354                             memcmp(&foc->val[0], &valid_foc->val[0],
1355                             TCP_FASTOPEN_COOKIE_SIZE) != 0)
1356                                 return false;
1357                         valid_foc->len = -1;
1358                 }
1359                 /* Acknowledge the data received from the peer. */
1360                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1361                 return true;
1362         } else if (foc->len == 0) { /* Client requesting a cookie */
1363                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1364                 NET_INC_STATS_BH(sock_net(sk),
1365                     LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1366         } else {
1367                 /* Client sent a cookie with wrong size. Treat it
1368                  * the same as invalid and return a valid one.
1369                  */
1370                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1371         }
1372         return false;
1373 }
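/*
 * Illustrative note: the TFO_* bits tested above come from the
 * net.ipv4.tcp_fastopen sysctl, and a server additionally opts in per
 * listening socket, which is what sizes the fastopenq->max_qlen consulted
 * here. A sketch of the server-side opt-in on a listening socket:
 *
 *     int qlen = 16;
 *     setsockopt(listen_fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
 */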
1374
1375 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1376                                     struct sk_buff *skb,
1377                                     struct sk_buff *skb_synack,
1378                                     struct request_sock *req)
1379 {
1380         struct tcp_sock *tp = tcp_sk(sk);
1381         struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1382         const struct inet_request_sock *ireq = inet_rsk(req);
1383         struct sock *child;
1384         int err;
1385
1386         req->num_retrans = 0;
1387         req->num_timeout = 0;
1388         req->sk = NULL;
1389
1390         child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1391         if (child == NULL) {
1392                 NET_INC_STATS_BH(sock_net(sk),
1393                                  LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1394                 kfree_skb(skb_synack);
1395                 return -1;
1396         }
1397         err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1398                                     ireq->rmt_addr, ireq->opt);
1399         err = net_xmit_eval(err);
1400         if (!err)
1401                 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1402         /* XXX (TFO) - is it ok to ignore error and continue? */
1403
1404         spin_lock(&queue->fastopenq->lock);
1405         queue->fastopenq->qlen++;
1406         spin_unlock(&queue->fastopenq->lock);
1407
1408         /* Initialize the child socket. We have to fix up some values to
1409          * take into account that the child is a Fast Open socket and is
1410          * created only out of the bits carried in the SYN packet.
1411          */
1412         tp = tcp_sk(child);
1413
1414         tp->fastopen_rsk = req;
1415         /* Do a hold on the listener sk so that if the listener is being
1416          * closed, the child that has been accepted can live on and still
1417          * access listen_lock.
1418          */
1419         sock_hold(sk);
1420         tcp_rsk(req)->listener = sk;
1421
1422         /* RFC1323: The window in SYN & SYN/ACK segments is never
1423          * scaled. So correct it appropriately.
1424          */
1425         tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1426
1427         /* Activate the retrans timer so that SYNACK can be retransmitted.
1428          * The request socket is not added to the SYN table of the parent
1429          * because it's been added to the accept queue directly.
1430          */
1431         inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1432             TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1433
1434         /* Add the child socket directly into the accept queue */
1435         inet_csk_reqsk_queue_add(sk, req, child);
1436
1437         /* Now finish processing the fastopen child socket. */
1438         inet_csk(child)->icsk_af_ops->rebuild_header(child);
1439         tcp_init_congestion_control(child);
1440         tcp_mtup_init(child);
1441         tcp_init_buffer_space(child);
1442         tcp_init_metrics(child);
1443
1444         /* Queue the data carried in the SYN packet. We need to first
1445          * bump skb's refcnt because the caller will attempt to free it.
1446          *
1447          * XXX (TFO) - we honor a zero-payload TFO request for now.
1448          * (Any reason not to?)
1449          */
1450         if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1451                 /* Don't queue the skb if there is no payload in SYN.
1452                  * XXX (TFO) - How about SYN+FIN?
1453                  */
1454                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1455         } else {
1456                 skb = skb_get(skb);
1457                 skb_dst_drop(skb);
1458                 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1459                 skb_set_owner_r(skb, child);
1460                 __skb_queue_tail(&child->sk_receive_queue, skb);
1461                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1462                 tp->syn_data_acked = 1;
1463         }
1464         sk->sk_data_ready(sk, 0);
1465         bh_unlock_sock(child);
1466         sock_put(child);
1467         WARN_ON(req->sk == NULL);
1468         return 0;
1469 }
1470
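/* Process an incoming SYN on a listening socket: allocate a request_sock,
 * parse the TCP options, pick an initial sequence number (or a syncookie
 * when under SYN flood), and either send the SYN-ACK and hash the request
 * into the SYN table, or hand a valid Fast Open request over to
 * tcp_v4_conn_req_fastopen().
 */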
1471 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1472 {
1473         struct tcp_options_received tmp_opt;
1474         struct request_sock *req;
1475         struct inet_request_sock *ireq;
1476         struct tcp_sock *tp = tcp_sk(sk);
1477         struct dst_entry *dst = NULL;
1478         __be32 saddr = ip_hdr(skb)->saddr;
1479         __be32 daddr = ip_hdr(skb)->daddr;
1480         __u32 isn = TCP_SKB_CB(skb)->when;
1481         bool want_cookie = false;
1482         struct flowi4 fl4;
1483         struct tcp_fastopen_cookie foc = { .len = -1 };
1484         struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1485         struct sk_buff *skb_synack;
1486         int do_fastopen;
1487
1488         /* Never answer SYNs sent to broadcast or multicast addresses. */
1489         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1490                 goto drop;
1491
1492         /* TW buckets are converted to open requests without
1493          * limitations; they conserve resources and the peer is
1494          * evidently a real one.
1495          */
1496         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1497                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1498                 if (!want_cookie)
1499                         goto drop;
1500         }
1501
1502         /* Accept backlog is full. If we have already queued enough
1503          * warm entries in the SYN queue, drop the request. It is better
1504          * than clogging the SYN queue with openreqs whose timeout grows
1505          * exponentially.
1506          */
1507         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
1508                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1509                 goto drop;
1510         }
1511
1512         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1513         if (!req)
1514                 goto drop;
1515
1516 #ifdef CONFIG_TCP_MD5SIG
1517         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1518 #endif
1519
1520         tcp_clear_options(&tmp_opt);
1521         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1522         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1523         tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
1524
1525         if (want_cookie && !tmp_opt.saw_tstamp)
1526                 tcp_clear_options(&tmp_opt);
1527
1528         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1529         tcp_openreq_init(req, &tmp_opt, skb);
1530
1531         ireq = inet_rsk(req);
1532         ireq->loc_addr = daddr;
1533         ireq->rmt_addr = saddr;
1534         ireq->no_srccheck = inet_sk(sk)->transparent;
1535         ireq->opt = tcp_v4_save_options(skb);
1536         ireq->ir_mark = inet_request_mark(sk, skb);
1537
1538         if (security_inet_conn_request(sk, skb, req))
1539                 goto drop_and_free;
1540
1541         if (!want_cookie || tmp_opt.tstamp_ok)
1542                 TCP_ECN_create_request(req, skb, sock_net(sk));
1543
1544         if (want_cookie) {
1545                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1546                 req->cookie_ts = tmp_opt.tstamp_ok;
1547         } else if (!isn) {
1548                 /* VJ's idea. We save the last timestamp seen
1549                  * from the destination in the peer table when entering
1550                  * TIME-WAIT state, and check against it before
1551                  * accepting a new connection request.
1552                  *
1553                  * If "isn" is not zero, this request hit an alive
1554                  * timewait bucket, so all the necessary checks
1555                  * are made in the function processing the timewait state.
1556                  */
1557                 if (tmp_opt.saw_tstamp &&
1558                     tcp_death_row.sysctl_tw_recycle &&
1559                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1560                     fl4.daddr == saddr) {
1561                         if (!tcp_peer_is_proven(req, dst, true)) {
1562                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1563                                 goto drop_and_release;
1564                         }
1565                 }
1566                 /* Kill the following clause if you dislike this approach. */
1567                 else if (!sysctl_tcp_syncookies &&
1568                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1569                           (sysctl_max_syn_backlog >> 2)) &&
1570                          !tcp_peer_is_proven(req, dst, false)) {
1571                         /* Without syncookies the last quarter of
1572                          * the backlog is filled with destinations
1573                          * proven to be alive.
1574                          * It means that we only keep communicating
1575                          * with destinations already remembered by
1576                          * the moment the synflood started.
1577                          */
1578                         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1579                                        &saddr, ntohs(tcp_hdr(skb)->source));
1580                         goto drop_and_release;
1581                 }
1582
1583                 isn = tcp_v4_init_sequence(skb);
1584         }
1585         tcp_rsk(req)->snt_isn = isn;
1586
1587         if (dst == NULL) {
1588                 dst = inet_csk_route_req(sk, &fl4, req);
1589                 if (dst == NULL)
1590                         goto drop_and_free;
1591         }
1592         do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1593
1594         /* We don't call tcp_v4_send_synack() directly because we need
1595          * to make sure a child socket can be created successfully before
1596          * sending back synack!
1597          *
1598          * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1599          * (or better yet, call tcp_send_synack() in the child context
1600          * directly, but that will require fixing a bunch of other code first)
1601          * after syn_recv_sock(), except one will need to first fix the
1602          * latter to remove its dependency on the current implementation
1603          * of tcp_v4_send_synack()->tcp_select_initial_window().
1604          */
1605         skb_synack = tcp_make_synack(sk, dst, req,
1606             fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1607
1608         if (skb_synack) {
1609                 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1610                 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1611         } else
1612                 goto drop_and_free;
1613
1614         if (likely(!do_fastopen)) {
1615                 int err;
1616                 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1617                      ireq->rmt_addr, ireq->opt);
1618                 err = net_xmit_eval(err);
1619                 if (err || want_cookie)
1620                         goto drop_and_free;
1621
1622                 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1623                 tcp_rsk(req)->listener = NULL;
1624                 /* Add the request_sock to the SYN table */
1625                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1626                 if (fastopen_cookie_present(&foc) && foc.len != 0)
1627                         NET_INC_STATS_BH(sock_net(sk),
1628                             LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1629         } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req))
1630                 goto drop_and_free;
1631
1632         return 0;
1633
1634 drop_and_release:
1635         dst_release(dst);
1636 drop_and_free:
1637         reqsk_free(req);
1638 drop:
1639         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1640         return 0;
1641 }
1642 EXPORT_SYMBOL(tcp_v4_conn_request);
1643
1644
1645 /*
1646  * The three-way handshake has completed - we received a valid ACK
1647  * (or a valid Fast Open SYN) - now create the new socket.
1648  */
1649 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1650                                   struct request_sock *req,
1651                                   struct dst_entry *dst)
1652 {
1653         struct inet_request_sock *ireq;
1654         struct inet_sock *newinet;
1655         struct tcp_sock *newtp;
1656         struct sock *newsk;
1657 #ifdef CONFIG_TCP_MD5SIG
1658         struct tcp_md5sig_key *key;
1659 #endif
1660         struct ip_options_rcu *inet_opt;
1661
1662         if (sk_acceptq_is_full(sk))
1663                 goto exit_overflow;
1664
1665         newsk = tcp_create_openreq_child(sk, req, skb);
1666         if (!newsk)
1667                 goto exit_nonewsk;
1668
1669         newsk->sk_gso_type = SKB_GSO_TCPV4;
1670         inet_sk_rx_dst_set(newsk, skb);
1671
1672         newtp                 = tcp_sk(newsk);
1673         newinet               = inet_sk(newsk);
1674         ireq                  = inet_rsk(req);
1675         newinet->inet_daddr   = ireq->rmt_addr;
1676         newinet->inet_rcv_saddr = ireq->loc_addr;
1677         newinet->inet_saddr           = ireq->loc_addr;
1678         inet_opt              = ireq->opt;
1679         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1680         ireq->opt             = NULL;
1681         newinet->mc_index     = inet_iif(skb);
1682         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1683         newinet->rcv_tos      = ip_hdr(skb)->tos;
1684         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1685         if (inet_opt)
1686                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1687         newinet->inet_id = newtp->write_seq ^ jiffies;
1688
1689         if (!dst) {
1690                 dst = inet_csk_route_child_sock(sk, newsk, req);
1691                 if (!dst)
1692                         goto put_and_exit;
1693         } else {
1694                 /* syncookie case : see end of cookie_v4_check() */
1695         }
1696         sk_setup_caps(newsk, dst);
1697
1698         tcp_mtup_init(newsk);
1699         tcp_sync_mss(newsk, dst_mtu(dst));
1700         newtp->advmss = dst_metric_advmss(dst);
1701         if (tcp_sk(sk)->rx_opt.user_mss &&
1702             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1703                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1704
1705         tcp_initialize_rcv_mss(newsk);
1706         tcp_synack_rtt_meas(newsk, req);
1707         newtp->total_retrans = req->num_retrans;
1708
1709 #ifdef CONFIG_TCP_MD5SIG
1710         /* Copy over the MD5 key from the original socket */
1711         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1712                                 AF_INET);
1713         if (key != NULL) {
1714                 /*
1715                  * We're using one, so create a matching key
1716                  * on the newsk structure. If we fail to get
1717                  * memory, then we end up not copying the key
1718                  * across. Shucks.
1719                  */
1720                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1721                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1722                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1723         }
1724 #endif
1725
1726         if (__inet_inherit_port(sk, newsk) < 0)
1727                 goto put_and_exit;
1728         __inet_hash_nolisten(newsk, NULL);
1729
1730         return newsk;
1731
1732 exit_overflow:
1733         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1734 exit_nonewsk:
1735         dst_release(dst);
1736 exit:
1737         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1738         return NULL;
1739 put_and_exit:
1740         inet_csk_prepare_forced_close(newsk);
1741         tcp_done(newsk);
1742         goto exit;
1743 }
1744 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1745
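/* On a listening socket, match an incoming segment against a pending
 * connection request or an already established socket; failing that,
 * validate it as a possible syncookie ACK.
 */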
1746 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1747 {
1748         struct tcphdr *th = tcp_hdr(skb);
1749         const struct iphdr *iph = ip_hdr(skb);
1750         struct sock *nsk;
1751         struct request_sock **prev;
1752         /* Find possible connection requests. */
1753         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1754                                                        iph->saddr, iph->daddr);
1755         if (req)
1756                 return tcp_check_req(sk, skb, req, prev, false);
1757
1758         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1759                         th->source, iph->daddr, th->dest, inet_iif(skb));
1760
1761         if (nsk) {
1762                 if (nsk->sk_state != TCP_TIME_WAIT) {
1763                         bh_lock_sock(nsk);
1764                         return nsk;
1765                 }
1766                 inet_twsk_put(inet_twsk(nsk));
1767                 return NULL;
1768         }
1769
1770 #ifdef CONFIG_SYN_COOKIES
1771         if (!th->syn)
1772                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1773 #endif
1774         return sk;
1775 }
1776
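/* Verify or set up the TCP checksum of an incoming skb.  A hardware
 * CHECKSUM_COMPLETE value is validated immediately; otherwise the
 * pseudo-header sum is seeded and full verification is done here only
 * for short packets, deferred for everything else.
 */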
1777 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1778 {
1779         const struct iphdr *iph = ip_hdr(skb);
1780
1781         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1782                 if (!tcp_v4_check(skb->len, iph->saddr,
1783                                   iph->daddr, skb->csum)) {
1784                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1785                         return 0;
1786                 }
1787         }
1788
1789         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1790                                        skb->len, IPPROTO_TCP, 0);
1791
1792         if (skb->len <= 76) {
1793                 return __skb_checksum_complete(skb);
1794         }
1795         return 0;
1796 }
1797
1798
1799 /* The socket must have its spinlock held when we get
1800  * here.
1801  *
1802  * We have a potential double-lock case here, so even when
1803  * doing backlog processing we use the BH locking scheme.
1804  * This is because we cannot sleep with the original spinlock
1805  * held.
1806  */
1807 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1808 {
1809         struct sock *rsk;
1810 #ifdef CONFIG_TCP_MD5SIG
1811         /*
1812          * We really want to reject the packet as early as possible
1813          * if:
1814          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1815          *  o There is an MD5 option and we're not expecting one
1816          */
1817         if (tcp_v4_inbound_md5_hash(sk, skb))
1818                 goto discard;
1819 #endif
1820
1821         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1822                 struct dst_entry *dst = sk->sk_rx_dst;
1823
1824                 sock_rps_save_rxhash(sk, skb);
1825                 if (dst) {
1826                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1827                             dst->ops->check(dst, 0) == NULL) {
1828                                 dst_release(dst);
1829                                 sk->sk_rx_dst = NULL;
1830                         }
1831                 }
1832                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1833                         rsk = sk;
1834                         goto reset;
1835                 }
1836                 return 0;
1837         }
1838
1839         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1840                 goto csum_err;
1841
1842         if (sk->sk_state == TCP_LISTEN) {
1843                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1844                 if (!nsk)
1845                         goto discard;
1846
1847                 if (nsk != sk) {
1848                         sock_rps_save_rxhash(nsk, skb);
1849                         if (tcp_child_process(sk, nsk, skb)) {
1850                                 rsk = nsk;
1851                                 goto reset;
1852                         }
1853                         return 0;
1854                 }
1855         } else
1856                 sock_rps_save_rxhash(sk, skb);
1857
1858         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1859                 rsk = sk;
1860                 goto reset;
1861         }
1862         return 0;
1863
1864 reset:
1865         tcp_v4_send_reset(rsk, skb);
1866 discard:
1867         kfree_skb(skb);
1868         /* Be careful here. If this function gets more complicated and
1869          * gcc suffers from register pressure on the x86, sk (in %ebx)
1870          * might be destroyed here. This current version compiles correctly,
1871          * but you have been warned.
1872          */
1873         return 0;
1874
1875 csum_err:
1876         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
1877         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1878         goto discard;
1879 }
1880 EXPORT_SYMBOL(tcp_v4_do_rcv);
1881
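/* Early demultiplexing hook called from the IP layer: look up an
 * established socket for this segment and, when found, attach it and its
 * cached input route to the skb before the regular receive path runs.
 */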
1882 void tcp_v4_early_demux(struct sk_buff *skb)
1883 {
1884         const struct iphdr *iph;
1885         const struct tcphdr *th;
1886         struct sock *sk;
1887
1888         if (skb->pkt_type != PACKET_HOST)
1889                 return;
1890
1891         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1892                 return;
1893
1894         iph = ip_hdr(skb);
1895         th = tcp_hdr(skb);
1896
1897         if (th->doff < sizeof(struct tcphdr) / 4)
1898                 return;
1899
1900         sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1901                                        iph->saddr, th->source,
1902                                        iph->daddr, ntohs(th->dest),
1903                                        skb->skb_iif);
1904         if (sk) {
1905                 skb->sk = sk;
1906                 skb->destructor = sock_edemux;
1907                 if (sk->sk_state != TCP_TIME_WAIT) {
1908                         struct dst_entry *dst = ACCESS_ONCE(sk->sk_rx_dst);
1909
1910                         if (dst)
1911                                 dst = dst_check(dst, 0);
1912                         if (dst &&
1913                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1914                                 skb_dst_set_noref(skb, dst);
1915                 }
1916         }
1917 }
1918
1919 /* The packet is added to the VJ-style prequeue for processing in process
1920  * context if a reader task is waiting. Apparently, this exciting
1921  * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
1922  * failed somewhere. Latency? Burstiness? Well, at least now we will
1923  * see why it failed. 8)8)                                --ANK
1924  *
1925  */
1926 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
1927 {
1928         struct tcp_sock *tp = tcp_sk(sk);
1929
1930         if (sysctl_tcp_low_latency || !tp->ucopy.task)
1931                 return false;
1932
1933         if (skb->len <= tcp_hdrlen(skb) &&
1934             skb_queue_len(&tp->ucopy.prequeue) == 0)
1935                 return false;
1936
1937         skb_dst_force(skb);
1938         __skb_queue_tail(&tp->ucopy.prequeue, skb);
1939         tp->ucopy.memory += skb->truesize;
1940         if (tp->ucopy.memory > sk->sk_rcvbuf) {
1941                 struct sk_buff *skb1;
1942
1943                 BUG_ON(sock_owned_by_user(sk));
1944
1945                 while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
1946                         sk_backlog_rcv(sk, skb1);
1947                         NET_INC_STATS_BH(sock_net(sk),
1948                                          LINUX_MIB_TCPPREQUEUEDROPPED);
1949                 }
1950
1951                 tp->ucopy.memory = 0;
1952         } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
1953                 wake_up_interruptible_sync_poll(sk_sleep(sk),
1954                                            POLLIN | POLLRDNORM | POLLRDBAND);
1955                 if (!inet_csk_ack_scheduled(sk))
1956                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
1957                                                   (3 * tcp_rto_min(sk)) / 4,
1958                                                   TCP_RTO_MAX);
1959         }
1960         return true;
1961 }
1962 EXPORT_SYMBOL(tcp_prequeue);
1963
1964 /*
1965  *      From tcp_input.c
1966  */
1967
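/* Main receive routine for TCP/IPv4 segments.  Validates the header and
 * checksum, looks up the owning socket, and then either processes the
 * segment directly, places it on the prequeue, or appends it to the
 * backlog when the socket is owned by user context.
 */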
1968 int tcp_v4_rcv(struct sk_buff *skb)
1969 {
1970         const struct iphdr *iph;
1971         const struct tcphdr *th;
1972         struct sock *sk;
1973         int ret;
1974         struct net *net = dev_net(skb->dev);
1975
1976         if (skb->pkt_type != PACKET_HOST)
1977                 goto discard_it;
1978
1979         /* Count it even if it's bad */
1980         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1981
1982         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1983                 goto discard_it;
1984
1985         th = tcp_hdr(skb);
1986
1987         if (th->doff < sizeof(struct tcphdr) / 4)
1988                 goto bad_packet;
1989         if (!pskb_may_pull(skb, th->doff * 4))
1990                 goto discard_it;
1991
1992         /* An explanation is required here, I think.
1993          * Packet length and doff are validated by header prediction,
1994          * provided the case of th->doff == 0 is eliminated.
1995          * So we defer the checks. */
1996         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1997                 goto csum_error;
1998
1999         th = tcp_hdr(skb);
2000         iph = ip_hdr(skb);
2001         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
2002         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2003                                     skb->len - th->doff * 4);
2004         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2005         TCP_SKB_CB(skb)->when    = 0;
2006         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2007         TCP_SKB_CB(skb)->sacked  = 0;
2008
2009         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2010         if (!sk)
2011                 goto no_tcp_socket;
2012
2013 process:
2014         if (sk->sk_state == TCP_TIME_WAIT)
2015                 goto do_time_wait;
2016
2017         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2018                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2019                 goto discard_and_relse;
2020         }
2021
2022         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2023                 goto discard_and_relse;
2024         nf_reset(skb);
2025
2026         if (sk_filter(sk, skb))
2027                 goto discard_and_relse;
2028
2029         skb->dev = NULL;
2030
2031         bh_lock_sock_nested(sk);
2032         ret = 0;
2033         if (!sock_owned_by_user(sk)) {
2034 #ifdef CONFIG_NET_DMA
2035                 struct tcp_sock *tp = tcp_sk(sk);
2036                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2037                         tp->ucopy.dma_chan = net_dma_find_channel();
2038                 if (tp->ucopy.dma_chan)
2039                         ret = tcp_v4_do_rcv(sk, skb);
2040                 else
2041 #endif
2042                 {
2043                         if (!tcp_prequeue(sk, skb))
2044                                 ret = tcp_v4_do_rcv(sk, skb);
2045                 }
2046         } else if (unlikely(sk_add_backlog(sk, skb,
2047                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
2048                 bh_unlock_sock(sk);
2049                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2050                 goto discard_and_relse;
2051         }
2052         bh_unlock_sock(sk);
2053
2054         sock_put(sk);
2055
2056         return ret;
2057
2058 no_tcp_socket:
2059         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2060                 goto discard_it;
2061
2062         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2063 csum_error:
2064                 TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
2065 bad_packet:
2066                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2067         } else {
2068                 tcp_v4_send_reset(NULL, skb);
2069         }
2070
2071 discard_it:
2072         /* Discard frame. */
2073         kfree_skb(skb);
2074         return 0;
2075
2076 discard_and_relse:
2077         sock_put(sk);
2078         goto discard_it;
2079
2080 do_time_wait:
2081         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2082                 inet_twsk_put(inet_twsk(sk));
2083                 goto discard_it;
2084         }
2085
2086         if (skb->len < (th->doff << 2)) {
2087                 inet_twsk_put(inet_twsk(sk));
2088                 goto bad_packet;
2089         }
2090         if (tcp_checksum_complete(skb)) {
2091                 inet_twsk_put(inet_twsk(sk));
2092                 goto csum_error;
2093         }
2094         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2095         case TCP_TW_SYN: {
2096                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2097                                                         &tcp_hashinfo,
2098                                                         iph->saddr, th->source,
2099                                                         iph->daddr, th->dest,
2100                                                         inet_iif(skb));
2101                 if (sk2) {
2102                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2103                         inet_twsk_put(inet_twsk(sk));
2104                         sk = sk2;
2105                         goto process;
2106                 }
2107                 /* Fall through to ACK */
2108         }
2109         case TCP_TW_ACK:
2110                 tcp_v4_timewait_ack(sk, skb);
2111                 break;
2112         case TCP_TW_RST:
2113                 goto no_tcp_socket;
2114         case TCP_TW_SUCCESS:;
2115         }
2116         goto discard_it;
2117 }
2118
2119 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2120         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2121         .twsk_unique    = tcp_twsk_unique,
2122         .twsk_destructor = tcp_twsk_destructor,
2123 };
2124
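/* Cache the incoming route and arrival interface on the socket so that
 * later packets from this peer can skip the routing lookup.
 */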
2125 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2126 {
2127         struct dst_entry *dst = skb_dst(skb);
2128
2129         dst_hold(dst);
2130         sk->sk_rx_dst = dst;
2131         inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2132 }
2133 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2134
2135 const struct inet_connection_sock_af_ops ipv4_specific = {
2136         .queue_xmit        = ip_queue_xmit,
2137         .send_check        = tcp_v4_send_check,
2138         .rebuild_header    = inet_sk_rebuild_header,
2139         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2140         .conn_request      = tcp_v4_conn_request,
2141         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2142         .net_header_len    = sizeof(struct iphdr),
2143         .setsockopt        = ip_setsockopt,
2144         .getsockopt        = ip_getsockopt,
2145         .addr2sockaddr     = inet_csk_addr2sockaddr,
2146         .sockaddr_len      = sizeof(struct sockaddr_in),
2147         .bind_conflict     = inet_csk_bind_conflict,
2148 #ifdef CONFIG_COMPAT
2149         .compat_setsockopt = compat_ip_setsockopt,
2150         .compat_getsockopt = compat_ip_getsockopt,
2151 #endif
2152         .mtu_reduced       = tcp_v4_mtu_reduced,
2153 };
2154 EXPORT_SYMBOL(ipv4_specific);
2155
2156 #ifdef CONFIG_TCP_MD5SIG
2157 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2158         .md5_lookup             = tcp_v4_md5_lookup,
2159         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2160         .md5_parse              = tcp_v4_parse_md5_keys,
2161 };
2162 #endif
2163
2164 /* NOTE: A lot of things are set to zero explicitly by the call to
2165  *       sk_alloc(), so they need not be done here.
2166  */
2167 static int tcp_v4_init_sock(struct sock *sk)
2168 {
2169         struct inet_connection_sock *icsk = inet_csk(sk);
2170
2171         tcp_init_sock(sk);
2172
2173         icsk->icsk_af_ops = &ipv4_specific;
2174
2175 #ifdef CONFIG_TCP_MD5SIG
2176         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2177 #endif
2178
2179         return 0;
2180 }
2181
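/* Release the TCP-specific resources of a socket being destroyed: timers,
 * congestion control state, queued skbs, MD5 keys and the bound port.
 */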
2182 void tcp_v4_destroy_sock(struct sock *sk)
2183 {
2184         struct tcp_sock *tp = tcp_sk(sk);
2185
2186         tcp_clear_xmit_timers(sk);
2187
2188         tcp_cleanup_congestion_control(sk);
2189
2190         /* Clean up the write buffer. */
2191         tcp_write_queue_purge(sk);
2192
2193         /* Cleans up our, hopefully empty, out_of_order_queue. */
2194         __skb_queue_purge(&tp->out_of_order_queue);
2195
2196 #ifdef CONFIG_TCP_MD5SIG
2197         /* Clean up the MD5 key list, if any */
2198         if (tp->md5sig_info) {
2199                 tcp_clear_md5_list(sk);
2200                 kfree_rcu(tp->md5sig_info, rcu);
2201                 tp->md5sig_info = NULL;
2202         }
2203 #endif
2204
2205 #ifdef CONFIG_NET_DMA
2206         /* Cleans up our sk_async_wait_queue */
2207         __skb_queue_purge(&sk->sk_async_wait_queue);
2208 #endif
2209
2210         /* Clean the prequeue; it really must be empty. */
2211         __skb_queue_purge(&tp->ucopy.prequeue);
2212
2213         /* Clean up a referenced TCP bind bucket. */
2214         if (inet_csk(sk)->icsk_bind_hash)
2215                 inet_put_port(sk);
2216
2217         BUG_ON(tp->fastopen_rsk != NULL);
2218
2219         /* If socket is aborted during connect operation */
2220         tcp_free_fastopen_req(tp);
2221
2222         sk_sockets_allocated_dec(sk);
2223         sock_release_memcg(sk);
2224 }
2225 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2226
2227 #ifdef CONFIG_PROC_FS
2228 /* Proc filesystem TCP sock list dumping. */
2229
2230 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2231 {
2232         return hlist_nulls_empty(head) ? NULL :
2233                 list_entry(head->first, struct inet_timewait_sock, tw_node);
2234 }
2235
2236 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2237 {
2238         return !is_a_nulls(tw->tw_node.next) ?
2239                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2240 }
2241
2242 /*
2243  * Get the next listener socket following cur.  If cur is NULL, get the first
2244  * socket starting from the bucket given in st->bucket; when st->bucket is
2245  * zero the very first socket in the hash table is returned.
2246  */
2247 static void *listening_get_next(struct seq_file *seq, void *cur)
2248 {
2249         struct inet_connection_sock *icsk;
2250         struct hlist_nulls_node *node;
2251         struct sock *sk = cur;
2252         struct inet_listen_hashbucket *ilb;
2253         struct tcp_iter_state *st = seq->private;
2254         struct net *net = seq_file_net(seq);
2255
2256         if (!sk) {
2257                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2258                 spin_lock_bh(&ilb->lock);
2259                 sk = sk_nulls_head(&ilb->head);
2260                 st->offset = 0;
2261                 goto get_sk;
2262         }
2263         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2264         ++st->num;
2265         ++st->offset;
2266
2267         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2268                 struct request_sock *req = cur;
2269
2270                 icsk = inet_csk(st->syn_wait_sk);
2271                 req = req->dl_next;
2272                 while (1) {
2273                         while (req) {
2274                                 if (req->rsk_ops->family == st->family) {
2275                                         cur = req;
2276                                         goto out;
2277                                 }
2278                                 req = req->dl_next;
2279                         }
2280                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2281                                 break;
2282 get_req:
2283                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2284                 }
2285                 sk        = sk_nulls_next(st->syn_wait_sk);
2286                 st->state = TCP_SEQ_STATE_LISTENING;
2287                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2288         } else {
2289                 icsk = inet_csk(sk);
2290                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2291                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2292                         goto start_req;
2293                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2294                 sk = sk_nulls_next(sk);
2295         }
2296 get_sk:
2297         sk_nulls_for_each_from(sk, node) {
2298                 if (!net_eq(sock_net(sk), net))
2299                         continue;
2300                 if (sk->sk_family == st->family) {
2301                         cur = sk;
2302                         goto out;
2303                 }
2304                 icsk = inet_csk(sk);
2305                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2306                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2307 start_req:
2308                         st->uid         = sock_i_uid(sk);
2309                         st->syn_wait_sk = sk;
2310                         st->state       = TCP_SEQ_STATE_OPENREQ;
2311                         st->sbucket     = 0;
2312                         goto get_req;
2313                 }
2314                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2315         }
2316         spin_unlock_bh(&ilb->lock);
2317         st->offset = 0;
2318         if (++st->bucket < INET_LHTABLE_SIZE) {
2319                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2320                 spin_lock_bh(&ilb->lock);
2321                 sk = sk_nulls_head(&ilb->head);
2322                 goto get_sk;
2323         }
2324         cur = NULL;
2325 out:
2326         return cur;
2327 }
2328
2329 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2330 {
2331         struct tcp_iter_state *st = seq->private;
2332         void *rc;
2333
2334         st->bucket = 0;
2335         st->offset = 0;
2336         rc = listening_get_next(seq, NULL);
2337
2338         while (rc && *pos) {
2339                 rc = listening_get_next(seq, rc);
2340                 --*pos;
2341         }
2342         return rc;
2343 }
2344
2345 static inline bool empty_bucket(struct tcp_iter_state *st)
2346 {
2347         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2348                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2349 }
2350
2351 /*
2352  * Get the first established socket starting from the bucket given in
2353  * st->bucket; if st->bucket is zero, the very first socket is returned.
2354  */
2355 static void *established_get_first(struct seq_file *seq)
2356 {
2357         struct tcp_iter_state *st = seq->private;
2358         struct net *net = seq_file_net(seq);
2359         void *rc = NULL;
2360
2361         st->offset = 0;
2362         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2363                 struct sock *sk;
2364                 struct hlist_nulls_node *node;
2365                 struct inet_timewait_sock *tw;
2366                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2367
2368                 /* Lockless fast path for the common case of empty buckets */
2369                 if (empty_bucket(st))
2370                         continue;
2371
2372                 spin_lock_bh(lock);
2373                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2374                         if (sk->sk_family != st->family ||
2375                             !net_eq(sock_net(sk), net)) {
2376                                 continue;
2377                         }
2378                         rc = sk;
2379                         goto out;
2380                 }
2381                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2382                 inet_twsk_for_each(tw, node,
2383                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2384                         if (tw->tw_family != st->family ||
2385                             !net_eq(twsk_net(tw), net)) {
2386                                 continue;
2387                         }
2388                         rc = tw;
2389                         goto out;
2390                 }
2391                 spin_unlock_bh(lock);
2392                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2393         }
2394 out:
2395         return rc;
2396 }
2397
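/* Advance the iterator to the next established or TIME_WAIT socket,
 * moving on to the next non-empty ehash bucket once the current chain is
 * exhausted.
 */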
2398 static void *established_get_next(struct seq_file *seq, void *cur)
2399 {
2400         struct sock *sk = cur;
2401         struct inet_timewait_sock *tw;
2402         struct hlist_nulls_node *node;
2403         struct tcp_iter_state *st = seq->private;
2404         struct net *net = seq_file_net(seq);
2405
2406         ++st->num;
2407         ++st->offset;
2408
2409         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2410                 tw = cur;
2411                 tw = tw_next(tw);
2412 get_tw:
2413                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2414                         tw = tw_next(tw);
2415                 }
2416                 if (tw) {
2417                         cur = tw;
2418                         goto out;
2419                 }
2420                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2421                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2422
2423                 /* Look for the next non-empty bucket */
2424                 st->offset = 0;
2425                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2426                                 empty_bucket(st))
2427                         ;
2428                 if (st->bucket > tcp_hashinfo.ehash_mask)
2429                         return NULL;
2430
2431                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2432                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2433         } else
2434                 sk = sk_nulls_next(sk);
2435
2436         sk_nulls_for_each_from(sk, node) {
2437                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2438                         goto found;
2439         }
2440
2441         st->state = TCP_SEQ_STATE_TIME_WAIT;
2442         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2443         goto get_tw;
2444 found:
2445         cur = sk;
2446 out:
2447         return cur;
2448 }
2449
2450 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2451 {
2452         struct tcp_iter_state *st = seq->private;
2453         void *rc;
2454
2455         st->bucket = 0;
2456         rc = established_get_first(seq);
2457
2458         while (rc && pos) {
2459                 rc = established_get_next(seq, rc);
2460                 --pos;
2461         }
2462         return rc;
2463 }
2464
2465 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2466 {
2467         void *rc;
2468         struct tcp_iter_state *st = seq->private;
2469
2470         st->state = TCP_SEQ_STATE_LISTENING;
2471         rc        = listening_get_idx(seq, &pos);
2472
2473         if (!rc) {
2474                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2475                 rc        = established_get_idx(seq, pos);
2476         }
2477
2478         return rc;
2479 }
2480
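/* Resume iteration at the bucket/offset remembered from the previous read
 * so that sequential reads of the seq_file do not rescan the hash tables
 * from the beginning.
 */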
2481 static void *tcp_seek_last_pos(struct seq_file *seq)
2482 {
2483         struct tcp_iter_state *st = seq->private;
2484         int offset = st->offset;
2485         int orig_num = st->num;
2486         void *rc = NULL;
2487
2488         switch (st->state) {
2489         case TCP_SEQ_STATE_OPENREQ:
2490         case TCP_SEQ_STATE_LISTENING:
2491                 if (st->bucket >= INET_LHTABLE_SIZE)
2492                         break;
2493                 st->state = TCP_SEQ_STATE_LISTENING;
2494                 rc = listening_get_next(seq, NULL);
2495                 while (offset-- && rc)
2496                         rc = listening_get_next(seq, rc);
2497                 if (rc)
2498                         break;
2499                 st->bucket = 0;
2500                 /* Fallthrough */
2501         case TCP_SEQ_STATE_ESTABLISHED:
2502         case TCP_SEQ_STATE_TIME_WAIT:
2503                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2504                 if (st->bucket > tcp_hashinfo.ehash_mask)
2505                         break;
2506                 rc = established_get_first(seq);
2507                 while (offset-- && rc)
2508                         rc = established_get_next(seq, rc);
2509         }
2510
2511         st->num = orig_num;
2512
2513         return rc;
2514 }
2515
2516 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2517 {
2518         struct tcp_iter_state *st = seq->private;
2519         void *rc;
2520
2521         if (*pos && *pos == st->last_pos) {
2522                 rc = tcp_seek_last_pos(seq);
2523                 if (rc)
2524                         goto out;
2525         }
2526
2527         st->state = TCP_SEQ_STATE_LISTENING;
2528         st->num = 0;
2529         st->bucket = 0;
2530         st->offset = 0;
2531         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2532
2533 out:
2534         st->last_pos = *pos;
2535         return rc;
2536 }
2537
2538 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2539 {
2540         struct tcp_iter_state *st = seq->private;
2541         void *rc = NULL;
2542
2543         if (v == SEQ_START_TOKEN) {
2544                 rc = tcp_get_idx(seq, 0);
2545                 goto out;
2546         }
2547
2548         switch (st->state) {
2549         case TCP_SEQ_STATE_OPENREQ:
2550         case TCP_SEQ_STATE_LISTENING:
2551                 rc = listening_get_next(seq, v);
2552                 if (!rc) {
2553                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2554                         st->bucket = 0;
2555                         st->offset = 0;
2556                         rc        = established_get_first(seq);
2557                 }
2558                 break;
2559         case TCP_SEQ_STATE_ESTABLISHED:
2560         case TCP_SEQ_STATE_TIME_WAIT:
2561                 rc = established_get_next(seq, v);
2562                 break;
2563         }
2564 out:
2565         ++*pos;
2566         st->last_pos = *pos;
2567         return rc;
2568 }
2569
2570 static void tcp_seq_stop(struct seq_file *seq, void *v)
2571 {
2572         struct tcp_iter_state *st = seq->private;
2573
2574         switch (st->state) {
2575         case TCP_SEQ_STATE_OPENREQ:
2576                 if (v) {
2577                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2578                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2579                 }
2580         case TCP_SEQ_STATE_LISTENING:
2581                 if (v != SEQ_START_TOKEN)
2582                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2583                 break;
2584         case TCP_SEQ_STATE_TIME_WAIT:
2585         case TCP_SEQ_STATE_ESTABLISHED:
2586                 if (v)
2587                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2588                 break;
2589         }
2590 }
2591
2592 int tcp_seq_open(struct inode *inode, struct file *file)
2593 {
2594         struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2595         struct tcp_iter_state *s;
2596         int err;
2597
2598         err = seq_open_net(inode, file, &afinfo->seq_ops,
2599                           sizeof(struct tcp_iter_state));
2600         if (err < 0)
2601                 return err;
2602
2603         s = ((struct seq_file *)file->private_data)->private;
2604         s->family               = afinfo->family;
2605         s->last_pos             = 0;
2606         return 0;
2607 }
2608 EXPORT_SYMBOL(tcp_seq_open);
2609
2610 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2611 {
2612         int rc = 0;
2613         struct proc_dir_entry *p;
2614
2615         afinfo->seq_ops.start           = tcp_seq_start;
2616         afinfo->seq_ops.next            = tcp_seq_next;
2617         afinfo->seq_ops.stop            = tcp_seq_stop;
2618
2619         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2620                              afinfo->seq_fops, afinfo);
2621         if (!p)
2622                 rc = -ENOMEM;
2623         return rc;
2624 }
2625 EXPORT_SYMBOL(tcp_proc_register);
2626
2627 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2628 {
2629         remove_proc_entry(afinfo->name, net->proc_net);
2630 }
2631 EXPORT_SYMBOL(tcp_proc_unregister);
2632
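/* Format one /proc/net/tcp line for a pending connection request. */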
2633 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2634                          struct seq_file *f, int i, kuid_t uid, int *len)
2635 {
2636         const struct inet_request_sock *ireq = inet_rsk(req);
2637         long delta = req->expires - jiffies;
2638
2639         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2640                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2641                 i,
2642                 ireq->loc_addr,
2643                 ntohs(inet_sk(sk)->inet_sport),
2644                 ireq->rmt_addr,
2645                 ntohs(ireq->rmt_port),
2646                 TCP_SYN_RECV,
2647                 0, 0, /* could print option size, but that is af dependent. */
2648                 1,    /* timers active (only the expire timer) */
2649                 jiffies_delta_to_clock_t(delta),
2650                 req->num_timeout,
2651                 from_kuid_munged(seq_user_ns(f), uid),
2652                 0,  /* non standard timer */
2653                 0, /* open_requests have no inode */
2654                 atomic_read(&sk->sk_refcnt),
2655                 req,
2656                 len);
2657 }
2658
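/* Format one /proc/net/tcp line for a full (non-timewait) socket. */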
2659 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2660 {
2661         int timer_active;
2662         unsigned long timer_expires;
2663         const struct tcp_sock *tp = tcp_sk(sk);
2664         const struct inet_connection_sock *icsk = inet_csk(sk);
2665         const struct inet_sock *inet = inet_sk(sk);
2666         struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2667         __be32 dest = inet->inet_daddr;
2668         __be32 src = inet->inet_rcv_saddr;
2669         __u16 destp = ntohs(inet->inet_dport);
2670         __u16 srcp = ntohs(inet->inet_sport);
2671         int rx_queue;
2672         char cmdline[128] = {'\0'};
2673
2674         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2675             icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2676             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2677                 timer_active    = 1;
2678                 timer_expires   = icsk->icsk_timeout;
2679         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2680                 timer_active    = 4;
2681                 timer_expires   = icsk->icsk_timeout;
2682         } else if (timer_pending(&sk->sk_timer)) {
2683                 timer_active    = 2;
2684                 timer_expires   = sk->sk_timer.expires;
2685         } else {
2686                 timer_active    = 0;
2687                 timer_expires = jiffies;
2688         }
2689
2690         if (sk->sk_state == TCP_LISTEN)
2691                 rx_queue = sk->sk_ack_backlog;
2692         else
2693                 /*
2694                  * Because we don't lock the socket, we might find a transient negative value.
2695                  */
2696                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2697
2698         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2699                 "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d %s%n",
2700                 i, src, srcp, dest, destp, sk->sk_state,
2701                 tp->write_seq - tp->snd_una,
2702                 rx_queue,
2703                 timer_active,
2704                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2705                 icsk->icsk_retransmits,
2706                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2707                 icsk->icsk_probes_out,
2708                 sock_i_ino(sk),
2709                 atomic_read(&sk->sk_refcnt), sk,
2710                 jiffies_to_clock_t(icsk->icsk_rto),
2711                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2712                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2713                 tp->snd_cwnd,
2714                 sk->sk_state == TCP_LISTEN ?
2715                     (fastopenq ? fastopenq->max_qlen : 0) :
2716                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2717                 sk_get_waiting_task_cmdline(sk, cmdline),
2718                 len);
2719 }
2720
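/* Format one /proc/net/tcp line for a TIME_WAIT socket. */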
2721 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2722                                struct seq_file *f, int i, int *len)
2723 {
2724         __be32 dest, src;
2725         __u16 destp, srcp;
2726         long delta = tw->tw_ttd - jiffies;
2727
2728         dest  = tw->tw_daddr;
2729         src   = tw->tw_rcv_saddr;
2730         destp = ntohs(tw->tw_dport);
2731         srcp  = ntohs(tw->tw_sport);
2732
2733         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2734                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2735                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2736                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2737                 atomic_read(&tw->tw_refcnt), tw, len);
2738 }
2739
2740 #define TMPSZ 150
2741
2742 static int tcp4_seq_show(struct seq_file *seq, void *v)
2743 {
2744         struct tcp_iter_state *st;
2745         int len;
2746
2747         if (v == SEQ_START_TOKEN) {
2748                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2749                         "  sl  local_address rem_address   st tx_queue "
2750                         "rx_queue tr tm->when retrnsmt   uid  timeout "
2751                         "inode "
2752                         "cmdline");
2753                 goto out;
2754         }
2755         st = seq->private;
2756
2757         switch (st->state) {
2758         case TCP_SEQ_STATE_LISTENING:
2759         case TCP_SEQ_STATE_ESTABLISHED:
2760                 get_tcp4_sock(v, seq, st->num, &len);
2761                 break;
2762         case TCP_SEQ_STATE_OPENREQ:
2763                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2764                 break;
2765         case TCP_SEQ_STATE_TIME_WAIT:
2766                 get_timewait4_sock(v, seq, st->num, &len);
2767                 break;
2768         }
2769         seq_printf(seq, "\n");
2770 out:
2771         return 0;
2772 }
2773
2774 static const struct file_operations tcp_afinfo_seq_fops = {
2775         .owner   = THIS_MODULE,
2776         .open    = tcp_seq_open,
2777         .read    = seq_read,
2778         .llseek  = seq_lseek,
2779         .release = seq_release_net
2780 };
2781
2782 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2783         .name           = "tcp",
2784         .family         = AF_INET,
2785         .seq_fops       = &tcp_afinfo_seq_fops,
2786         .seq_ops        = {
2787                 .show           = tcp4_seq_show,
2788         },
2789 };
2790
2791 static int __net_init tcp4_proc_init_net(struct net *net)
2792 {
2793         return tcp_proc_register(net, &tcp4_seq_afinfo);
2794 }
2795
2796 static void __net_exit tcp4_proc_exit_net(struct net *net)
2797 {
2798         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2799 }
2800
2801 static struct pernet_operations tcp4_net_ops = {
2802         .init = tcp4_proc_init_net,
2803         .exit = tcp4_proc_exit_net,
2804 };
2805
2806 int __init tcp4_proc_init(void)
2807 {
2808         return register_pernet_subsys(&tcp4_net_ops);
2809 }
2810
2811 void tcp4_proc_exit(void)
2812 {
2813         unregister_pernet_subsys(&tcp4_net_ops);
2814 }
2815 #endif /* CONFIG_PROC_FS */
2816
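/* GRO receive handler for TCP/IPv4: verify the checksum (computing it for
 * CHECKSUM_NONE packets) before handing the segment to the generic
 * tcp_gro_receive() aggregation code.
 */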
2817 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2818 {
2819         const struct iphdr *iph = skb_gro_network_header(skb);
2820         __wsum wsum;
2821         __sum16 sum;
2822
2823         switch (skb->ip_summed) {
2824         case CHECKSUM_COMPLETE:
2825                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2826                                   skb->csum)) {
2827                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2828                         break;
2829                 }
2830 flush:
2831                 NAPI_GRO_CB(skb)->flush = 1;
2832                 return NULL;
2833
2834         case CHECKSUM_NONE:
2835                 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2836                                           skb_gro_len(skb), IPPROTO_TCP, 0);
2837                 sum = csum_fold(skb_checksum(skb,
2838                                              skb_gro_offset(skb),
2839                                              skb_gro_len(skb),
2840                                              wsum));
2841                 if (sum)
2842                         goto flush;
2843
2844                 skb->ip_summed = CHECKSUM_UNNECESSARY;
2845                 break;
2846         }
2847
2848         return tcp_gro_receive(head, skb);
2849 }
2850
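/* GRO completion for TCP/IPv4: seed the pseudo-header checksum of the
 * merged packet and mark it as TCPv4 GSO before passing it up the stack.
 */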
2851 int tcp4_gro_complete(struct sk_buff *skb)
2852 {
2853         const struct iphdr *iph = ip_hdr(skb);
2854         struct tcphdr *th = tcp_hdr(skb);
2855
2856         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2857                                   iph->saddr, iph->daddr, 0);
2858         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2859
2860         return tcp_gro_complete(skb);
2861 }
2862
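/*
 * tcp_prot plugs IPv4 TCP into the generic socket layer: each callback
 * maps an AF_INET SOCK_STREAM socket operation (connect, sendmsg,
 * setsockopt, ...) onto its tcp_* or inet_csk_* handler, while the
 * memory-pressure and hash-table fields tie the protocol to its shared
 * global state.
 */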
2863 struct proto tcp_prot = {
2864         .name                   = "TCP",
2865         .owner                  = THIS_MODULE,
2866         .close                  = tcp_close,
2867         .connect                = tcp_v4_connect,
2868         .disconnect             = tcp_disconnect,
2869         .accept                 = inet_csk_accept,
2870         .ioctl                  = tcp_ioctl,
2871         .init                   = tcp_v4_init_sock,
2872         .destroy                = tcp_v4_destroy_sock,
2873         .shutdown               = tcp_shutdown,
2874         .setsockopt             = tcp_setsockopt,
2875         .getsockopt             = tcp_getsockopt,
2876         .recvmsg                = tcp_recvmsg,
2877         .sendmsg                = tcp_sendmsg,
2878         .sendpage               = tcp_sendpage,
2879         .backlog_rcv            = tcp_v4_do_rcv,
2880         .release_cb             = tcp_release_cb,
2881         .hash                   = inet_hash,
2882         .unhash                 = inet_unhash,
2883         .get_port               = inet_csk_get_port,
2884         .enter_memory_pressure  = tcp_enter_memory_pressure,
2885         .sockets_allocated      = &tcp_sockets_allocated,
2886         .orphan_count           = &tcp_orphan_count,
2887         .memory_allocated       = &tcp_memory_allocated,
2888         .memory_pressure        = &tcp_memory_pressure,
2889         .sysctl_wmem            = sysctl_tcp_wmem,
2890         .sysctl_rmem            = sysctl_tcp_rmem,
2891         .max_header             = MAX_TCP_HEADER,
2892         .obj_size               = sizeof(struct tcp_sock),
2893         .slab_flags             = SLAB_DESTROY_BY_RCU,
2894         .twsk_prot              = &tcp_timewait_sock_ops,
2895         .rsk_prot               = &tcp_request_sock_ops,
2896         .h.hashinfo             = &tcp_hashinfo,
2897         .no_autobind            = true,
2898 #ifdef CONFIG_COMPAT
2899         .compat_setsockopt      = compat_tcp_setsockopt,
2900         .compat_getsockopt      = compat_tcp_getsockopt,
2901 #endif
2902 #ifdef CONFIG_MEMCG_KMEM
2903         .init_cgroup            = tcp_init_cgroup,
2904         .destroy_cgroup         = tcp_destroy_cgroup,
2905         .proto_cgroup           = tcp_proto_cgroup,
2906 #endif
2907 };
2908 EXPORT_SYMBOL(tcp_prot);
2909
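/*
 * Per-netns setup: tcp_sk_init() allocates one raw control socket per
 * possible CPU (used to transmit RSTs and ACKs that are not tied to a
 * full socket) and sets the ECN sysctl default (2: ECN enabled only
 * when requested by incoming connections); tcp_sk_exit() releases those
 * sockets again.
 */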
2910 static void __net_exit tcp_sk_exit(struct net *net)
2911 {
2912         int cpu;
2913
2914         for_each_possible_cpu(cpu)
2915                 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2916         free_percpu(net->ipv4.tcp_sk);
2917 }
2918
2919 static int __net_init tcp_sk_init(struct net *net)
2920 {
2921         int res, cpu;
2922
2923         net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2924         if (!net->ipv4.tcp_sk)
2925                 return -ENOMEM;
2926
2927         for_each_possible_cpu(cpu) {
2928                 struct sock *sk;
2929
2930                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2931                                            IPPROTO_TCP, net);
2932                 if (res)
2933                         goto fail;
2934                 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2935         }
2936         net->ipv4.sysctl_tcp_ecn = 2;
2937         return 0;
2938
2939 fail:
2940         tcp_sk_exit(net);
2941
2942         return res;
2943 }
2944
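/*
 * Batched netns teardown: purge any TIME_WAIT sockets still sitting in
 * tcp_death_row for the namespaces being dismantled, so they do not
 * outlive their netns.
 */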
2945 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2946 {
2947         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2948 }
2949
2950 static struct pernet_operations __net_initdata tcp_sk_ops = {
2951         .init       = tcp_sk_init,
2952         .exit       = tcp_sk_exit,
2953         .exit_batch = tcp_sk_exit_batch,
2954 };
2955
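/*
 * Boot-time initialisation: set up the global TCP socket hash-table
 * state and register the per-netns operations.  TCP is not optional,
 * so any failure here is fatal.
 */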
2956 void __init tcp_v4_init(void)
2957 {
2958         inet_hashinfo_init(&tcp_hashinfo);
2959         if (register_pernet_subsys(&tcp_sk_ops))
2960                 panic("Failed to create the TCP control socket.\n");
2961 }