[INET]: Move bind_hash from tcp_sk to inet_sk
[linux-2.6.git] / net / ipv4 / tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:     $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
9  *
10  *              IPv4 specific functions
11  *
12  *
13  *              code split from:
14  *              linux/ipv4/tcp.c
15  *              linux/ipv4/tcp_input.c
16  *              linux/ipv4/tcp_output.c
17  *
18  *              See tcp.c for author information
19  *
20  *      This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25
26 /*
27  * Changes:
28  *              David S. Miller :       New socket lookup architecture.
29  *                                      This code is dedicated to John Dyson.
30  *              David S. Miller :       Change semantics of established hash,
31  *                                      half is devoted to TIME_WAIT sockets
32  *                                      and the rest go in the other half.
33  *              Andi Kleen :            Add support for syncookies and fixed
34  *                                      some bugs: ip options weren't passed to
35  *                                      the TCP layer, missed a check for an
36  *                                      ACK bit.
37  *              Andi Kleen :            Implemented fast path mtu discovery.
38  *                                      Fixed many serious bugs in the
39  *                                      request_sock handling and moved
40  *                                      most of it into the af independent code.
41  *                                      Added tail drop and some other bugfixes.
42  *                                      Added new listen semantics.
43  *              Mike McLagan    :       Routing by source
44  *      Juan Jose Ciarlante:            ip_dynaddr bits
45  *              Andi Kleen:             various fixes.
46  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
47  *                                      coma.
48  *      Andi Kleen              :       Fix new listen.
49  *      Andi Kleen              :       Fix accept error reporting.
50  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
51  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
52  *                                      a single port at the same time.
53  */
54
55 #include <linux/config.h>
56
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
65
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/ipv6.h>
70 #include <net/inet_common.h>
71 #include <net/xfrm.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78
79 extern int sysctl_ip_dynaddr;
    /* Consulted by __tcp_v4_check_established(): when the caller asked to be
     * told about a matching TIME-WAIT bucket, that bucket may only be taken
     * over if this flag is non-zero and the bucket's last timestamp is more
     * than one second old.
     */
80 int sysctl_tcp_tw_reuse;
    /* NOTE(review): no reader of this flag appears in this part of the file. */
81 int sysctl_tcp_low_latency;
82
83 /* Check TCP sequence numbers in ICMP packets. */
    /* NOTE(review): presumably the number of TCP header bytes an ICMP error
     * must quote for the sequence number to be checkable - confirm at the
     * use site, which is outside this part of the file.
     */
84 #define ICMP_MIN_LENGTH 8
85
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
88
    /* Forward declaration; the definition is not in this part of the file. */
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
90                        struct sk_buff *skb);
91
    /* TCP's socket hash state.  Only the members that need a static
     * initialiser are set here: the listening-hash rwlock, its user count
     * and wait queue (used by tcp_listen_wlock()), and the spinlock named
     * for port allocation.  Every other member is left zero-initialised.
     */
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93         .lhash_lock     = RW_LOCK_UNLOCKED,
94         .lhash_users    = ATOMIC_INIT(0),
95         .lhash_wait     = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96         .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 };
98
99 /*
100  * This array holds the first and last local port number.
101  * For high-usage systems, use sysctl to change this to
102  * 32768-61000
     *
     * Index 0 is read as the low bound and index 1 as the high bound by
     * tcp_v4_get_port() and tcp_v4_hash_connect().
103  */
104 int sysctl_local_port_range[2] = { 1024, 4999 };
    /* Port at which the most recent automatic search in tcp_v4_get_port()
     * stopped.  It is read and written back there while tcp_portalloc_lock
     * is held.
     */
105 int tcp_port_rover = 1024 - 1;
106
107 /* Caller must disable local BH processing. */
108 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
109 {
110         struct inet_bind_hashbucket *head =
111                                 &tcp_bhash[inet_bhashfn(inet_sk(child)->num,
112                                                         tcp_bhash_size)];
113         struct inet_bind_bucket *tb;
114
115         spin_lock(&head->lock);
116         tb = inet_sk(sk)->bind_hash;
117         sk_add_bind_node(child, &tb->owners);
118         inet_sk(child)->bind_hash = tb;
119         spin_unlock(&head->lock);
120 }
121
/*
 * BH-safe wrapper: runs __tcp_inherit_port() with local bottom halves
 * disabled, then re-enables them.
 */
inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
        local_bh_disable();
        __tcp_inherit_port(sk, child);
        local_bh_enable();
}
128
129 void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
130                    const unsigned short snum)
131 {
132         struct inet_sock *inet = inet_sk(sk);
133         inet->num       = snum;
134         sk_add_bind_node(sk, &tb->owners);
135         inet->bind_hash = tb;
136 }
137
138 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
139 {
140         const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
141         struct sock *sk2;
142         struct hlist_node *node;
143         int reuse = sk->sk_reuse;
144
145         sk_for_each_bound(sk2, node, &tb->owners) {
146                 if (sk != sk2 &&
147                     !tcp_v6_ipv6only(sk2) &&
148                     (!sk->sk_bound_dev_if ||
149                      !sk2->sk_bound_dev_if ||
150                      sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
151                         if (!reuse || !sk2->sk_reuse ||
152                             sk2->sk_state == TCP_LISTEN) {
153                                 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
154                                 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
155                                     sk2_rcv_saddr == sk_rcv_saddr)
156                                         break;
157                         }
158                 }
159         }
160         return node != NULL;
161 }
162
163 /* Obtain a reference to a local port for the given sock,
164  * if snum is zero it means select any available local port.
     *
     * snum == 0: with tcp_portalloc_lock held, walk the ports of
     * sysctl_local_port_range[] starting just after the saved rover (or
     * just after the low bound when the rover lies below the range) and
     * take the first port that has no bind bucket at all.  The search gives
     * up after as many probes as there are ports in the range.
     *
     * snum != 0: look up the bucket for that port.  A bucket that already
     * has owners is accepted outright when sk->sk_reuse > 1, or when
     * tb->fastreuse > 0 and sk has sk_reuse set and is not listening;
     * otherwise tcp_bind_conflict() decides.
     *
     * A missing bucket is created, tb->fastreuse is updated, and sk is
     * linked to the bucket via tcp_bind_hash() unless it already has a
     * bind bucket.
     *
     * Local BHs are disabled for the whole body.
     *
     * Returns 0 on success and 1 on failure (range exhausted, bind
     * conflict, or inet_bind_bucket_create() returned NULL).
165  */
166 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
167 {
168         struct inet_bind_hashbucket *head;
169         struct hlist_node *node;
170         struct inet_bind_bucket *tb;
171         int ret;
172
173         local_bh_disable();
174         if (!snum) {
175                 int low = sysctl_local_port_range[0];
176                 int high = sysctl_local_port_range[1];
177                 int remaining = (high - low) + 1;
178                 int rover;
179
180                 spin_lock(&tcp_portalloc_lock);
181                 if (tcp_port_rover < low)
182                         rover = low;
183                 else
184                         rover = tcp_port_rover;
185                 do {
                            /* Advance first, wrapping from high back to low. */
186                         rover++;
187                         if (rover > high)
188                                 rover = low;
189                         head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)];
190                         spin_lock(&head->lock);
191                         inet_bind_bucket_for_each(tb, node, &head->chain)
192                                 if (tb->port == rover)
193                                         goto next;
                            /* No bucket for this port: leave the loop with
                             * head->lock still held.
                             */
194                         break;
195                 next:
196                         spin_unlock(&head->lock);
197                 } while (--remaining > 0);
198                 tcp_port_rover = rover;
199                 spin_unlock(&tcp_portalloc_lock);
200
201                 /* Exhausted local port range during search?  It is not
202                  * possible for us to be holding one of the bind hash
203                  * locks if this test triggers, because if 'remaining'
204                  * drops to zero, we broke out of the do/while loop at
205                  * the top level, not from the 'break;' statement.
206                  */
207                 ret = 1;
208                 if (unlikely(remaining <= 0))
209                         goto fail;
210
211                 /* OK, here is the one we will use.  HEAD is
212                  * non-NULL and we hold its lock.
213                  */
214                 snum = rover;
215         } else {
216                 head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
217                 spin_lock(&head->lock);
218                 inet_bind_bucket_for_each(tb, node, &head->chain)
219                         if (tb->port == snum)
220                                 goto tb_found;
221         }
            /* Reached from both branches when no bucket exists for snum;
             * head->lock is held.
             */
222         tb = NULL;
223         goto tb_not_found;
224 tb_found:
225         if (!hlist_empty(&tb->owners)) {
                    /* Both shortcuts below jump straight to 'success' and
                     * therefore skip the fastreuse update.
                     */
226                 if (sk->sk_reuse > 1)
227                         goto success;
228                 if (tb->fastreuse > 0 &&
229                     sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
230                         goto success;
231                 } else {
232                         ret = 1;
233                         if (tcp_bind_conflict(sk, tb))
234                                 goto fail_unlock;
235                 }
236         }
237 tb_not_found:
238         ret = 1;
239         if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL)
240                 goto fail_unlock;
            /* fastreuse is set by the first owner (1 only if it has sk_reuse
             * and is not listening) and cleared by any later owner that does
             * not meet the same condition.
             */
241         if (hlist_empty(&tb->owners)) {
242                 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
243                         tb->fastreuse = 1;
244                 else
245                         tb->fastreuse = 0;
246         } else if (tb->fastreuse &&
247                    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
248                 tb->fastreuse = 0;
249 success:
250         if (!inet_sk(sk)->bind_hash)
251                 tcp_bind_hash(sk, tb, snum);
252         BUG_TRAP(inet_sk(sk)->bind_hash == tb);
253         ret = 0;
254
255 fail_unlock:
256         spin_unlock(&head->lock);
257 fail:
258         local_bh_enable();
259         return ret;
260 }
261
262 /* Get rid of any references to a local port held by the
263  * given sock.
264  */
265 static void __tcp_put_port(struct sock *sk)
266 {
267         struct inet_sock *inet = inet_sk(sk);
268         struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num,
269                                                                     tcp_bhash_size)];
270         struct inet_bind_bucket *tb;
271
272         spin_lock(&head->lock);
273         tb = inet->bind_hash;
274         __sk_del_bind_node(sk);
275         inet->bind_hash = NULL;
276         inet->num = 0;
277         inet_bind_bucket_destroy(tcp_bucket_cachep, tb);
278         spin_unlock(&head->lock);
279 }
280
/*
 * BH-safe wrapper: runs __tcp_put_port() with local bottom halves
 * disabled, then re-enables them.
 */
void tcp_put_port(struct sock *sk)
{
        local_bh_disable();
        __tcp_put_port(sk);
        local_bh_enable();
}
287
288 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
289  * Look, when several writers sleep and reader wakes them up, all but one
290  * immediately hit write lock and grab all the cpus. Exclusive sleep solves
291  * this, _but_ remember, it adds useless work on UP machines (wake up each
292  * exclusive lock release). It should be ifdefed really.
293  */
294
295 void tcp_listen_wlock(void)
296 {
297         write_lock(&tcp_lhash_lock);
298
299         if (atomic_read(&tcp_lhash_users)) {
300                 DEFINE_WAIT(wait);
301
302                 for (;;) {
303                         prepare_to_wait_exclusive(&tcp_lhash_wait,
304                                                 &wait, TASK_UNINTERRUPTIBLE);
305                         if (!atomic_read(&tcp_lhash_users))
306                                 break;
307                         write_unlock_bh(&tcp_lhash_lock);
308                         schedule();
309                         write_lock_bh(&tcp_lhash_lock);
310                 }
311
312                 finish_wait(&tcp_lhash_wait, &wait);
313         }
314 }
315
316 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
317 {
318         struct hlist_head *list;
319         rwlock_t *lock;
320
321         BUG_TRAP(sk_unhashed(sk));
322         if (listen_possible && sk->sk_state == TCP_LISTEN) {
323                 list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)];
324                 lock = &tcp_lhash_lock;
325                 tcp_listen_wlock();
326         } else {
327                 sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
328                 list = &tcp_ehash[sk->sk_hashent].chain;
329                 lock = &tcp_ehash[sk->sk_hashent].lock;
330                 write_lock(lock);
331         }
332         __sk_add_node(sk, list);
333         sock_prot_inc_use(sk->sk_prot);
334         write_unlock(lock);
335         if (listen_possible && sk->sk_state == TCP_LISTEN)
336                 wake_up(&tcp_lhash_wait);
337 }
338
339 static void tcp_v4_hash(struct sock *sk)
340 {
341         if (sk->sk_state != TCP_CLOSE) {
342                 local_bh_disable();
343                 __tcp_v4_hash(sk, 1);
344                 local_bh_enable();
345         }
346 }
347
348 void tcp_unhash(struct sock *sk)
349 {
350         rwlock_t *lock;
351
352         if (sk_unhashed(sk))
353                 goto ende;
354
355         if (sk->sk_state == TCP_LISTEN) {
356                 local_bh_disable();
357                 tcp_listen_wlock();
358                 lock = &tcp_lhash_lock;
359         } else {
360                 struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
361                 lock = &head->lock;
362                 write_lock_bh(&head->lock);
363         }
364
365         if (__sk_del_node_init(sk))
366                 sock_prot_dec_use(sk->sk_prot);
367         write_unlock_bh(lock);
368
369  ende:
370         if (sk->sk_state == TCP_LISTEN)
371                 wake_up(&tcp_lhash_wait);
372 }
373
374 /* Don't inline this cruft.  Here are some nice properties to
375  * exploit here.  The BSD API does not allow a listening TCP
376  * to specify the remote port nor the remote address for the
377  * connection.  So always assume those are both wildcarded
378  * during the search since they can never be otherwise.
379  */
380 static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
381                                              const u32 daddr,
382                                              const unsigned short hnum,
383                                              const int dif)
384 {
385         struct sock *result = NULL, *sk;
386         struct hlist_node *node;
387         int score, hiscore;
388
389         hiscore=-1;
390         sk_for_each(sk, node, head) {
391                 struct inet_sock *inet = inet_sk(sk);
392
393                 if (inet->num == hnum && !ipv6_only_sock(sk)) {
394                         __u32 rcv_saddr = inet->rcv_saddr;
395
396                         score = (sk->sk_family == PF_INET ? 1 : 0);
397                         if (rcv_saddr) {
398                                 if (rcv_saddr != daddr)
399                                         continue;
400                                 score+=2;
401                         }
402                         if (sk->sk_bound_dev_if) {
403                                 if (sk->sk_bound_dev_if != dif)
404                                         continue;
405                                 score+=2;
406                         }
407                         if (score == 5)
408                                 return sk;
409                         if (score > hiscore) {
410                                 hiscore = score;
411                                 result = sk;
412                         }
413                 }
414         }
415         return result;
416 }
417
418 /* Optimize the common listener case. */
419 static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
420                                                   const unsigned short hnum,
421                                                   const int dif)
422 {
423         struct sock *sk = NULL;
424         struct hlist_head *head;
425
426         read_lock(&tcp_lhash_lock);
427         head = &tcp_listening_hash[inet_lhashfn(hnum)];
428         if (!hlist_empty(head)) {
429                 struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
430
431                 if (inet->num == hnum && !sk->sk_node.next &&
432                     (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
433                     (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
434                     !sk->sk_bound_dev_if)
435                         goto sherry_cache;
436                 sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
437         }
438         if (sk) {
439 sherry_cache:
440                 sock_hold(sk);
441         }
442         read_unlock(&tcp_lhash_lock);
443         return sk;
444 }
445
446 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
447  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
448  *
449  * Local BH must be disabled here.
450  */
451
452 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
453                                                        const u16 sport,
454                                                        const u32 daddr,
455                                                        const u16 hnum,
456                                                        const int dif)
457 {
458         struct inet_ehash_bucket *head;
459         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
460         __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
461         struct sock *sk;
462         struct hlist_node *node;
463         /* Optimize here for direct hit, only listening connections can
464          * have wildcards anyways.
465          */
466         const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
467         head = &tcp_ehash[hash];
468         read_lock(&head->lock);
469         sk_for_each(sk, node, &head->chain) {
470                 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
471                         goto hit; /* You sunk my battleship! */
472         }
473
474         /* Must check for a TIME_WAIT'er before going to listener hash. */
475         sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
476                 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
477                         goto hit;
478         }
479         sk = NULL;
480 out:
481         read_unlock(&head->lock);
482         return sk;
483 hit:
484         sock_hold(sk);
485         goto out;
486 }
487
488 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
489                                            u32 daddr, u16 hnum, int dif)
490 {
491         struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
492                                                       daddr, hnum, dif);
493
494         return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
495 }
496
497 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
498                                   u16 dport, int dif)
499 {
500         struct sock *sk;
501
502         local_bh_disable();
503         sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
504         local_bh_enable();
505
506         return sk;
507 }
508
509 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
510
511 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
512 {
513         return secure_tcp_sequence_number(skb->nh.iph->daddr,
514                                           skb->nh.iph->saddr,
515                                           skb->h.th->dest,
516                                           skb->h.th->source);
517 }
518
519 /* called with local bh disabled */
    /*
     * Check that the connection identity sk would have with local port
     * 'lport' is not already present in the established hash, and if it is
     * free, insert sk there.
     *
     * The TIME-WAIT chain (tcp_ehash_size slots after the established one)
     * is searched first.  A matching TIME-WAIT bucket may be taken over when
     * it has a recorded timestamp and either twp is NULL, or
     * sysctl_tcp_tw_reuse is set and that timestamp is more than one second
     * old; sk then inherits a write_seq beyond the bucket's tw_snd_nxt and
     * the bucket's ts_recent state.  Any other match makes the identity
     * "not unique".
     *
     * On success inet->num, inet->sport and sk->sk_hashent are set and sk is
     * linked into the chain while the slot's write lock is held.  If twp is
     * non-NULL the taken-over bucket (or NULL) is stored in *twp for the
     * caller to dispose of; otherwise a taken-over bucket is descheduled and
     * released here.
     *
     * Returns 0 on success, -EADDRNOTAVAIL if the identity is in use.
     */
520 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
521                                       struct tcp_tw_bucket **twp)
522 {
523         struct inet_sock *inet = inet_sk(sk);
            /* Named as a lookup for an incoming packet would see them:
             * 'daddr' is our own address, 'saddr' is the peer's.
             */
524         u32 daddr = inet->rcv_saddr;
525         u32 saddr = inet->daddr;
526         int dif = sk->sk_bound_dev_if;
527         TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
528         __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
529         const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
530         struct inet_ehash_bucket *head = &tcp_ehash[hash];
531         struct sock *sk2;
532         struct hlist_node *node;
533         struct tcp_tw_bucket *tw;
534
535         write_lock(&head->lock);
536
537         /* Check TIME-WAIT sockets first. */
538         sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
539                 tw = (struct tcp_tw_bucket *)sk2;
540
541                 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
542                         struct tcp_sock *tp = tcp_sk(sk);
543
544                         /* With PAWS, it is safe from the viewpoint
545                            of data integrity. Even without PAWS it
546                            is safe provided sequence spaces do not
547                            overlap i.e. at data rates <= 80Mbit/sec.
548
549                            Actually, the idea is close to VJ's one,
550                            only timestamp cache is held not per host,
551                            but per port pair and TW bucket is used
552                            as state holder.
553
554                            If TW bucket has been already destroyed we
555                            fall back to VJ's scheme and use initial
556                            timestamp retrieved from peer table.
557                          */
558                         if (tw->tw_ts_recent_stamp &&
559                             (!twp || (sysctl_tcp_tw_reuse &&
560                                       xtime.tv_sec -
561                                       tw->tw_ts_recent_stamp > 1))) {
                                    /* Start past the old connection's send
                                     * space; a result of 0 is replaced by 1.
                                     */
562                                 if ((tp->write_seq =
563                                                 tw->tw_snd_nxt + 65535 + 2) == 0)
564                                         tp->write_seq = 1;
565                                 tp->rx_opt.ts_recent       = tw->tw_ts_recent;
566                                 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
                                    /* Reference for whoever disposes of tw
                                     * after the lock is dropped.
                                     */
567                                 sock_hold(sk2);
568                                 goto unique;
569                         } else
570                                 goto not_unique;
571                 }
572         }
            /* No TIME-WAIT match: nothing to take over. */
573         tw = NULL;
574
575         /* And established part... */
576         sk_for_each(sk2, node, &head->chain) {
577                 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
578                         goto not_unique;
579         }
580
581 unique:
582         /* Must record num and sport now. Otherwise we will see
583          * in hash table socket with a funny identity. */
584         inet->num = lport;
585         inet->sport = htons(lport);
586         sk->sk_hashent = hash;
587         BUG_TRAP(sk_unhashed(sk));
588         __sk_add_node(sk, &head->chain);
589         sock_prot_inc_use(sk->sk_prot);
590         write_unlock(&head->lock);
591
592         if (twp) {
                    /* NOTE(review): the recycled counter is bumped on this
                     * branch even when tw is NULL - confirm that is intended.
                     */
593                 *twp = tw;
594                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
595         } else if (tw) {
596                 /* Silly. Should hash-dance instead... */
597                 tcp_tw_deschedule(tw);
598                 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
599
600                 tcp_tw_put(tw);
601         }
602
603         return 0;
604
605 not_unique:
606         write_unlock(&head->lock);
607         return -EADDRNOTAVAIL;
608 }
609
610 static inline u32 connect_port_offset(const struct sock *sk)
611 {
612         const struct inet_sock *inet = inet_sk(sk);
613
614         return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
615                                          inet->dport);
616 }
617
618 /*
619  * Bind a port for a connect operation and hash it.
     *
     * Socket not yet bound (inet->num == 0): probe the ports
     * low + (i + offset) % range for i = 1..range, where range is
     * high - low, so the candidates are low .. high - 1.  'offset' is a
     * per-destination value from connect_port_offset() plus a static hint
     * that grows by i on every success.  A port with no bind bucket is taken
     * by creating a bucket marked fastreuse = -1.  A port whose bucket has
     * fastreuse >= 0 is skipped.  A port whose bucket has fastreuse < 0 is
     * usable if __tcp_v4_check_established() finds the resulting identity
     * unique.
     *
     * Socket already bound: if it is the only owner of its bucket it is
     * hashed directly, otherwise __tcp_v4_check_established() decides.
     *
     * Returns 0 on success, -EADDRNOTAVAIL when no port could be used
     * (including when inet_bind_bucket_create() fails), or the result of
     * __tcp_v4_check_established().
620  */
621 static inline int tcp_v4_hash_connect(struct sock *sk)
622 {
623         const unsigned short snum = inet_sk(sk)->num;
624         struct inet_bind_hashbucket *head;
625         struct inet_bind_bucket *tb;
626         int ret;
627
628         if (!snum) {
629                 int low = sysctl_local_port_range[0];
630                 int high = sysctl_local_port_range[1];
631                 int range = high - low;
632                 int i;
633                 int port;
634                 static u32 hint;
635                 u32 offset = hint + connect_port_offset(sk);
636                 struct hlist_node *node;
637                 struct tcp_tw_bucket *tw = NULL;
638
639                 local_bh_disable();
                    /* With range <= 0 the body never runs, so the modulo
                     * below is never evaluated with a zero divisor.
                     */
640                 for (i = 1; i <= range; i++) {
641                         port = low + (i + offset) % range;
642                         head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)];
643                         spin_lock(&head->lock);
644
645                         /* Does not bother with rcv_saddr checks,
646                          * because the established check is already
647                          * unique enough.
648                          */
649                         inet_bind_bucket_for_each(tb, node, &head->chain) {
650                                 if (tb->port == port) {
651                                         BUG_TRAP(!hlist_empty(&tb->owners));
                                            /* tcp_v4_get_port() sets fastreuse
                                             * to 0 or 1; only this function
                                             * sets -1.
                                             */
652                                         if (tb->fastreuse >= 0)
653                                                 goto next_port;
                                            /* On success sk has already been
                                             * hashed by the check itself.
                                             */
654                                         if (!__tcp_v4_check_established(sk,
655                                                                         port,
656                                                                         &tw))
657                                                 goto ok;
658                                         goto next_port;
659                                 }
660                         }
661
662                         tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port);
663                         if (!tb) {
664                                 spin_unlock(&head->lock);
665                                 break;
666                         }
667                         tb->fastreuse = -1;
668                         goto ok;
669
670                 next_port:
671                         spin_unlock(&head->lock);
672                 }
673                 local_bh_enable();
674
675                 return -EADDRNOTAVAIL;
676
677 ok:
678                 hint += i;
679
680                 /* Head lock still held and bh's disabled */
681                 tcp_bind_hash(sk, tb, port);
                    /* Still unhashed only when the port came from a freshly
                     * created bucket (no established check was run).
                     */
682                 if (sk_unhashed(sk)) {
683                         inet_sk(sk)->sport = htons(port);
684                         __tcp_v4_hash(sk, 0);
685                 }
686                 spin_unlock(&head->lock);
687
                    /* Dispose of a TIME-WAIT bucket handed back through &tw. */
688                 if (tw) {
689                         tcp_tw_deschedule(tw);
690                         tcp_tw_put(tw);
691                 }
692
693                 ret = 0;
                    /* 'out' sits inside the else branch below; it re-enables
                     * BHs and returns ret.
                     */
694                 goto out;
695         }
696
697         head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
698         tb  = inet_sk(sk)->bind_hash;
699         spin_lock_bh(&head->lock);
            /* sk is first on the owner list and nothing follows it. */
700         if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
701                 __tcp_v4_hash(sk, 0);
702                 spin_unlock_bh(&head->lock);
703                 return 0;
704         } else {
                    /* Plain unlock: BHs stay disabled until 'out'. */
705                 spin_unlock(&head->lock);
706                 /* No definite answer... Walk to established hash table */
707                 ret = __tcp_v4_check_established(sk, snum, NULL);
708 out:
709                 local_bh_enable();
710                 return ret;
711         }
712 }
713
714 /* This will initiate an outgoing connection. */
    /*
     * sk:       socket to connect
     * uaddr:    destination, interpreted as a struct sockaddr_in
     * addr_len: length of *uaddr
     *
     * Routes to the destination, fills in the socket's source and
     * destination identity, moves the socket to SYN-SENT, binds and hashes
     * it through tcp_v4_hash_connect(), chooses an initial sequence number
     * if none was inherited, and calls tcp_connect().
     *
     * Returns 0 on success.  Errors produced here: -EINVAL (addr_len too
     * short, or a zero destination with a source-route option),
     * -EAFNOSUPPORT (family is not AF_INET), -ENETUNREACH (route is
     * multicast or broadcast).  Errors from ip_route_connect(),
     * tcp_v4_hash_connect(), ip_route_newports() and tcp_connect() are
     * returned unchanged.
     */
715 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
716 {
717         struct inet_sock *inet = inet_sk(sk);
718         struct tcp_sock *tp = tcp_sk(sk);
719         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
720         struct rtable *rt;
721         u32 daddr, nexthop;
722         int tmp;
723         int err;
724
725         if (addr_len < sizeof(struct sockaddr_in))
726                 return -EINVAL;
727
728         if (usin->sin_family != AF_INET)
729                 return -EAFNOSUPPORT;
730
731         nexthop = daddr = usin->sin_addr.s_addr;
            /* With a source-route option, route towards the option's first
             * hop instead of the final destination.
             */
732         if (inet->opt && inet->opt->srr) {
733                 if (!daddr)
734                         return -EINVAL;
735                 nexthop = inet->opt->faddr;
736         }
737
738         tmp = ip_route_connect(&rt, nexthop, inet->saddr,
739                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
740                                IPPROTO_TCP,
741                                inet->sport, usin->sin_port, sk);
742         if (tmp < 0)
743                 return tmp;
744
745         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
746                 ip_rt_put(rt);
747                 return -ENETUNREACH;
748         }
749
            /* Without source routing, take the destination from the route. */
750         if (!inet->opt || !inet->opt->srr)
751                 daddr = rt->rt_dst;
752
            /* Unbound source address: use the one the route selected. */
753         if (!inet->saddr)
754                 inet->saddr = rt->rt_src;
755         inet->rcv_saddr = inet->saddr;
756
757         if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
758                 /* Reset inherited state */
759                 tp->rx_opt.ts_recent       = 0;
760                 tp->rx_opt.ts_recent_stamp = 0;
761                 tp->write_seq              = 0;
762         }
763
764         if (sysctl_tcp_tw_recycle &&
765             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
766                 struct inet_peer *peer = rt_get_peer(rt);
767
768                 /* VJ's idea. We save last timestamp seen from
769                  * the destination in peer table, when entering state TIME-WAIT
770                  * and initialize rx_opt.ts_recent from it, when trying new connection.
771                  */
772
                    /* Only use a peer entry no older than TCP_PAWS_MSL. */
773                 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
774                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
775                         tp->rx_opt.ts_recent = peer->tcp_ts;
776                 }
777         }
778
779         inet->dport = usin->sin_port;
780         inet->daddr = daddr;
781
782         tp->ext_header_len = 0;
783         if (inet->opt)
784                 tp->ext_header_len = inet->opt->optlen;
785
786         tp->rx_opt.mss_clamp = 536;
787
788         /* Socket identity is still unknown (sport may be zero).
789          * However we set state to SYN-SENT and not releasing socket
790          * lock select source port, enter ourselves into the hash tables and
791          * complete initialization after this.
792          */
793         tcp_set_state(sk, TCP_SYN_SENT);
794         err = tcp_v4_hash_connect(sk);
795         if (err)
796                 goto failure;
797
            /* The source port may only be known now; refresh the route. */
798         err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
799         if (err)
800                 goto failure;
801
802         /* OK, now commit destination to socket.  */
803         sk_setup_caps(sk, &rt->u.dst);
804
            /* Keep a write_seq inherited earlier; otherwise generate one. */
805         if (!tp->write_seq)
806                 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
807                                                            inet->daddr,
808                                                            inet->sport,
809                                                            usin->sin_port);
810
811         inet->id = tp->write_seq ^ jiffies;
812
813         err = tcp_connect(sk);
            /* rt is cleared before the error check, so a tcp_connect()
             * failure reaches ip_rt_put() below with NULL.
             * NOTE(review): presumably the route now belongs to the socket
             * after sk_setup_caps() - confirm.
             */
814         rt = NULL;
815         if (err)
816                 goto failure;
817
818         return 0;
819
820 failure:
821         /* This unhashes the socket and releases the local port, if necessary. */
822         tcp_set_state(sk, TCP_CLOSE);
823         ip_rt_put(rt);
824         sk->sk_route_caps = 0;
825         inet->dport = 0;
826         return err;
827 }
828
829 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
830 {
831         return ((struct rtable *)skb->dst)->rt_iif;
832 }
833
834 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
835 {
836         return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
837 }
838
839 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
840                                               struct request_sock ***prevp,
841                                               __u16 rport,
842                                               __u32 raddr, __u32 laddr)
843 {
844         struct listen_sock *lopt = tp->accept_queue.listen_opt;
845         struct request_sock *req, **prev;
846
847         for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
848              (req = *prev) != NULL;
849              prev = &req->dl_next) {
850                 const struct inet_request_sock *ireq = inet_rsk(req);
851
852                 if (ireq->rmt_port == rport &&
853                     ireq->rmt_addr == raddr &&
854                     ireq->loc_addr == laddr &&
855                     TCP_INET_FAMILY(req->rsk_ops->family)) {
856                         BUG_TRAP(!req->sk);
857                         *prevp = prev;
858                         break;
859                 }
860         }
861
862         return req;
863 }
864
865 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
866 {
867         struct tcp_sock *tp = tcp_sk(sk);
868         struct listen_sock *lopt = tp->accept_queue.listen_opt;
869         u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
870
871         reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
872         tcp_synq_added(sk);
873 }
874
875
876 /*
877  * This routine does path mtu discovery as defined in RFC1191.
878  */
879 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
880                                      u32 mtu)
881 {
882         struct dst_entry *dst;
883         struct inet_sock *inet = inet_sk(sk);
884         struct tcp_sock *tp = tcp_sk(sk);
885
886         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
887          * send out by Linux are always <576bytes so they should go through
888          * unfragmented).
889          */
890         if (sk->sk_state == TCP_LISTEN)
891                 return;
892
893         /* We don't check in the destentry if pmtu discovery is forbidden
894          * on this route. We just assume that no packet_to_big packets
895          * are send back when pmtu discovery is not active.
896          * There is a small race when the user changes this flag in the
897          * route, but I think that's acceptable.
898          */
899         if ((dst = __sk_dst_check(sk, 0)) == NULL)
900                 return;
901
902         dst->ops->update_pmtu(dst, mtu);
903
904         /* Something is about to be wrong... Remember soft error
905          * for the case, if this connection will not able to recover.
906          */
907         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
908                 sk->sk_err_soft = EMSGSIZE;
909
910         mtu = dst_mtu(dst);
911
912         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
913             tp->pmtu_cookie > mtu) {
914                 tcp_sync_mss(sk, mtu);
915
916                 /* Resend the TCP packet because it's
917                  * clear that the old packet has been
918                  * dropped. This is the new "fast" path mtu
919                  * discovery.
920                  */
921                 tcp_simple_retransmit(sk);
922         } /* else let the usual retransmit timer handle it */
923 }
924
925 /*
926  * This routine is called by the ICMP module when it gets some
927  * sort of error condition.  If err < 0 then the socket should
928  * be closed and the error returned to the user.  If err > 0
929  * it's just the icmp type << 8 | icmp code.  After adjustment
930  * header points to the first 8 bytes of the tcp header.  We need
931  * to find the appropriate port.
932  *
933  * The locking strategy used here is very "optimistic". When
934  * someone else accesses the socket the ICMP is just dropped
935  * and for some paths there is no check at all.
936  * A more general error queue to queue errors for later handling
937  * is probably better.
938  *
939  */
940
941 void tcp_v4_err(struct sk_buff *skb, u32 info)
942 {
943         struct iphdr *iph = (struct iphdr *)skb->data;
944         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
945         struct tcp_sock *tp;
946         struct inet_sock *inet;
947         int type = skb->h.icmph->type;
948         int code = skb->h.icmph->code;
949         struct sock *sk;
950         __u32 seq;
951         int err;
952
953         if (skb->len < (iph->ihl << 2) + 8) {
954                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
955                 return;
956         }
957
958         sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
959                            th->source, tcp_v4_iif(skb));
960         if (!sk) {
961                 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
962                 return;
963         }
964         if (sk->sk_state == TCP_TIME_WAIT) {
965                 tcp_tw_put((struct tcp_tw_bucket *)sk);
966                 return;
967         }
968
969         bh_lock_sock(sk);
970         /* If too many ICMPs get dropped on busy
971          * servers this needs to be solved differently.
972          */
973         if (sock_owned_by_user(sk))
974                 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
975
976         if (sk->sk_state == TCP_CLOSE)
977                 goto out;
978
979         tp = tcp_sk(sk);
980         seq = ntohl(th->seq);
981         if (sk->sk_state != TCP_LISTEN &&
982             !between(seq, tp->snd_una, tp->snd_nxt)) {
983                 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
984                 goto out;
985         }
986
987         switch (type) {
988         case ICMP_SOURCE_QUENCH:
989                 /* Just silently ignore these. */
990                 goto out;
991         case ICMP_PARAMETERPROB:
992                 err = EPROTO;
993                 break;
994         case ICMP_DEST_UNREACH:
995                 if (code > NR_ICMP_UNREACH)
996                         goto out;
997
998                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
999                         if (!sock_owned_by_user(sk))
1000                                 do_pmtu_discovery(sk, iph, info);
1001                         goto out;
1002                 }
1003
1004                 err = icmp_err_convert[code].errno;
1005                 break;
1006         case ICMP_TIME_EXCEEDED:
1007                 err = EHOSTUNREACH;
1008                 break;
1009         default:
1010                 goto out;
1011         }
1012
1013         switch (sk->sk_state) {
1014                 struct request_sock *req, **prev;
1015         case TCP_LISTEN:
1016                 if (sock_owned_by_user(sk))
1017                         goto out;
1018
1019                 req = tcp_v4_search_req(tp, &prev, th->dest,
1020                                         iph->daddr, iph->saddr);
1021                 if (!req)
1022                         goto out;
1023
1024                 /* ICMPs are not backlogged, hence we cannot get
1025                    an established socket here.
1026                  */
1027                 BUG_TRAP(!req->sk);
1028
1029                 if (seq != tcp_rsk(req)->snt_isn) {
1030                         NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
1031                         goto out;
1032                 }
1033
1034                 /*
1035                  * Still in SYN_RECV, just remove it silently.
1036                  * There is no good way to pass the error to the newly
1037                  * created socket, and POSIX does not want network
1038                  * errors returned from accept().
1039                  */
1040                 tcp_synq_drop(sk, req, prev);
1041                 goto out;
1042
1043         case TCP_SYN_SENT:
1044         case TCP_SYN_RECV:  /* Cannot happen.
1045                                It can f.e. if SYNs crossed.
1046                              */
1047                 if (!sock_owned_by_user(sk)) {
1048                         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1049                         sk->sk_err = err;
1050
1051                         sk->sk_error_report(sk);
1052
1053                         tcp_done(sk);
1054                 } else {
1055                         sk->sk_err_soft = err;
1056                 }
1057                 goto out;
1058         }
1059
1060         /* If we've already connected we will keep trying
1061          * until we time out, or the user gives up.
1062          *
1063          * rfc1122 4.2.3.9 allows to consider as hard errors
1064          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1065          * but it is obsoleted by pmtu discovery).
1066          *
1067          * Note, that in modern internet, where routing is unreliable
1068          * and in each dark corner broken firewalls sit, sending random
1069          * errors ordered by their masters even this two messages finally lose
1070          * their original sense (even Linux sends invalid PORT_UNREACHs)
1071          *
1072          * Now we are in compliance with RFCs.
1073          *                                                      --ANK (980905)
1074          */
1075
1076         inet = inet_sk(sk);
1077         if (!sock_owned_by_user(sk) && inet->recverr) {
1078                 sk->sk_err = err;
1079                 sk->sk_error_report(sk);
1080         } else  { /* Only an error on timeout */
1081                 sk->sk_err_soft = err;
1082         }
1083
1084 out:
1085         bh_unlock_sock(sk);
1086         sock_put(sk);
1087 }
1088
1089 /* This routine computes an IPv4 TCP checksum. */
1090 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1091                        struct sk_buff *skb)
1092 {
1093         struct inet_sock *inet = inet_sk(sk);
1094
1095         if (skb->ip_summed == CHECKSUM_HW) {
1096                 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
1097                 skb->csum = offsetof(struct tcphdr, check);
1098         } else {
1099                 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
1100                                          csum_partial((char *)th,
1101                                                       th->doff << 2,
1102                                                       skb->csum));
1103         }
1104 }
1105
1106 /*
1107  *      This routine will send an RST to the other tcp.
1108  *
1109  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
1110  *                    for reset.
1111  *      Answer: if a packet caused RST, it is not for a socket
1112  *              existing in our system, if it is matched to a socket,
1113  *              it is just duplicate segment or bug in other side's TCP.
1114  *              So that we build reply only basing on parameters
1115  *              arrived with segment.
1116  *      Exception: precedence violation. We do not implement it in any case.
1117  */
1118
1119 static void tcp_v4_send_reset(struct sk_buff *skb)
1120 {
1121         struct tcphdr *th = skb->h.th;
1122         struct tcphdr rth;
1123         struct ip_reply_arg arg;
1124
1125         /* Never send a reset in response to a reset. */
1126         if (th->rst)
1127                 return;
1128
1129         if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
1130                 return;
1131
1132         /* Swap the send and the receive. */
1133         memset(&rth, 0, sizeof(struct tcphdr));
1134         rth.dest   = th->source;
1135         rth.source = th->dest;
1136         rth.doff   = sizeof(struct tcphdr) / 4;
1137         rth.rst    = 1;
1138
1139         if (th->ack) {
1140                 rth.seq = th->ack_seq;
1141         } else {
1142                 rth.ack = 1;
1143                 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
1144                                     skb->len - (th->doff << 2));
1145         }
1146
1147         memset(&arg, 0, sizeof arg);
1148         arg.iov[0].iov_base = (unsigned char *)&rth;
1149         arg.iov[0].iov_len  = sizeof rth;
1150         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1151                                       skb->nh.iph->saddr, /*XXX*/
1152                                       sizeof(struct tcphdr), IPPROTO_TCP, 0);
1153         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1154
1155         ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1156
1157         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1158         TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
1159 }
1160
1161 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
1162    outside socket context is ugly, certainly. What can I do?
1163  */
1164
1165 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
1166                             u32 win, u32 ts)
1167 {
1168         struct tcphdr *th = skb->h.th;
1169         struct {
1170                 struct tcphdr th;
1171                 u32 tsopt[3];
1172         } rep;
1173         struct ip_reply_arg arg;
1174
1175         memset(&rep.th, 0, sizeof(struct tcphdr));
1176         memset(&arg, 0, sizeof arg);
1177
1178         arg.iov[0].iov_base = (unsigned char *)&rep;
1179         arg.iov[0].iov_len  = sizeof(rep.th);
1180         if (ts) {
1181                 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
1182                                      (TCPOPT_TIMESTAMP << 8) |
1183                                      TCPOLEN_TIMESTAMP);
1184                 rep.tsopt[1] = htonl(tcp_time_stamp);
1185                 rep.tsopt[2] = htonl(ts);
1186                 arg.iov[0].iov_len = sizeof(rep);
1187         }
1188
1189         /* Swap the send and the receive. */
1190         rep.th.dest    = th->source;
1191         rep.th.source  = th->dest;
1192         rep.th.doff    = arg.iov[0].iov_len / 4;
1193         rep.th.seq     = htonl(seq);
1194         rep.th.ack_seq = htonl(ack);
1195         rep.th.ack     = 1;
1196         rep.th.window  = htons(win);
1197
1198         arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1199                                       skb->nh.iph->saddr, /*XXX*/
1200                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
1201         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1202
1203         ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1204
1205         TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1206 }
1207
1208 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1209 {
1210         struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1211
1212         tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1213                         tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1214
1215         tcp_tw_put(tw);
1216 }
1217
1218 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1219 {
1220         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1221                         req->ts_recent);
1222 }
1223
1224 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1225                                           struct request_sock *req)
1226 {
1227         struct rtable *rt;
1228         const struct inet_request_sock *ireq = inet_rsk(req);
1229         struct ip_options *opt = inet_rsk(req)->opt;
1230         struct flowi fl = { .oif = sk->sk_bound_dev_if,
1231                             .nl_u = { .ip4_u =
1232                                       { .daddr = ((opt && opt->srr) ?
1233                                                   opt->faddr :
1234                                                   ireq->rmt_addr),
1235                                         .saddr = ireq->loc_addr,
1236                                         .tos = RT_CONN_FLAGS(sk) } },
1237                             .proto = IPPROTO_TCP,
1238                             .uli_u = { .ports =
1239                                        { .sport = inet_sk(sk)->sport,
1240                                          .dport = ireq->rmt_port } } };
1241
1242         if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1243                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1244                 return NULL;
1245         }
1246         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1247                 ip_rt_put(rt);
1248                 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1249                 return NULL;
1250         }
1251         return &rt->u.dst;
1252 }
1253
1254 /*
1255  *      Send a SYN-ACK after having received an ACK.
1256  *      This still operates on a request_sock only, not on a big
1257  *      socket.
1258  */
1259 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1260                               struct dst_entry *dst)
1261 {
1262         const struct inet_request_sock *ireq = inet_rsk(req);
1263         int err = -1;
1264         struct sk_buff * skb;
1265
1266         /* First, grab a route. */
1267         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1268                 goto out;
1269
1270         skb = tcp_make_synack(sk, dst, req);
1271
1272         if (skb) {
1273                 struct tcphdr *th = skb->h.th;
1274
1275                 th->check = tcp_v4_check(th, skb->len,
1276                                          ireq->loc_addr,
1277                                          ireq->rmt_addr,
1278                                          csum_partial((char *)th, skb->len,
1279                                                       skb->csum));
1280
1281                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1282                                             ireq->rmt_addr,
1283                                             ireq->opt);
1284                 if (err == NET_XMIT_CN)
1285                         err = 0;
1286         }
1287
1288 out:
1289         dst_release(dst);
1290         return err;
1291 }
1292
1293 /*
1294  *      IPv4 request_sock destructor.
1295  */
1296 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1297 {
1298         if (inet_rsk(req)->opt)
1299                 kfree(inet_rsk(req)->opt);
1300 }
1301
1302 static inline void syn_flood_warning(struct sk_buff *skb)
1303 {
1304         static unsigned long warntime;
1305
1306         if (time_after(jiffies, (warntime + HZ * 60))) {
1307                 warntime = jiffies;
1308                 printk(KERN_INFO
1309                        "possible SYN flooding on port %d. Sending cookies.\n",
1310                        ntohs(skb->h.th->dest));
1311         }
1312 }
1313
1314 /*
1315  * Save and compile IPv4 options into the request_sock if needed.
1316  */
1317 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1318                                                      struct sk_buff *skb)
1319 {
1320         struct ip_options *opt = &(IPCB(skb)->opt);
1321         struct ip_options *dopt = NULL;
1322
1323         if (opt && opt->optlen) {
1324                 int opt_size = optlength(opt);
1325                 dopt = kmalloc(opt_size, GFP_ATOMIC);
1326                 if (dopt) {
1327                         if (ip_options_echo(dopt, skb)) {
1328                                 kfree(dopt);
1329                                 dopt = NULL;
1330                         }
1331                 }
1332         }
1333         return dopt;
1334 }
1335
1336 struct request_sock_ops tcp_request_sock_ops = {
1337         .family         =       PF_INET,
1338         .obj_size       =       sizeof(struct tcp_request_sock),
1339         .rtx_syn_ack    =       tcp_v4_send_synack,
1340         .send_ack       =       tcp_v4_reqsk_send_ack,
1341         .destructor     =       tcp_v4_reqsk_destructor,
1342         .send_reset     =       tcp_v4_send_reset,
1343 };
1344
1345 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1346 {
1347         struct inet_request_sock *ireq;
1348         struct tcp_options_received tmp_opt;
1349         struct request_sock *req;
1350         __u32 saddr = skb->nh.iph->saddr;
1351         __u32 daddr = skb->nh.iph->daddr;
1352         __u32 isn = TCP_SKB_CB(skb)->when;
1353         struct dst_entry *dst = NULL;
1354 #ifdef CONFIG_SYN_COOKIES
1355         int want_cookie = 0;
1356 #else
1357 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1358 #endif
1359
1360         /* Never answer to SYNs send to broadcast or multicast */
1361         if (((struct rtable *)skb->dst)->rt_flags &
1362             (RTCF_BROADCAST | RTCF_MULTICAST))
1363                 goto drop;
1364
1365         /* TW buckets are converted to open requests without
1366          * limitations, they conserve resources and peer is
1367          * evidently real one.
1368          */
1369         if (tcp_synq_is_full(sk) && !isn) {
1370 #ifdef CONFIG_SYN_COOKIES
1371                 if (sysctl_tcp_syncookies) {
1372                         want_cookie = 1;
1373                 } else
1374 #endif
1375                 goto drop;
1376         }
1377
1378         /* Accept backlog is full. If we have already queued enough
1379          * of warm entries in syn queue, drop request. It is better than
1380          * clogging syn queue with openreqs with exponentially increasing
1381          * timeout.
1382          */
1383         if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1384                 goto drop;
1385
1386         req = reqsk_alloc(&tcp_request_sock_ops);
1387         if (!req)
1388                 goto drop;
1389
1390         tcp_clear_options(&tmp_opt);
1391         tmp_opt.mss_clamp = 536;
1392         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1393
1394         tcp_parse_options(skb, &tmp_opt, 0);
1395
1396         if (want_cookie) {
1397                 tcp_clear_options(&tmp_opt);
1398                 tmp_opt.saw_tstamp = 0;
1399         }
1400
1401         if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1402                 /* Some OSes (unknown ones, but I see them on web server, which
1403                  * contains information interesting only for windows'
1404                  * users) do not send their stamp in SYN. It is easy case.
1405                  * We simply do not advertise TS support.
1406                  */
1407                 tmp_opt.saw_tstamp = 0;
1408                 tmp_opt.tstamp_ok  = 0;
1409         }
1410         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1411
1412         tcp_openreq_init(req, &tmp_opt, skb);
1413
1414         ireq = inet_rsk(req);
1415         ireq->loc_addr = daddr;
1416         ireq->rmt_addr = saddr;
1417         ireq->opt = tcp_v4_save_options(sk, skb);
1418         if (!want_cookie)
1419                 TCP_ECN_create_request(req, skb->h.th);
1420
1421         if (want_cookie) {
1422 #ifdef CONFIG_SYN_COOKIES
1423                 syn_flood_warning(skb);
1424 #endif
1425                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1426         } else if (!isn) {
1427                 struct inet_peer *peer = NULL;
1428
1429                 /* VJ's idea. We save last timestamp seen
1430                  * from the destination in peer table, when entering
1431                  * state TIME-WAIT, and check against it before
1432                  * accepting new connection request.
1433                  *
1434                  * If "isn" is not zero, this request hit alive
1435                  * timewait bucket, so that all the necessary checks
1436                  * are made in the function processing timewait state.
1437                  */
1438                 if (tmp_opt.saw_tstamp &&
1439                     sysctl_tcp_tw_recycle &&
1440                     (dst = tcp_v4_route_req(sk, req)) != NULL &&
1441                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1442                     peer->v4daddr == saddr) {
1443                         if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1444                             (s32)(peer->tcp_ts - req->ts_recent) >
1445                                                         TCP_PAWS_WINDOW) {
1446                                 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1447                                 dst_release(dst);
1448                                 goto drop_and_free;
1449                         }
1450                 }
1451                 /* Kill the following clause, if you dislike this way. */
1452                 else if (!sysctl_tcp_syncookies &&
1453                          (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1454                           (sysctl_max_syn_backlog >> 2)) &&
1455                          (!peer || !peer->tcp_ts_stamp) &&
1456                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1457                         /* Without syncookies last quarter of
1458                          * backlog is filled with destinations,
1459                          * proven to be alive.
1460                          * It means that we continue to communicate
1461                          * to destinations, already remembered
1462                          * to the moment of synflood.
1463                          */
1464                         LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1465                                               "request from %u.%u."
1466                                               "%u.%u/%u\n",
1467                                               NIPQUAD(saddr),
1468                                               ntohs(skb->h.th->source)));
1469                         dst_release(dst);
1470                         goto drop_and_free;
1471                 }
1472
1473                 isn = tcp_v4_init_sequence(sk, skb);
1474         }
1475         tcp_rsk(req)->snt_isn = isn;
1476
1477         if (tcp_v4_send_synack(sk, req, dst))
1478                 goto drop_and_free;
1479
1480         if (want_cookie) {
1481                 reqsk_free(req);
1482         } else {
1483                 tcp_v4_synq_add(sk, req);
1484         }
1485         return 0;
1486
1487 drop_and_free:
1488         reqsk_free(req);
1489 drop:
1490         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1491         return 0;
1492 }
1493
1494
1495 /*
1496  * The three way handshake has completed - we got a valid synack -
1497  * now create the new socket.
1498  */
1499 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1500                                   struct request_sock *req,
1501                                   struct dst_entry *dst)
1502 {
1503         struct inet_request_sock *ireq;
1504         struct inet_sock *newinet;
1505         struct tcp_sock *newtp;
1506         struct sock *newsk;
1507
1508         if (sk_acceptq_is_full(sk))
1509                 goto exit_overflow;
1510
1511         if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1512                 goto exit;
1513
1514         newsk = tcp_create_openreq_child(sk, req, skb);
1515         if (!newsk)
1516                 goto exit;
1517
1518         sk_setup_caps(newsk, dst);
1519
1520         newtp                 = tcp_sk(newsk);
1521         newinet               = inet_sk(newsk);
1522         ireq                  = inet_rsk(req);
1523         newinet->daddr        = ireq->rmt_addr;
1524         newinet->rcv_saddr    = ireq->loc_addr;
1525         newinet->saddr        = ireq->loc_addr;
1526         newinet->opt          = ireq->opt;
1527         ireq->opt             = NULL;
1528         newinet->mc_index     = tcp_v4_iif(skb);
1529         newinet->mc_ttl       = skb->nh.iph->ttl;
1530         newtp->ext_header_len = 0;
1531         if (newinet->opt)
1532                 newtp->ext_header_len = newinet->opt->optlen;
1533         newinet->id = newtp->write_seq ^ jiffies;
1534
1535         tcp_sync_mss(newsk, dst_mtu(dst));
1536         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1537         tcp_initialize_rcv_mss(newsk);
1538
1539         __tcp_v4_hash(newsk, 0);
1540         __tcp_inherit_port(sk, newsk);
1541
1542         return newsk;
1543
1544 exit_overflow:
1545         NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1546 exit:
1547         NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1548         dst_release(dst);
1549         return NULL;
1550 }
1551
1552 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1553 {
1554         struct tcphdr *th = skb->h.th;
1555         struct iphdr *iph = skb->nh.iph;
1556         struct tcp_sock *tp = tcp_sk(sk);
1557         struct sock *nsk;
1558         struct request_sock **prev;
1559         /* Find possible connection requests. */
1560         struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1561                                                      iph->saddr, iph->daddr);
1562         if (req)
1563                 return tcp_check_req(sk, skb, req, prev);
1564
1565         nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1566                                           th->source,
1567                                           skb->nh.iph->daddr,
1568                                           ntohs(th->dest),
1569                                           tcp_v4_iif(skb));
1570
1571         if (nsk) {
1572                 if (nsk->sk_state != TCP_TIME_WAIT) {
1573                         bh_lock_sock(nsk);
1574                         return nsk;
1575                 }
1576                 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1577                 return NULL;
1578         }
1579
1580 #ifdef CONFIG_SYN_COOKIES
1581         if (!th->rst && !th->syn && th->ack)
1582                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1583 #endif
1584         return sk;
1585 }
1586
1587 static int tcp_v4_checksum_init(struct sk_buff *skb)
1588 {
1589         if (skb->ip_summed == CHECKSUM_HW) {
1590                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1591                 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1592                                   skb->nh.iph->daddr, skb->csum))
1593                         return 0;
1594
1595                 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1596                 skb->ip_summed = CHECKSUM_NONE;
1597         }
1598         if (skb->len <= 76) {
1599                 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1600                                  skb->nh.iph->daddr,
1601                                  skb_checksum(skb, 0, skb->len, 0)))
1602                         return -1;
1603                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1604         } else {
1605                 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1606                                           skb->nh.iph->saddr,
1607                                           skb->nh.iph->daddr, 0);
1608         }
1609         return 0;
1610 }
1611
1612
1613 /* The socket must have it's spinlock held when we get
1614  * here.
1615  *
1616  * We have a potential double-lock case here, so even when
1617  * doing backlog processing we use the BH locking scheme.
1618  * This is because we cannot sleep with the original spinlock
1619  * held.
1620  */
1621 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1622 {
1623         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1624                 TCP_CHECK_TIMER(sk);
1625                 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1626                         goto reset;
1627                 TCP_CHECK_TIMER(sk);
1628                 return 0;
1629         }
1630
1631         if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1632                 goto csum_err;
1633
1634         if (sk->sk_state == TCP_LISTEN) {
1635                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1636                 if (!nsk)
1637                         goto discard;
1638
1639                 if (nsk != sk) {
1640                         if (tcp_child_process(sk, nsk, skb))
1641                                 goto reset;
1642                         return 0;
1643                 }
1644         }
1645
1646         TCP_CHECK_TIMER(sk);
1647         if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1648                 goto reset;
1649         TCP_CHECK_TIMER(sk);
1650         return 0;
1651
1652 reset:
1653         tcp_v4_send_reset(skb);
1654 discard:
1655         kfree_skb(skb);
1656         /* Be careful here. If this function gets more complicated and
1657          * gcc suffers from register pressure on the x86, sk (in %ebx)
1658          * might be destroyed here. This current version compiles correctly,
1659          * but you have been warned.
1660          */
1661         return 0;
1662
1663 csum_err:
1664         TCP_INC_STATS_BH(TCP_MIB_INERRS);
1665         goto discard;
1666 }
1667
1668 /*
1669  *      From tcp_input.c
1670  */
1671
/*
 * tcp_v4_rcv - receive entry point for one incoming IPv4 TCP segment.
 *
 * Validates the TCP header, fills in the TCP control block of @skb,
 * looks up the matching socket and hands the segment over: processed
 * directly through tcp_v4_do_rcv(), taken by the prequeue, or appended
 * to the socket backlog when the socket is owned by user context.
 * A lookup hit in TIME_WAIT state is handled at do_time_wait.
 *
 * Returns the result of tcp_v4_do_rcv() when that was called, and 0 on
 * every other path.  On the discard paths @skb is freed here.
 */
1672 int tcp_v4_rcv(struct sk_buff *skb)
1673 {
1674         struct tcphdr *th;
1675         struct sock *sk;
1676         int ret;
1677
             /* Only packets addressed to this host are processed. */
1678         if (skb->pkt_type != PACKET_HOST)
1679                 goto discard_it;
1680
1681         /* Count it even if it's bad */
1682         TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1683
             /* Make sure the fixed-size TCP header is accessible. */
1684         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1685                 goto discard_it;
1686
1687         th = skb->h.th;
1688
             /* doff counts 32-bit words; anything shorter than the fixed
              * header is malformed.  Then pull the full header incl. options. */
1689         if (th->doff < sizeof(struct tcphdr) / 4)
1690                 goto bad_packet;
1691         if (!pskb_may_pull(skb, th->doff * 4))
1692                 goto discard_it;
1693
1694         /* An explanation is required here, I think.
1695          * Packet length and doff are validated by header prediction,
1696          * provided case of th->doff==0 is eliminated.
1697          * So, we defer the checks. */
1698         if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1699              tcp_v4_checksum_init(skb) < 0))
1700                 goto bad_packet;
1701
             /* Re-read the header pointer.  NOTE(review): presumably because
              * pskb_may_pull() above may have moved the data -- confirm. */
1702         th = skb->h.th;
             /* Fill the per-skb TCP control block.  end_seq adds one for
              * each of SYN and FIN on top of the payload length. */
1703         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1704         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1705                                     skb->len - th->doff * 4);
1706         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1707         TCP_SKB_CB(skb)->when    = 0;
1708         TCP_SKB_CB(skb)->flags   = skb->nh.iph->tos;
1709         TCP_SKB_CB(skb)->sacked  = 0;
1710
1711         sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1712                              skb->nh.iph->daddr, ntohs(th->dest),
1713                              tcp_v4_iif(skb));
1714
1715         if (!sk)
1716                 goto no_tcp_socket;
1717
             /* Also re-entered from do_time_wait with sk replaced by a listener. */
1718 process:
1719         if (sk->sk_state == TCP_TIME_WAIT)
1720                 goto do_time_wait;
1721
1722         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1723                 goto discard_and_relse;
1724
1725         if (sk_filter(sk, skb, 0))
1726                 goto discard_and_relse;
1727
1728         skb->dev = NULL;
1729
             /* With the socket BH-locked: if no user context owns it, offer
              * the skb to the prequeue and process it directly when the
              * prequeue declines; otherwise defer it to the backlog. */
1730         bh_lock_sock(sk);
1731         ret = 0;
1732         if (!sock_owned_by_user(sk)) {
1733                 if (!tcp_prequeue(sk, skb))
1734                         ret = tcp_v4_do_rcv(sk, skb);
1735         } else
1736                 sk_add_backlog(sk, skb);
1737         bh_unlock_sock(sk);
1738
             /* Drop the socket reference (presumably taken by the lookup --
              * confirm against __tcp_v4_lookup()). */
1739         sock_put(sk);
1740
1741         return ret;
1742
1743 no_tcp_socket:
1744         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1745                 goto discard_it;
1746
             /* Truncated or corrupt segments are only counted; a valid one
              * for which no socket exists is answered with a reset.  The
              * bad_packet label sits inside the branch so the earlier header
              * checks share the error counter. */
1747         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1748 bad_packet:
1749                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1750         } else {
1751                 tcp_v4_send_reset(skb);
1752         }
1753
1754 discard_it:
1755         /* Discard frame. */
1756         kfree_skb(skb);
1757         return 0;
1758
1759 discard_and_relse:
1760         sock_put(sk);
1761         goto discard_it;
1762
             /* sk actually points at a struct tcp_tw_bucket from here on. */
1763 do_time_wait:
1764         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1765                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1766                 goto discard_it;
1767         }
1768
1769         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1770                 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1771                 tcp_tw_put((struct tcp_tw_bucket *) sk);
1772                 goto discard_it;
1773         }
1774         switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1775                                            skb, th, skb->len)) {
1776         case TCP_TW_SYN: {
                     /* If a listener exists for the destination, retire the
                      * time-wait bucket and restart processing with the listener. */
1777                 struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
1778                                                           ntohs(th->dest),
1779                                                           tcp_v4_iif(skb));
1780                 if (sk2) {
1781                         tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1782                         tcp_tw_put((struct tcp_tw_bucket *)sk);
1783                         sk = sk2;
1784                         goto process;
1785                 }
1786                 /* Fall through to ACK */
1787         }
1788         case TCP_TW_ACK:
1789                 tcp_v4_timewait_ack(sk, skb);
1790                 break;
1791         case TCP_TW_RST:
1792                 goto no_tcp_socket;
             /* Nothing more to do; the skb is dropped below. */
1793         case TCP_TW_SUCCESS:;
1794         }
1795         goto discard_it;
1796 }
1797
1798 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1799 {
1800         struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1801         struct inet_sock *inet = inet_sk(sk);
1802
1803         sin->sin_family         = AF_INET;
1804         sin->sin_addr.s_addr    = inet->daddr;
1805         sin->sin_port           = inet->dport;
1806 }
1807
1808 /* VJ's idea. Save last timestamp seen from this destination
1809  * and hold it at least for normal timewait interval to use for duplicate
1810  * segment detection in subsequent connections, before they enter synchronized
1811  * state.
1812  */
1813
1814 int tcp_v4_remember_stamp(struct sock *sk)
1815 {
1816         struct inet_sock *inet = inet_sk(sk);
1817         struct tcp_sock *tp = tcp_sk(sk);
1818         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1819         struct inet_peer *peer = NULL;
1820         int release_it = 0;
1821
1822         if (!rt || rt->rt_dst != inet->daddr) {
1823                 peer = inet_getpeer(inet->daddr, 1);
1824                 release_it = 1;
1825         } else {
1826                 if (!rt->peer)
1827                         rt_bind_peer(rt, 1);
1828                 peer = rt->peer;
1829         }
1830
1831         if (peer) {
1832                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1833                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1834                      peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1835                         peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1836                         peer->tcp_ts = tp->rx_opt.ts_recent;
1837                 }
1838                 if (release_it)
1839                         inet_putpeer(peer);
1840                 return 1;
1841         }
1842
1843         return 0;
1844 }
1845
1846 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1847 {
1848         struct inet_peer *peer = NULL;
1849
1850         peer = inet_getpeer(tw->tw_daddr, 1);
1851
1852         if (peer) {
1853                 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1854                     (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1855                      peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1856                         peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1857                         peer->tcp_ts = tw->tw_ts_recent;
1858                 }
1859                 inet_putpeer(peer);
1860                 return 1;
1861         }
1862
1863         return 0;
1864 }
1865
1866 struct tcp_func ipv4_specific = {
1867         .queue_xmit     =       ip_queue_xmit,
1868         .send_check     =       tcp_v4_send_check,
1869         .rebuild_header =       inet_sk_rebuild_header,
1870         .conn_request   =       tcp_v4_conn_request,
1871         .syn_recv_sock  =       tcp_v4_syn_recv_sock,
1872         .remember_stamp =       tcp_v4_remember_stamp,
1873         .net_header_len =       sizeof(struct iphdr),
1874         .setsockopt     =       ip_setsockopt,
1875         .getsockopt     =       ip_getsockopt,
1876         .addr2sockaddr  =       v4_addr2sockaddr,
1877         .sockaddr_len   =       sizeof(struct sockaddr_in),
1878 };
1879
1880 /* NOTE: A lot of things set to zero explicitly by call to
1881  *       sk_alloc() so need not be done here.
1882  */
1883 static int tcp_v4_init_sock(struct sock *sk)
1884 {
1885         struct tcp_sock *tp = tcp_sk(sk);
1886
1887         skb_queue_head_init(&tp->out_of_order_queue);
1888         tcp_init_xmit_timers(sk);
1889         tcp_prequeue_init(tp);
1890
1891         tp->rto  = TCP_TIMEOUT_INIT;
1892         tp->mdev = TCP_TIMEOUT_INIT;
1893
1894         /* So many TCP implementations out there (incorrectly) count the
1895          * initial SYN frame in their delayed-ACK and congestion control
1896          * algorithms that we must have the following bandaid to talk
1897          * efficiently to them.  -DaveM
1898          */
1899         tp->snd_cwnd = 2;
1900
1901         /* See draft-stevens-tcpca-spec-01 for discussion of the
1902          * initialization of these values.
1903          */
1904         tp->snd_ssthresh = 0x7fffffff;  /* Infinity */
1905         tp->snd_cwnd_clamp = ~0;
1906         tp->mss_cache = 536;
1907
1908         tp->reordering = sysctl_tcp_reordering;
1909         tp->ca_ops = &tcp_init_congestion_ops;
1910
1911         sk->sk_state = TCP_CLOSE;
1912
1913         sk->sk_write_space = sk_stream_write_space;
1914         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1915
1916         tp->af_specific = &ipv4_specific;
1917
1918         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1919         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1920
1921         atomic_inc(&tcp_sockets_allocated);
1922
1923         return 0;
1924 }
1925
1926 int tcp_v4_destroy_sock(struct sock *sk)
1927 {
1928         struct tcp_sock *tp = tcp_sk(sk);
1929
1930         tcp_clear_xmit_timers(sk);
1931
1932         tcp_cleanup_congestion_control(tp);
1933
1934         /* Cleanup up the write buffer. */
1935         sk_stream_writequeue_purge(sk);
1936
1937         /* Cleans up our, hopefully empty, out_of_order_queue. */
1938         __skb_queue_purge(&tp->out_of_order_queue);
1939
1940         /* Clean prequeue, it must be empty really */
1941         __skb_queue_purge(&tp->ucopy.prequeue);
1942
1943         /* Clean up a referenced TCP bind bucket. */
1944         if (inet_sk(sk)->bind_hash)
1945                 tcp_put_port(sk);
1946
1947         /*
1948          * If sendmsg cached page exists, toss it.
1949          */
1950         if (sk->sk_sndmsg_page) {
1951                 __free_page(sk->sk_sndmsg_page);
1952                 sk->sk_sndmsg_page = NULL;
1953         }
1954
1955         atomic_dec(&tcp_sockets_allocated);
1956
1957         return 0;
1958 }
1959
1960 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1961
1962 #ifdef CONFIG_PROC_FS
1963 /* Proc filesystem TCP sock list dumping. */
1964
1965 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1966 {
1967         return hlist_empty(head) ? NULL :
1968                 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1969 }
1970
1971 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1972 {
1973         return tw->tw_node.next ?
1974                 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1975 }
1976
/*
 * listening_get_next - advance the /proc iterator over listening sockets.
 * @seq: seq_file whose ->private holds the struct tcp_iter_state
 * @cur: entry returned by the previous call, or NULL to start at the
 *       first bucket of tcp_listening_hash
 *
 * Depending on st->state, @cur and the return value are either a
 * struct sock (TCP_SEQ_STATE_LISTENING) or a struct request_sock taken
 * from a listener's SYN table (TCP_SEQ_STATE_OPENREQ).  Only entries
 * whose family equals st->family are returned.
 *
 * While a request is being returned, the syn_wait_lock of
 * st->syn_wait_sk stays read-locked; it is dropped once that listener's
 * SYN table has been walked completely.
 *
 * Returns the next entry, or NULL after the last of the
 * INET_LHTABLE_SIZE buckets has been scanned.
 */
1977 static void *listening_get_next(struct seq_file *seq, void *cur)
1978 {
1979         struct tcp_sock *tp;
1980         struct hlist_node *node;
1981         struct sock *sk = cur;
1982         struct tcp_iter_state* st = seq->private;
1983
             /* First call: start with the head of bucket 0. */
1984         if (!sk) {
1985                 st->bucket = 0;
1986                 sk = sk_head(&tcp_listening_hash[0]);
1987                 goto get_sk;
1988         }
1989
             /* One more entry has been consumed by the caller. */
1990         ++st->num;
1991
1992         if (st->state == TCP_SEQ_STATE_OPENREQ) {
                     /* cur is a request: continue along its chain, then through
                      * the remaining SYN table buckets of the same listener. */
1993                 struct request_sock *req = cur;
1994
1995                 tp = tcp_sk(st->syn_wait_sk);
1996                 req = req->dl_next;
1997                 while (1) {
1998                         while (req) {
1999                                 if (req->rsk_ops->family == st->family) {
2000                                         cur = req;
2001                                         goto out;
2002                                 }
2003                                 req = req->dl_next;
2004                         }
2005                         if (++st->sbucket >= TCP_SYNQ_HSIZE)
2006                                 break;
     /* Also entered from start_req below, with st->sbucket reset to 0. */
2007 get_req:
2008                         req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
2009                 }
                     /* SYN table exhausted: resume the socket walk after this
                      * listener and release its syn_wait_lock. */
2010                 sk        = sk_next(st->syn_wait_sk);
2011                 st->state = TCP_SEQ_STATE_LISTENING;
2012                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2013         } else {
                     /* cur was a listener: visit its pending requests, if any,
                      * before moving on to the next socket. */
2014                 tp = tcp_sk(sk);
2015                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2016                 if (reqsk_queue_len(&tp->accept_queue))
2017                         goto start_req;
2018                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2019                 sk = sk_next(sk);
2020         }
2021 get_sk:
2022         sk_for_each_from(sk, node) {
                     /* A socket of the requested family is returned as is; its
                      * requests are picked up by the else-branch above on the
                      * following call. */
2023                 if (sk->sk_family == st->family) {
2024                         cur = sk;
2025                         goto out;
2026                 }
                     /* Sockets of another family are not returned, but their
                      * request queue is still scanned for matching requests. */
2027                 tp = tcp_sk(sk);
2028                 read_lock_bh(&tp->accept_queue.syn_wait_lock);
2029                 if (reqsk_queue_len(&tp->accept_queue)) {
     /* Switch to request iteration; syn_wait_lock remains held. */
2030 start_req:
2031                         st->uid         = sock_i_uid(sk);
2032                         st->syn_wait_sk = sk;
2033                         st->state       = TCP_SEQ_STATE_OPENREQ;
2034                         st->sbucket     = 0;
2035                         goto get_req;
2036                 }
2037                 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2038         }
             /* End of this chain: continue with the next hash bucket. */
2039         if (++st->bucket < INET_LHTABLE_SIZE) {
2040                 sk = sk_head(&tcp_listening_hash[st->bucket]);
2041                 goto get_sk;
2042         }
2043         cur = NULL;
2044 out:
2045         return cur;
2046 }
2047
2048 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2049 {
2050         void *rc = listening_get_next(seq, NULL);
2051
2052         while (rc && *pos) {
2053                 rc = listening_get_next(seq, rc);
2054                 --*pos;
2055         }
2056         return rc;
2057 }
2058
2059 static void *established_get_first(struct seq_file *seq)
2060 {
2061         struct tcp_iter_state* st = seq->private;
2062         void *rc = NULL;
2063
2064         for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
2065                 struct sock *sk;
2066                 struct hlist_node *node;
2067                 struct tcp_tw_bucket *tw;
2068
2069                 /* We can reschedule _before_ having picked the target: */
2070                 cond_resched_softirq();
2071
2072                 read_lock(&tcp_ehash[st->bucket].lock);
2073                 sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
2074                         if (sk->sk_family != st->family) {
2075                                 continue;
2076                         }
2077                         rc = sk;
2078                         goto out;
2079                 }
2080                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2081                 tw_for_each(tw, node,
2082                             &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
2083                         if (tw->tw_family != st->family) {
2084                                 continue;
2085                         }
2086                         rc = tw;
2087                         goto out;
2088                 }
2089                 read_unlock(&tcp_ehash[st->bucket].lock);
2090                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2091         }
2092 out:
2093         return rc;
2094 }
2095
/*
 * established_get_next - advance the /proc iterator in the established hash.
 * @seq: seq_file whose ->private holds the struct tcp_iter_state
 * @cur: entry returned by the previous call; a struct tcp_tw_bucket when
 *       st->state is TCP_SEQ_STATE_TIME_WAIT, a struct sock otherwise
 *
 * Within one bucket the regular sockets are visited first, then the
 * time-wait buckets on the chain tcp_ehash_size slots further on.  Only
 * entries whose family equals st->family are returned.
 *
 * The read lock of bucket st->bucket is held on entry and is still held
 * (possibly for a later bucket) when a non-NULL entry is returned.  When
 * NULL is returned the lock has been released.
 */
2096 static void *established_get_next(struct seq_file *seq, void *cur)
2097 {
2098         struct sock *sk = cur;
2099         struct tcp_tw_bucket *tw;
2100         struct hlist_node *node;
2101         struct tcp_iter_state* st = seq->private;
2102
             /* One more entry has been consumed by the caller. */
2103         ++st->num;
2104
2105         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2106                 tw = cur;
2107                 tw = tw_next(tw);
     /* Also entered from below with tw set to the head of the time-wait chain. */
2108 get_tw:
2109                 while (tw && tw->tw_family != st->family) {
2110                         tw = tw_next(tw);
2111                 }
2112                 if (tw) {
2113                         cur = tw;
2114                         goto out;
2115                 }
                     /* Time-wait chain exhausted, so this bucket is finished. */
2116                 read_unlock(&tcp_ehash[st->bucket].lock);
2117                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2118
2119                 /* We can reschedule between buckets: */
2120                 cond_resched_softirq();
2121
2122                 if (++st->bucket < tcp_ehash_size) {
2123                         read_lock(&tcp_ehash[st->bucket].lock);
2124                         sk = sk_head(&tcp_ehash[st->bucket].chain);
2125                 } else {
2126                         cur = NULL;
2127                         goto out;
2128                 }
2129         } else
2130                 sk = sk_next(sk);
2131
2132         sk_for_each_from(sk, node) {
2133                 if (sk->sk_family == st->family)
2134                         goto found;
2135         }
2136
             /* No further regular socket here: switch to the time-wait chain
              * of the same bucket. */
2137         st->state = TCP_SEQ_STATE_TIME_WAIT;
2138         tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
2139         goto get_tw;
2140 found:
2141         cur = sk;
2142 out:
2143         return cur;
2144 }
2145
2146 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2147 {
2148         void *rc = established_get_first(seq);
2149
2150         while (rc && pos) {
2151                 rc = established_get_next(seq, rc);
2152                 --pos;
2153         }               
2154         return rc;
2155 }
2156
2157 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2158 {
2159         void *rc;
2160         struct tcp_iter_state* st = seq->private;
2161
2162         tcp_listen_lock();
2163         st->state = TCP_SEQ_STATE_LISTENING;
2164         rc        = listening_get_idx(seq, &pos);
2165
2166         if (!rc) {
2167                 tcp_listen_unlock();
2168                 local_bh_disable();
2169                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2170                 rc        = established_get_idx(seq, pos);
2171         }
2172
2173         return rc;
2174 }
2175
2176 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2177 {
2178         struct tcp_iter_state* st = seq->private;
2179         st->state = TCP_SEQ_STATE_LISTENING;
2180         st->num = 0;
2181         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2182 }
2183
2184 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2185 {
2186         void *rc = NULL;
2187         struct tcp_iter_state* st;
2188
2189         if (v == SEQ_START_TOKEN) {
2190                 rc = tcp_get_idx(seq, 0);
2191                 goto out;
2192         }
2193         st = seq->private;
2194
2195         switch (st->state) {
2196         case TCP_SEQ_STATE_OPENREQ:
2197         case TCP_SEQ_STATE_LISTENING:
2198                 rc = listening_get_next(seq, v);
2199                 if (!rc) {
2200                         tcp_listen_unlock();
2201                         local_bh_disable();
2202                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2203                         rc        = established_get_first(seq);
2204                 }
2205                 break;
2206         case TCP_SEQ_STATE_ESTABLISHED:
2207         case TCP_SEQ_STATE_TIME_WAIT:
2208                 rc = established_get_next(seq, v);
2209                 break;
2210         }
2211 out:
2212         ++*pos;
2213         return rc;
2214 }
2215
2216 static void tcp_seq_stop(struct seq_file *seq, void *v)
2217 {
2218         struct tcp_iter_state* st = seq->private;
2219
2220         switch (st->state) {
2221         case TCP_SEQ_STATE_OPENREQ:
2222                 if (v) {
2223                         struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2224                         read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2225                 }
2226         case TCP_SEQ_STATE_LISTENING:
2227                 if (v != SEQ_START_TOKEN)
2228                         tcp_listen_unlock();
2229                 break;
2230         case TCP_SEQ_STATE_TIME_WAIT:
2231         case TCP_SEQ_STATE_ESTABLISHED:
2232                 if (v)
2233                         read_unlock(&tcp_ehash[st->bucket].lock);
2234                 local_bh_enable();
2235                 break;
2236         }
2237 }
2238
2239 static int tcp_seq_open(struct inode *inode, struct file *file)
2240 {
2241         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2242         struct seq_file *seq;
2243         struct tcp_iter_state *s;
2244         int rc;
2245
2246         if (unlikely(afinfo == NULL))
2247                 return -EINVAL;
2248
2249         s = kmalloc(sizeof(*s), GFP_KERNEL);
2250         if (!s)
2251                 return -ENOMEM;
2252         memset(s, 0, sizeof(*s));
2253         s->family               = afinfo->family;
2254         s->seq_ops.start        = tcp_seq_start;
2255         s->seq_ops.next         = tcp_seq_next;
2256         s->seq_ops.show         = afinfo->seq_show;
2257         s->seq_ops.stop         = tcp_seq_stop;
2258
2259         rc = seq_open(file, &s->seq_ops);
2260         if (rc)
2261                 goto out_kfree;
2262         seq          = file->private_data;
2263         seq->private = s;
2264 out:
2265         return rc;
2266 out_kfree:
2267         kfree(s);
2268         goto out;
2269 }
2270
2271 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2272 {
2273         int rc = 0;
2274         struct proc_dir_entry *p;
2275
2276         if (!afinfo)
2277                 return -EINVAL;
2278         afinfo->seq_fops->owner         = afinfo->owner;
2279         afinfo->seq_fops->open          = tcp_seq_open;
2280         afinfo->seq_fops->read          = seq_read;
2281         afinfo->seq_fops->llseek        = seq_lseek;
2282         afinfo->seq_fops->release       = seq_release_private;
2283         
2284         p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2285         if (p)
2286                 p->data = afinfo;
2287         else
2288                 rc = -ENOMEM;
2289         return rc;
2290 }
2291
2292 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2293 {
2294         if (!afinfo)
2295                 return;
2296         proc_net_remove(afinfo->name);
2297         memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
2298 }
2299
2300 static void get_openreq4(struct sock *sk, struct request_sock *req,
2301                          char *tmpbuf, int i, int uid)
2302 {
2303         const struct inet_request_sock *ireq = inet_rsk(req);
2304         int ttd = req->expires - jiffies;
2305
2306         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2307                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2308                 i,
2309                 ireq->loc_addr,
2310                 ntohs(inet_sk(sk)->sport),
2311                 ireq->rmt_addr,
2312                 ntohs(ireq->rmt_port),
2313                 TCP_SYN_RECV,
2314                 0, 0, /* could print option size, but that is af dependent. */
2315                 1,    /* timers active (only the expire timer) */
2316                 jiffies_to_clock_t(ttd),
2317                 req->retrans,
2318                 uid,
2319                 0,  /* non standard timer */
2320                 0, /* open_requests have no inode */
2321                 atomic_read(&sk->sk_refcnt),
2322                 req);
2323 }
2324
2325 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2326 {
2327         int timer_active;
2328         unsigned long timer_expires;
2329         struct tcp_sock *tp = tcp_sk(sp);
2330         struct inet_sock *inet = inet_sk(sp);
2331         unsigned int dest = inet->daddr;
2332         unsigned int src = inet->rcv_saddr;
2333         __u16 destp = ntohs(inet->dport);
2334         __u16 srcp = ntohs(inet->sport);
2335
2336         if (tp->pending == TCP_TIME_RETRANS) {
2337                 timer_active    = 1;
2338                 timer_expires   = tp->timeout;
2339         } else if (tp->pending == TCP_TIME_PROBE0) {
2340                 timer_active    = 4;
2341                 timer_expires   = tp->timeout;
2342         } else if (timer_pending(&sp->sk_timer)) {
2343                 timer_active    = 2;
2344                 timer_expires   = sp->sk_timer.expires;
2345         } else {
2346                 timer_active    = 0;
2347                 timer_expires = jiffies;
2348         }
2349
2350         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2351                         "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2352                 i, src, srcp, dest, destp, sp->sk_state,
2353                 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2354                 timer_active,
2355                 jiffies_to_clock_t(timer_expires - jiffies),
2356                 tp->retransmits,
2357                 sock_i_uid(sp),
2358                 tp->probes_out,
2359                 sock_i_ino(sp),
2360                 atomic_read(&sp->sk_refcnt), sp,
2361                 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2362                 tp->snd_cwnd,
2363                 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
2364 }
2365
2366 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2367 {
2368         unsigned int dest, src;
2369         __u16 destp, srcp;
2370         int ttd = tw->tw_ttd - jiffies;
2371
2372         if (ttd < 0)
2373                 ttd = 0;
2374
2375         dest  = tw->tw_daddr;
2376         src   = tw->tw_rcv_saddr;
2377         destp = ntohs(tw->tw_dport);
2378         srcp  = ntohs(tw->tw_sport);
2379
2380         sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2381                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2382                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2383                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2384                 atomic_read(&tw->tw_refcnt), tw);
2385 }
2386
2387 #define TMPSZ 150
2388
2389 static int tcp4_seq_show(struct seq_file *seq, void *v)
2390 {
2391         struct tcp_iter_state* st;
2392         char tmpbuf[TMPSZ + 1];
2393
2394         if (v == SEQ_START_TOKEN) {
2395                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2396                            "  sl  local_address rem_address   st tx_queue "
2397                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2398                            "inode");
2399                 goto out;
2400         }
2401         st = seq->private;
2402
2403         switch (st->state) {
2404         case TCP_SEQ_STATE_LISTENING:
2405         case TCP_SEQ_STATE_ESTABLISHED:
2406                 get_tcp4_sock(v, tmpbuf, st->num);
2407                 break;
2408         case TCP_SEQ_STATE_OPENREQ:
2409                 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2410                 break;
2411         case TCP_SEQ_STATE_TIME_WAIT:
2412                 get_timewait4_sock(v, tmpbuf, st->num);
2413                 break;
2414         }
2415         seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2416 out:
2417         return 0;
2418 }
2419
2420 static struct file_operations tcp4_seq_fops;
2421 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2422         .owner          = THIS_MODULE,
2423         .name           = "tcp",
2424         .family         = AF_INET,
2425         .seq_show       = tcp4_seq_show,
2426         .seq_fops       = &tcp4_seq_fops,
2427 };
2428
2429 int __init tcp4_proc_init(void)
2430 {
2431         return tcp_proc_register(&tcp4_seq_afinfo);
2432 }
2433
2434 void tcp4_proc_exit(void)
2435 {
2436         tcp_proc_unregister(&tcp4_seq_afinfo);
2437 }
2438 #endif /* CONFIG_PROC_FS */
2439
2440 struct proto tcp_prot = {
2441         .name                   = "TCP",
2442         .owner                  = THIS_MODULE,
2443         .close                  = tcp_close,
2444         .connect                = tcp_v4_connect,
2445         .disconnect             = tcp_disconnect,
2446         .accept                 = tcp_accept,
2447         .ioctl                  = tcp_ioctl,
2448         .init                   = tcp_v4_init_sock,
2449         .destroy                = tcp_v4_destroy_sock,
2450         .shutdown               = tcp_shutdown,
2451         .setsockopt             = tcp_setsockopt,
2452         .getsockopt             = tcp_getsockopt,
2453         .sendmsg                = tcp_sendmsg,
2454         .recvmsg                = tcp_recvmsg,
2455         .backlog_rcv            = tcp_v4_do_rcv,
2456         .hash                   = tcp_v4_hash,
2457         .unhash                 = tcp_unhash,
2458         .get_port               = tcp_v4_get_port,
2459         .enter_memory_pressure  = tcp_enter_memory_pressure,
2460         .sockets_allocated      = &tcp_sockets_allocated,
2461         .memory_allocated       = &tcp_memory_allocated,
2462         .memory_pressure        = &tcp_memory_pressure,
2463         .sysctl_mem             = sysctl_tcp_mem,
2464         .sysctl_wmem            = sysctl_tcp_wmem,
2465         .sysctl_rmem            = sysctl_tcp_rmem,
2466         .max_header             = MAX_TCP_HEADER,
2467         .obj_size               = sizeof(struct tcp_sock),
2468         .rsk_prot               = &tcp_request_sock_ops,
2469 };
2470
2471
2472
2473 void __init tcp_v4_init(struct net_proto_family *ops)
2474 {
2475         int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2476         if (err < 0)
2477                 panic("Failed to create the TCP control socket.\n");
2478         tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
2479         inet_sk(tcp_socket->sk)->uc_ttl = -1;
2480
2481         /* Unhash it so that IP input processing does not even
2482          * see it, we do not wish this socket to see incoming
2483          * packets.
2484          */
2485         tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2486 }
2487
/* Symbols from this file exported with EXPORT_SYMBOL(). */
2488 EXPORT_SYMBOL(ipv4_specific);
2489 EXPORT_SYMBOL(tcp_bind_hash);
2490 EXPORT_SYMBOL(inet_bind_bucket_create);
2491 EXPORT_SYMBOL(tcp_hashinfo);
2492 EXPORT_SYMBOL(tcp_inherit_port);
2493 EXPORT_SYMBOL(tcp_listen_wlock);
2494 EXPORT_SYMBOL(tcp_port_rover);
2495 EXPORT_SYMBOL(tcp_prot);
2496 EXPORT_SYMBOL(tcp_put_port);
2497 EXPORT_SYMBOL(tcp_unhash);
2498 EXPORT_SYMBOL(tcp_v4_conn_request);
2499 EXPORT_SYMBOL(tcp_v4_connect);
2500 EXPORT_SYMBOL(tcp_v4_do_rcv);
2501 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2502 EXPORT_SYMBOL(tcp_v4_send_check);
2503 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2504
/* The proc registration helpers are only built with CONFIG_PROC_FS. */
2505 #ifdef CONFIG_PROC_FS
2506 EXPORT_SYMBOL(tcp_proc_register);
2507 EXPORT_SYMBOL(tcp_proc_unregister);
2508 #endif
/* Tunables exported alongside the functions above. */
2509 EXPORT_SYMBOL(sysctl_local_port_range);
2510 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2511 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
2512