/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half of it is devoted to TIME_WAIT
 *					sockets and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year
 *					in coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#include <linux/config.h>

#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
	.lhash_lock	= RW_LOCK_UNLOCKED,
	.lhash_users	= ATOMIC_INIT(0),
	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;

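/* Example: the range above is normally widened at runtime through the
 * ip_local_port_range sysctl, e.g.
 *
 *	# sysctl -w net.ipv4.ip_local_port_range="32768 61000"
 *
 * which rewrites sysctl_local_port_range[] and therefore the window that
 * tcp_v4_get_port() scans below.
 */
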
/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_bind_hashbucket *head =
				&tcp_bhash[inet_bhashfn(inet_sk(child)->num,
							tcp_bhash_size)];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		   const unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}

static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}
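
/* Illustration of the rules above: two sockets may share a local port only
 * if they are bound to different devices, or bind to distinct non-wildcard
 * local addresses, or both set SO_REUSEADDR while neither is in TCP_LISTEN
 * state.
 */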

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct inet_bind_hashbucket *head;
	struct hlist_node *node;
	struct inet_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		if (tcp_port_rover < low)
			rover = low;
		else
			rover = tcp_port_rover;
		do {
			rover++;
			if (rover > high)
				rover = low;
			head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)];
			spin_lock(&head->lock);
			inet_bind_bucket_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (unlikely(remaining <= 0))
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
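
/* Note on tb->fastreuse (see above and tcp_v4_hash_connect() below):
 * 1 means every current owner of the bucket set SO_REUSEADDR and none is
 * listening, so a like-minded socket may skip the tcp_bind_conflict()
 * scan; 0 forces the full scan; -1, set only on the connect() path,
 * marks the bucket as never shareable.
 */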

/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num,
							tcp_bhash_size)];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	inet_bind_bucket_destroy(tcp_bucket_cachep, tb);
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad
 * on SMP. Look: when several writers sleep and a reader wakes them up, all
 * but one immediately hit the write lock and grab all the cpus. Exclusive
 * sleep solves this, _but_ remember, it adds useless work on UP machines
 * (wake up each exclusive lock release). It should be ifdefed really.
 */

void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait,
						  &wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}

static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size);
		list = &tcp_ehash[sk->sk_hashent].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}

void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

/* Don't inline this cruft. There are some nice properties to
 * exploit here. The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection. So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
					     const u32 daddr,
					     const unsigned short hnum,
					     const int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore;

	hiscore = -1;
	sk_for_each(sk, node, head) {
		struct inet_sock *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}
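
/* Scoring example: an AF_INET listener bound to the exact destination
 * address and the receiving device scores 1 + 2 + 2 = 5 and is returned
 * immediately; a wildcard INADDR_ANY listener scores lower and only stays
 * around as the best candidate seen so far.
 */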

/* Optimize the common listener case. */
static inline struct sock *tcp_v4_lookup_listener(const u32 daddr,
						  const unsigned short hnum,
						  const int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[inet_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_sock *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
						       const u16 sport,
						       const u32 daddr,
						       const u16 hnum,
						       const int dif)
{
	struct inet_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}

static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
						      daddr, hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

EXPORT_SYMBOL_GPL(tcp_v4_lookup);

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

/* called with local bh disabled */
static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
				      struct tcp_tw_bucket **twp)
{
	struct inet_sock *inet = inet_sk(sk);
	u32 daddr = inet->rcv_saddr;
	u32 saddr = inet->daddr;
	int dif = sk->sk_bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size);
	struct inet_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2;
	struct hlist_node *node;
	struct tcp_tw_bucket *tw;

	write_lock(&head->lock);

	/* Check TIME-WAIT sockets first. */
	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
		tw = (struct tcp_tw_bucket *)sk2;

		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_sock *tp = tcp_sk(sk);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it is
			   safe provided sequence spaces do not overlap,
			   i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one, only
			   the timestamp cache is held not per host, but
			   per port pair, and the TW bucket is used as
			   state holder.

			   If the TW bucket has already been destroyed we
			   fall back to VJ's scheme and use the initial
			   timestamp retrieved from the peer table.
			 */
			if (tw->tw_ts_recent_stamp &&
			    (!twp || (sysctl_tcp_tw_reuse &&
				      xtime.tv_sec -
				      tw->tw_ts_recent_stamp > 1))) {
				if ((tp->write_seq =
						tw->tw_snd_nxt + 65535 + 2) == 0)
					tp->write_seq = 1;
				tp->rx_opt.ts_recent = tw->tw_ts_recent;
				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
				sock_hold(sk2);
				goto unique;
			} else
				goto not_unique;
		}
	}
	tw = NULL;

	/* And established part... */
	sk_for_each(sk2, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
			goto not_unique;
	}

unique:
	/* Must record num and sport now. Otherwise we will see in the
	 * hash table a socket with a funny identity. */
	inet->num = lport;
	inet->sport = htons(lport);
	sk->sk_hashent = hash;
	BUG_TRAP(sk_unhashed(sk));
	__sk_add_node(sk, &head->chain);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(&head->lock);

	if (twp) {
		*twp = tw;
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		tcp_tw_deschedule(tw);
		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);

		tcp_tw_put(tw);
	}

	return 0;

not_unique:
	write_unlock(&head->lock);
	return -EADDRNOTAVAIL;
}

static inline u32 connect_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
					 inet->dport);
}

/*
 * Bind a port for a connect operation and hash it.
 */
static inline int tcp_v4_hash_connect(struct sock *sk)
{
	const unsigned short snum = inet_sk(sk)->num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;

	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int range = high - low;
		int i;
		int port;
		static u32 hint;
		u32 offset = hint + connect_port_offset(sk);
		struct hlist_node *node;
		struct tcp_tw_bucket *tw = NULL;

		local_bh_disable();
		for (i = 1; i <= range; i++) {
			port = low + (i + offset) % range;
			head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, node, &head->chain) {
				if (tb->port == port) {
					BUG_TRAP(!hlist_empty(&tb->owners));
					if (tb->fastreuse >= 0)
						goto next_port;
					if (!__tcp_v4_check_established(sk,
									port,
									&tw))
						goto ok;
					goto next_port;
				}
			}

			tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		tcp_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->sport = htons(port);
			__tcp_v4_hash(sk, 0);
		}
		spin_unlock(&head->lock);

		if (tw) {
			tcp_tw_deschedule(tw);
			tcp_tw_put(tw);
		}

		ret = 0;
		goto out;
	}

	head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)];
	tb = tcp_sk(sk)->bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		__tcp_v4_hash(sk, 0);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = __tcp_v4_check_established(sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
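
/* Ephemeral port walk, sketched: candidate i maps to
 * low + (i + hint + secure_offset) % range, so different
 * (saddr, daddr, dport) tuples start their search at different points,
 * while the static 'hint' keeps successive connects from rescanning the
 * same ports.
 */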

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct rtable *rt;
	u32 daddr, nexthop;
	int tmp;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	if (inet->opt && inet->opt->srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet->opt->faddr;
	}

	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			       IPPROTO_TCP,
			       inet->sport, usin->sin_port, sk);
	if (tmp < 0)
		return tmp;

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet->opt || !inet->opt->srr)
		daddr = rt->rt_dst;

	if (!inet->saddr)
		inet->saddr = rt->rt_src;
	inet->rcv_saddr = inet->saddr;

	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		tp->write_seq		   = 0;
	}

	if (sysctl_tcp_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
		struct inet_peer *peer = rt_get_peer(rt);

		/* VJ's idea. We save the last timestamp seen from
		 * the destination in the peer table, when entering state
		 * TIME-WAIT, and initialize rx_opt.ts_recent from it when
		 * trying a new connection.
		 */
		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
			tp->rx_opt.ts_recent = peer->tcp_ts;
		}
	}

	inet->dport = usin->sin_port;
	inet->daddr = daddr;

	tp->ext_header_len = 0;
	if (inet->opt)
		tp->ext_header_len = inet->opt->optlen;

	tp->rx_opt.mss_clamp = 536;

	/* Socket identity is still unknown (sport may be zero).
	 * However, we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = tcp_v4_hash_connect(sk);
	if (err)
		goto failure;

	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
	if (err)
		goto failure;

	/* OK, now commit destination to socket. */
	sk_setup_caps(sk, &rt->u.dst);

	if (!tp->write_seq)
		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
							   inet->daddr,
							   inet->sport,
							   usin->sin_port);

	inet->id = tp->write_seq ^ jiffies;

	err = tcp_connect(sk);
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/* This unhashes the socket and releases the local port, if necessary. */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->dport = 0;
	return err;
}

static __inline__ int tcp_v4_iif(struct sk_buff *skb)
{
	return ((struct rtable *)skb->dst)->rt_iif;
}

static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
{
	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
}

static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
					      struct request_sock ***prevp,
					      __u16 rport,
					      __u32 raddr, __u32 laddr)
{
	struct listen_sock *lopt = tp->accept_queue.listen_opt;
	struct request_sock *req, **prev;

	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
	     (req = *prev) != NULL;
	     prev = &req->dl_next) {
		const struct inet_request_sock *ireq = inet_rsk(req);

		if (ireq->rmt_port == rport &&
		    ireq->rmt_addr == raddr &&
		    ireq->loc_addr == laddr &&
		    TCP_INET_FAMILY(req->rsk_ops->family)) {
			BUG_TRAP(!req->sk);
			*prevp = prev;
			break;
		}
	}

	return req;
}

static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct listen_sock *lopt = tp->accept_queue.listen_opt;
	u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);

	reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
	tcp_synq_added(sk);
}


/*
 * This routine does path mtu discovery as defined in RFC1191.
 */
static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
				     u32 mtu)
{
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * sent out by Linux are always < 576 bytes, so they should go
	 * through unfragmented).
	 */
	if (sk->sk_state == TCP_LISTEN)
		return;

	/* We don't check in the dst entry if pmtu discovery is forbidden
	 * on this route. We just assume that no packet-too-big replies
	 * are sent back when pmtu discovery is not active.
	 * There is a small race when the user changes this flag in the
	 * route, but I think that's acceptable.
	 */
	if ((dst = __sk_dst_check(sk, 0)) == NULL)
		return;

	dst->ops->update_pmtu(dst, mtu);

	/* Something is about to go wrong... Remember the soft error
	 * for the case that this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		sk->sk_err_soft = EMSGSIZE;

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    tp->pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
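
/* "Fast path" above means the lost segment is retransmitted immediately
 * after the MSS is shrunk via tcp_sync_mss(), rather than waiting for the
 * retransmit timer: the ICMP_FRAG_NEEDED message already proves the old
 * packet was dropped.
 */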

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 */

void tcp_v4_err(struct sk_buff *skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct tcp_sock *tp;
	struct inet_sock *inet;
	int type = skb->h.icmph->type;
	int code = skb->h.icmph->code;
	struct sock *sk;
	__u32 seq;
	int err;

	if (skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}

	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
			   th->source, tcp_v4_iif(skb));
	if (!sk) {
		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		tcp_tw_put((struct tcp_tw_bucket *)sk);
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		if (sock_owned_by_user(sk))
			goto out;

		req = tcp_v4_search_req(tp, &prev, th->dest,
					iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		BUG_TRAP(!req->sk);

		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		tcp_synq_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen normally.
			       It can, for example, if SYNs crossed.
			     */
		if (!sock_owned_by_user(sk)) {
			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows considering only PROTO_UNREACH and
	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it
	 * is obsoleted by pmtu discovery).
	 *
	 * Note that on the modern internet, where routing is unreliable
	 * and broken firewalls sit in every dark corner sending random
	 * errors ordered by their masters, even these two messages
	 * finally lose their original sense (even Linux sends invalid
	 * PORT_UNREACHs).
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */
	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb)
{
	struct inet_sock *inet = inet_sk(sk);

	if (skb->ip_summed == CHECKSUM_HW) {
		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
		skb->csum = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
					 csum_partial((char *)th,
						      th->doff << 2,
						      skb->csum));
	}
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 * for reset?
 * Answer: if a packet caused RST, it is not for a socket
 * existing in our system; if it is matched to a socket,
 * it is just a duplicate segment or a bug in the other side's TCP.
 * So we build the reply based only on parameters that arrived
 * with the segment.
 * Exception: precedence violation. We do not implement it in any case.
 */
static void tcp_v4_send_reset(struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct tcphdr rth;
	struct ip_reply_arg arg;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rth, 0, sizeof(struct tcphdr));
	rth.dest   = th->source;
	rth.source = th->dest;
	rth.doff   = sizeof(struct tcphdr) / 4;
	rth.rst    = 1;

	if (th->ack) {
		rth.seq = th->ack_seq;
	} else {
		rth.ack = 1;
		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				    skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof arg);
	arg.iov[0].iov_base = (unsigned char *)&rth;
	arg.iov[0].iov_len  = sizeof rth;
	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
}
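
/* Construction note: when the offending segment carries an ACK, the RST
 * simply echoes th->ack_seq as its own sequence number; otherwise it ACKs
 * exactly the sequence space the segment consumed (SYN and FIN each count
 * as one byte), as RFC 793 requires for reset generation.
 */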

/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
   outside socket context, is ugly, certainly. What can I do?
 */

static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 ts)
{
	struct tcphdr *th = skb->h.th;
	struct {
		struct tcphdr th;
		u32 tsopt[3];
	} rep;
	struct ip_reply_arg arg;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof arg);

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (ts) {
		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				     (TCPOPT_TIMESTAMP << 8) |
				     TCPOLEN_TIMESTAMP);
		rep.tsopt[1] = htonl(tcp_time_stamp);
		rep.tsopt[2] = htonl(ts);
		arg.iov[0].iov_len = sizeof(rep);
	}

	/* Swap the send and the receive. */
	rep.th.dest    = th->source;
	rep.th.source  = th->dest;
	rep.th.doff    = arg.iov[0].iov_len / 4;
	rep.th.seq     = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack     = 1;
	rep.th.window  = htons(win);

	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
				      skb->nh.iph->saddr, /*XXX*/
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;

	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);

	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;

	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);

	tcp_tw_put(tw);
}

static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
{
	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
			req->ts_recent);
}

static struct dst_entry* tcp_v4_route_req(struct sock *sk,
					  struct request_sock *req)
{
	struct rtable *rt;
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct ip_options *opt = inet_rsk(req)->opt;
	struct flowi fl = { .oif = sk->sk_bound_dev_if,
			    .nl_u = { .ip4_u =
				      { .daddr = ((opt && opt->srr) ?
						  opt->faddr :
						  ireq->rmt_addr),
					.saddr = ireq->loc_addr,
					.tos = RT_CONN_FLAGS(sk) } },
			    .proto = IPPROTO_TCP,
			    .uli_u = { .ports =
				       { .sport = inet_sk(sk)->sport,
					 .dport = ireq->rmt_port } } };

	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
		ip_rt_put(rt);
		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
		return NULL;
	}
	return &rt->u.dst;
}

/*
 * Send a SYN-ACK after having received an ACK.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
			      struct dst_entry *dst)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int err = -1;
	struct sk_buff *skb;

	/* First, grab a route. */
	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto out;

	skb = tcp_make_synack(sk, dst, req);

	if (skb) {
		struct tcphdr *th = skb->h.th;

		th->check = tcp_v4_check(th, skb->len,
					 ireq->loc_addr,
					 ireq->rmt_addr,
					 csum_partial((char *)th, skb->len,
						      skb->csum));

		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
					    ireq->rmt_addr,
					    ireq->opt);
		if (err == NET_XMIT_CN)
			err = 0;
	}

out:
	dst_release(dst);
	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	if (inet_rsk(req)->opt)
		kfree(inet_rsk(req)->opt);
}

static inline void syn_flood_warning(struct sk_buff *skb)
{
	static unsigned long warntime;

	if (time_after(jiffies, (warntime + HZ * 60))) {
		warntime = jiffies;
		printk(KERN_INFO
		       "possible SYN flooding on port %d. Sending cookies.\n",
		       ntohs(skb->h.th->dest));
	}
}

/*
 * Save and compile IPv4 options into the request_sock if needed.
 */
static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
						     struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);
	struct ip_options *dopt = NULL;

	if (opt && opt->optlen) {
		int opt_size = optlength(opt);
		dopt = kmalloc(opt_size, GFP_ATOMIC);
		if (dopt) {
			if (ip_options_echo(dopt, skb)) {
				kfree(dopt);
				dopt = NULL;
			}
		}
	}
	return dopt;
}

struct request_sock_ops tcp_request_sock_ops = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_v4_send_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct inet_request_sock *ireq;
	struct tcp_options_received tmp_opt;
	struct request_sock *req;
	__u32 saddr = skb->nh.iph->saddr;
	__u32 daddr = skb->nh.iph->daddr;
	__u32 isn = TCP_SKB_CB(skb)->when;
	struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
	int want_cookie = 0;
#else
#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
#endif

	/* Never answer SYNs sent to broadcast or multicast addresses */
	if (((struct rtable *)skb->dst)->rt_flags &
	    (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if (tcp_synq_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
		if (sysctl_tcp_syncookies) {
			want_cookie = 1;
		} else
#endif
		goto drop;
	}

	/* Accept backlog is full. If we have already queued enough
	 * warm entries in the syn queue, drop the request. It is better
	 * than clogging the syn queue with openreqs with exponentially
	 * increasing timeout.
	 */
	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
		goto drop;

	req = reqsk_alloc(&tcp_request_sock_ops);
	if (!req)
		goto drop;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = 536;
	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;

	tcp_parse_options(skb, &tmp_opt, 0);

	if (want_cookie) {
		tcp_clear_options(&tmp_opt);
		tmp_opt.saw_tstamp = 0;
	}

	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
		/* Some OSes (unknown ones, but I see them on web servers
		 * which contain information interesting only for windows
		 * users) do not send their stamp in the SYN. It is an easy
		 * case; we simply do not advertise TS support.
		 */
		tmp_opt.saw_tstamp = 0;
		tmp_opt.tstamp_ok  = 0;
	}
	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;

	tcp_openreq_init(req, &tmp_opt, skb);

	ireq = inet_rsk(req);
	ireq->loc_addr = daddr;
	ireq->rmt_addr = saddr;
	ireq->opt = tcp_v4_save_options(sk, skb);
	if (!want_cookie)
		TCP_ECN_create_request(req, skb->h.th);

	if (want_cookie) {
#ifdef CONFIG_SYN_COOKIES
		syn_flood_warning(skb);
#endif
		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
	} else if (!isn) {
		struct inet_peer *peer = NULL;

		/* VJ's idea. We save the last timestamp seen
		 * from the destination in the peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting a new connection request.
		 *
		 * If "isn" is not zero, this request hit an alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tmp_opt.saw_tstamp &&
		    sysctl_tcp_tw_recycle &&
		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
		    peer->v4daddr == saddr) {
			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
			    (s32)(peer->tcp_ts - req->ts_recent) >
							TCP_PAWS_WINDOW) {
				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
				dst_release(dst);
				goto drop_and_free;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 (!peer || !peer->tcp_ts_stamp) &&
			 (!dst || !dst_metric(dst, RTAX_RTT))) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive. It means that we continue to
			 * communicate with destinations already
			 * remembered by the moment of the synflood.
			 */
			LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
					      "request from %u.%u."
					      "%u.%u/%u\n",
					      NIPQUAD(saddr),
					      ntohs(skb->h.th->source)));
			dst_release(dst);
			goto drop_and_free;
		}

		isn = tcp_v4_init_sequence(sk, skb);
	}
	tcp_rsk(req)->snt_isn = isn;

	if (tcp_v4_send_synack(sk, req, dst))
		goto drop_and_free;

	if (want_cookie) {
		reqsk_free(req);
	} else {
		tcp_v4_synq_add(sk, req);
	}
	return 0;

drop_and_free:
	reqsk_free(req);
drop:
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	return 0;
}


/*
 * The three way handshake has completed - we got a valid synack -
 * now create the new socket.
 */
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst)
{
	struct inet_request_sock *ireq;
	struct inet_sock *newinet;
	struct tcp_sock *newtp;
	struct sock *newsk;

	if (sk_acceptq_is_full(sk))
		goto exit_overflow;

	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
		goto exit;

	newsk = tcp_create_openreq_child(sk, req, skb);
	if (!newsk)
		goto exit;

	sk_setup_caps(newsk, dst);

	newtp		      = tcp_sk(newsk);
	newinet		      = inet_sk(newsk);
	ireq		      = inet_rsk(req);
	newinet->daddr	      = ireq->rmt_addr;
	newinet->rcv_saddr    = ireq->loc_addr;
	newinet->saddr	      = ireq->loc_addr;
	newinet->opt	      = ireq->opt;
	ireq->opt	      = NULL;
	newinet->mc_index     = tcp_v4_iif(skb);
	newinet->mc_ttl	      = skb->nh.iph->ttl;
	newtp->ext_header_len = 0;
	if (newinet->opt)
		newtp->ext_header_len = newinet->opt->optlen;
	newinet->id = newtp->write_seq ^ jiffies;

	tcp_sync_mss(newsk, dst_mtu(dst));
	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
	tcp_initialize_rcv_mss(newsk);

	__tcp_v4_hash(newsk, 0);
	__tcp_inherit_port(sk, newsk);

	return newsk;

exit_overflow:
	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
exit:
	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
	dst_release(dst);
	return NULL;
}

static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	struct tcphdr *th = skb->h.th;
	struct iphdr *iph = skb->nh.iph;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sock *nsk;
	struct request_sock **prev;
	/* Find possible connection requests. */
	struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
						     iph->saddr, iph->daddr);
	if (req)
		return tcp_check_req(sk, skb, req, prev);

	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
					  th->source,
					  skb->nh.iph->daddr,
					  ntohs(th->dest),
					  tcp_v4_iif(skb));

	if (nsk) {
		if (nsk->sk_state != TCP_TIME_WAIT) {
			bh_lock_sock(nsk);
			return nsk;
		}
		tcp_tw_put((struct tcp_tw_bucket *)nsk);
		return NULL;
	}

#ifdef CONFIG_SYN_COOKIES
	if (!th->rst && !th->syn && th->ack)
		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
#endif
	return sk;
}

static int tcp_v4_checksum_init(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_HW) {
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				  skb->nh.iph->daddr, skb->csum))
			return 0;

		LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
		skb->ip_summed = CHECKSUM_NONE;
	}
	if (skb->len <= 76) {
		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
				 skb->nh.iph->daddr,
				 skb_checksum(skb, 0, skb->len, 0)))
			return -1;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	} else {
		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
					  skb->nh.iph->saddr,
					  skb->nh.iph->daddr, 0);
	}
	return 0;
}
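
/* Checksum strategy, summarized: trust hardware checksums that verify,
 * fully verify short (<= 76 byte) packets in software right away, and for
 * longer packets only precompute the pseudo-header sum here, leaving the
 * full fold to be completed later when the data is copied or checked.
 */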
1610
1611
1612/* The socket must have it's spinlock held when we get
1613 * here.
1614 *
1615 * We have a potential double-lock case here, so even when
1616 * doing backlog processing we use the BH locking scheme.
1617 * This is because we cannot sleep with the original spinlock
1618 * held.
1619 */
1620int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1621{
1622 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1623 TCP_CHECK_TIMER(sk);
1624 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1625 goto reset;
1626 TCP_CHECK_TIMER(sk);
1627 return 0;
1628 }
1629
1630 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1631 goto csum_err;
1632
1633 if (sk->sk_state == TCP_LISTEN) {
1634 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1635 if (!nsk)
1636 goto discard;
1637
1638 if (nsk != sk) {
1639 if (tcp_child_process(sk, nsk, skb))
1640 goto reset;
1641 return 0;
1642 }
1643 }
1644
1645 TCP_CHECK_TIMER(sk);
1646 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1647 goto reset;
1648 TCP_CHECK_TIMER(sk);
1649 return 0;
1650
1651reset:
1652 tcp_v4_send_reset(skb);
1653discard:
1654 kfree_skb(skb);
1655 /* Be careful here. If this function gets more complicated and
1656 * gcc suffers from register pressure on the x86, sk (in %ebx)
1657 * might be destroyed here. This current version compiles correctly,
1658 * but you have been warned.
1659 */
1660 return 0;
1661
1662csum_err:
1663 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1664 goto discard;
1665}
1666
1667/*
1668 * From tcp_input.c
1669 */
1670
1671int tcp_v4_rcv(struct sk_buff *skb)
1672{
1673 struct tcphdr *th;
1674 struct sock *sk;
1675 int ret;
1676
1677 if (skb->pkt_type != PACKET_HOST)
1678 goto discard_it;
1679
1680 /* Count it even if it's bad */
1681 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1682
1683 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1684 goto discard_it;
1685
1686 th = skb->h.th;
1687
1688 if (th->doff < sizeof(struct tcphdr) / 4)
1689 goto bad_packet;
1690 if (!pskb_may_pull(skb, th->doff * 4))
1691 goto discard_it;
1692
1693 /* An explanation is required here, I think.
1694 * Packet length and doff are validated by header prediction,
1695 * provided case of th->doff==0 is elimineted.
1696 * So, we defer the checks. */
1697 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1698 tcp_v4_checksum_init(skb) < 0))
1699 goto bad_packet;
1700
1701 th = skb->h.th;
1702 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1703 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1704 skb->len - th->doff * 4);
1705 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1706 TCP_SKB_CB(skb)->when = 0;
1707 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1708 TCP_SKB_CB(skb)->sacked = 0;

	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
			     skb->nh.iph->daddr, ntohs(th->dest),
			     tcp_v4_iif(skb));

	if (!sk)
		goto no_tcp_socket;

process:
	if (sk->sk_state == TCP_TIME_WAIT)
		goto do_time_wait;

	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
		goto discard_and_relse;

	if (sk_filter(sk, skb, 0))
		goto discard_and_relse;

	skb->dev = NULL;

	bh_lock_sock(sk);
	ret = 0;
	if (!sock_owned_by_user(sk)) {
		if (!tcp_prequeue(sk, skb))
			ret = tcp_v4_do_rcv(sk, skb);
	} else
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);

	sock_put(sk);

	return ret;

no_tcp_socket:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto discard_it;

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
bad_packet:
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
	} else {
		tcp_v4_send_reset(skb);
	}

discard_it:
	/* Discard frame. */
	kfree_skb(skb);
	return 0;

discard_and_relse:
	sock_put(sk);
	goto discard_it;

do_time_wait:
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}

	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
		TCP_INC_STATS_BH(TCP_MIB_INERRS);
		tcp_tw_put((struct tcp_tw_bucket *) sk);
		goto discard_it;
	}
	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
					   skb, th, skb->len)) {
	case TCP_TW_SYN: {
		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
							  ntohs(th->dest),
							  tcp_v4_iif(skb));
		if (sk2) {
			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
			tcp_tw_put((struct tcp_tw_bucket *)sk);
			sk = sk2;
			goto process;
		}
		/* Fall through to ACK */
	}
	case TCP_TW_ACK:
		tcp_v4_timewait_ack(sk, skb);
		break;
	case TCP_TW_RST:
		goto no_tcp_socket;
	case TCP_TW_SUCCESS:;
	}
	goto discard_it;
}

static void v4_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{
	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
	struct inet_sock *inet = inet_sk(sk);

	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = inet->daddr;
	sin->sin_port = inet->dport;
}
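/*
 * Illustrative user-space counterpart (an assumption for exposition,
 * not part of this file): the peer address/port pair written into the
 * sockaddr_in above is essentially what a process sees from
 * getpeername(2) on a connected TCP socket.
 */
#if 0
#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>

static void show_peer(int fd)
{
	struct sockaddr_in sin;
	socklen_t len = sizeof(sin);
	char buf[INET_ADDRSTRLEN];

	if (getpeername(fd, (struct sockaddr *)&sin, &len) == 0)
		printf("peer %s:%u\n",
		       inet_ntop(AF_INET, &sin.sin_addr, buf, sizeof(buf)),
		       ntohs(sin.sin_port));
}
#endif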

/* VJ's idea. Save last timestamp seen from this destination
 * and hold it at least for normal timewait interval to use for duplicate
 * segment detection in subsequent connections, before they enter
 * synchronized state.
 */

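/* Example of the wrapped comparison used below (illustrative, not in
 * the original source): timestamps are 32-bit and wrap, so "older or
 * equal" is tested with serial-number arithmetic.  For instance,
 * (s32)(0x00000001 - 0xffffffff) == 2 > 0, so a timestamp just past
 * the wrap still counts as newer than one taken just before it.
 */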
int tcp_v4_remember_stamp(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
	struct inet_peer *peer = NULL;
	int release_it = 0;

	if (!rt || rt->rt_dst != inet->daddr) {
		peer = inet_getpeer(inet->daddr, 1);
		release_it = 1;
	} else {
		if (!rt->peer)
			rt_bind_peer(rt, 1);
		peer = rt->peer;
	}

	if (peer) {
		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
			peer->tcp_ts = tp->rx_opt.ts_recent;
		}
		if (release_it)
			inet_putpeer(peer);
		return 1;
	}

	return 0;
}

int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
{
	struct inet_peer *peer = NULL;

	peer = inet_getpeer(tw->tw_daddr, 1);

	if (peer) {
		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
			peer->tcp_ts	   = tw->tw_ts_recent;
		}
		inet_putpeer(peer);
		return 1;
	}

	return 0;
}

struct tcp_func ipv4_specific = {
	.queue_xmit	= ip_queue_xmit,
	.send_check	= tcp_v4_send_check,
	.rebuild_header	= inet_sk_rebuild_header,
	.conn_request	= tcp_v4_conn_request,
	.syn_recv_sock	= tcp_v4_syn_recv_sock,
	.remember_stamp	= tcp_v4_remember_stamp,
	.net_header_len	= sizeof(struct iphdr),
	.setsockopt	= ip_setsockopt,
	.getsockopt	= ip_getsockopt,
	.addr2sockaddr	= v4_addr2sockaddr,
	.sockaddr_len	= sizeof(struct sockaddr_in),
};
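/*
 * Dispatch sketch (illustrative): the TCP core calls through this ops
 * table rather than into IPv4 code directly, e.g.
 *
 *	tp->af_specific->queue_xmit(skb, 0);
 *
 * which is what lets tcp_ipv6.c reuse the same core by installing its
 * own struct tcp_func.
 */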

/* NOTE: A lot of things are set to zero explicitly by the call to
 * sk_alloc(), so they need not be done here.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	skb_queue_head_init(&tp->out_of_order_queue);
	tcp_init_xmit_timers(sk);
	tcp_prequeue_init(tp);

	tp->rto  = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = 536;

	tp->reordering = sysctl_tcp_reordering;
	tp->ca_ops = &tcp_init_congestion_ops;

	sk->sk_state = TCP_CLOSE;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	tp->af_specific = &ipv4_specific;

	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	atomic_inc(&tcp_sockets_allocated);

	return 0;
}
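/* Effect of the defaults above (illustrative summary): a fresh socket
 * starts in slow start with a two-segment window and an effectively
 * infinite ssthresh; mss_cache == 536 is only the conservative
 * pre-negotiation default and is recomputed once the peer's MSS
 * option is known.
 */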

int tcp_v4_destroy_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_clear_xmit_timers(sk);

	tcp_cleanup_congestion_control(tp);

	/* Clean up the write buffer. */
	sk_stream_writequeue_purge(sk);

	/* Clean up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);

	/* Clean up the prequeue; it really should be empty by now. */
	__skb_queue_purge(&tp->ucopy.prequeue);

	/* Clean up a referenced TCP bind bucket. */
	if (tp->bind_hash)
		tcp_put_port(sk);

	/* If a sendmsg cached page exists, toss it. */
	if (sk->sk_sndmsg_page) {
		__free_page(sk->sk_sndmsg_page);
		sk->sk_sndmsg_page = NULL;
	}

	atomic_dec(&tcp_sockets_allocated);

	return 0;
}

EXPORT_SYMBOL(tcp_v4_destroy_sock);

#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */

static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
{
	return hlist_empty(head) ? NULL :
		list_entry(head->first, struct tcp_tw_bucket, tw_node);
}

static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
{
	return tw->tw_node.next ?
		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
}
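/*
 * Iteration overview (illustrative summary, not in the original
 * source): /proc/net/tcp is produced by walking, in order, the
 * listening hash (with a detour through each listener's SYN table,
 * tracked as TCP_SEQ_STATE_OPENREQ), then the established hash, then
 * the TIME_WAIT half of the same table.  st->state records where the
 * walk currently is so that tcp_seq_next() can resume after every
 * emitted row.
 */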

static void *listening_get_next(struct seq_file *seq, void *cur)
{
	struct tcp_sock *tp;
	struct hlist_node *node;
	struct sock *sk = cur;
	struct tcp_iter_state *st = seq->private;

	if (!sk) {
		st->bucket = 0;
		sk = sk_head(&tcp_listening_hash[0]);
		goto get_sk;
	}

	++st->num;

	if (st->state == TCP_SEQ_STATE_OPENREQ) {
		struct request_sock *req = cur;

		tp = tcp_sk(st->syn_wait_sk);
		req = req->dl_next;
		while (1) {
			while (req) {
				if (req->rsk_ops->family == st->family) {
					cur = req;
					goto out;
				}
				req = req->dl_next;
			}
			if (++st->sbucket >= TCP_SYNQ_HSIZE)
				break;
get_req:
			req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
		}
		sk	  = sk_next(st->syn_wait_sk);
		st->state = TCP_SEQ_STATE_LISTENING;
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
	} else {
		tp = tcp_sk(sk);
		read_lock_bh(&tp->accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&tp->accept_queue))
			goto start_req;
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		sk = sk_next(sk);
	}
get_sk:
	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family) {
			cur = sk;
			goto out;
		}
		tp = tcp_sk(sk);
		read_lock_bh(&tp->accept_queue.syn_wait_lock);
		if (reqsk_queue_len(&tp->accept_queue)) {
start_req:
			st->uid		= sock_i_uid(sk);
			st->syn_wait_sk	= sk;
			st->state	= TCP_SEQ_STATE_OPENREQ;
			st->sbucket	= 0;
			goto get_req;
		}
		read_unlock_bh(&tp->accept_queue.syn_wait_lock);
	}
	if (++st->bucket < INET_LHTABLE_SIZE) {
		sk = sk_head(&tcp_listening_hash[st->bucket]);
		goto get_sk;
	}
	cur = NULL;
out:
	return cur;
}

static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
{
	void *rc = listening_get_next(seq, NULL);

	while (rc && *pos) {
		rc = listening_get_next(seq, rc);
		--*pos;
	}
	return rc;
}

static void *established_get_first(struct seq_file *seq)
{
	struct tcp_iter_state *st = seq->private;
	void *rc = NULL;

	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
		struct sock *sk;
		struct hlist_node *node;
		struct tcp_tw_bucket *tw;

		/* We can reschedule _before_ having picked the target: */
		cond_resched_softirq();

		read_lock(&tcp_ehash[st->bucket].lock);
		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
			if (sk->sk_family != st->family)
				continue;
			rc = sk;
			goto out;
		}
		st->state = TCP_SEQ_STATE_TIME_WAIT;
		tw_for_each(tw, node,
			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
			if (tw->tw_family != st->family)
				continue;
			rc = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;
	}
out:
	return rc;
}

static void *established_get_next(struct seq_file *seq, void *cur)
{
	struct sock *sk = cur;
	struct tcp_tw_bucket *tw;
	struct hlist_node *node;
	struct tcp_iter_state *st = seq->private;

	++st->num;

	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
		tw = cur;
		tw = tw_next(tw);
get_tw:
		while (tw && tw->tw_family != st->family)
			tw = tw_next(tw);
		if (tw) {
			cur = tw;
			goto out;
		}
		read_unlock(&tcp_ehash[st->bucket].lock);
		st->state = TCP_SEQ_STATE_ESTABLISHED;

		/* We can reschedule between buckets: */
		cond_resched_softirq();

		if (++st->bucket < tcp_ehash_size) {
			read_lock(&tcp_ehash[st->bucket].lock);
			sk = sk_head(&tcp_ehash[st->bucket].chain);
		} else {
			cur = NULL;
			goto out;
		}
	} else
		sk = sk_next(sk);

	sk_for_each_from(sk, node) {
		if (sk->sk_family == st->family)
			goto found;
	}

	st->state = TCP_SEQ_STATE_TIME_WAIT;
	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
	goto get_tw;
found:
	cur = sk;
out:
	return cur;
}

static void *established_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc = established_get_first(seq);

	while (rc && pos) {
		rc = established_get_next(seq, rc);
		--pos;
	}
	return rc;
}

static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
{
	void *rc;
	struct tcp_iter_state *st = seq->private;

	tcp_listen_lock();
	st->state = TCP_SEQ_STATE_LISTENING;
	rc	  = listening_get_idx(seq, &pos);

	if (!rc) {
		tcp_listen_unlock();
		local_bh_disable();
		st->state = TCP_SEQ_STATE_ESTABLISHED;
		rc	  = established_get_idx(seq, pos);
	}

	return rc;
}

static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct tcp_iter_state *st = seq->private;
	st->state = TCP_SEQ_STATE_LISTENING;
	st->num = 0;
	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	void *rc = NULL;
	struct tcp_iter_state *st;

	if (v == SEQ_START_TOKEN) {
		rc = tcp_get_idx(seq, 0);
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
	case TCP_SEQ_STATE_LISTENING:
		rc = listening_get_next(seq, v);
		if (!rc) {
			tcp_listen_unlock();
			local_bh_disable();
			st->state = TCP_SEQ_STATE_ESTABLISHED;
			rc	  = established_get_first(seq);
		}
		break;
	case TCP_SEQ_STATE_ESTABLISHED:
	case TCP_SEQ_STATE_TIME_WAIT:
		rc = established_get_next(seq, v);
		break;
	}
out:
	++*pos;
	return rc;
}

static void tcp_seq_stop(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_OPENREQ:
		if (v) {
			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
			read_unlock_bh(&tp->accept_queue.syn_wait_lock);
		}
		/* Fall through: the listen lock is held in both states. */
	case TCP_SEQ_STATE_LISTENING:
		if (v != SEQ_START_TOKEN)
			tcp_listen_unlock();
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
	case TCP_SEQ_STATE_ESTABLISHED:
		if (v)
			read_unlock(&tcp_ehash[st->bucket].lock);
		local_bh_enable();
		break;
	}
}

static int tcp_seq_open(struct inode *inode, struct file *file)
{
	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
	struct seq_file *seq;
	struct tcp_iter_state *s;
	int rc;

	if (unlikely(afinfo == NULL))
		return -EINVAL;

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;
	memset(s, 0, sizeof(*s));
	s->family	 = afinfo->family;
	s->seq_ops.start = tcp_seq_start;
	s->seq_ops.next	 = tcp_seq_next;
	s->seq_ops.show	 = afinfo->seq_show;
	s->seq_ops.stop	 = tcp_seq_stop;

	rc = seq_open(file, &s->seq_ops);
	if (rc)
		goto out_kfree;
	seq	     = file->private_data;
	seq->private = s;
out:
	return rc;
out_kfree:
	kfree(s);
	goto out;
}

int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
{
	int rc = 0;
	struct proc_dir_entry *p;

	if (!afinfo)
		return -EINVAL;
	afinfo->seq_fops->owner		= afinfo->owner;
	afinfo->seq_fops->open		= tcp_seq_open;
	afinfo->seq_fops->read		= seq_read;
	afinfo->seq_fops->llseek	= seq_lseek;
	afinfo->seq_fops->release	= seq_release_private;

	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
	if (p)
		p->data = afinfo;
	else
		rc = -ENOMEM;
	return rc;
}

void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
{
	if (!afinfo)
		return;
	proc_net_remove(afinfo->name);
	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
}

static void get_openreq4(struct sock *sk, struct request_sock *req,
			 char *tmpbuf, int i, int uid)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	int ttd = req->expires - jiffies;

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
		i,
		ireq->loc_addr,
		ntohs(inet_sk(sk)->sport),
		ireq->rmt_addr,
		ntohs(ireq->rmt_port),
		TCP_SYN_RECV,
		0, 0, /* could print option size, but that is af dependent. */
		1,    /* timers active (only the expire timer) */
		jiffies_to_clock_t(ttd),
		req->retrans,
		uid,
		0,  /* non standard timer */
		0,  /* open_requests have no inode */
		atomic_read(&sk->sk_refcnt),
		req);
}

static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
{
	int timer_active;
	unsigned long timer_expires;
	struct tcp_sock *tp = tcp_sk(sp);
	struct inet_sock *inet = inet_sk(sp);
	unsigned int dest = inet->daddr;
	unsigned int src = inet->rcv_saddr;
	__u16 destp = ntohs(inet->dport);
	__u16 srcp = ntohs(inet->sport);

	if (tp->pending == TCP_TIME_RETRANS) {
		timer_active	= 1;
		timer_expires	= tp->timeout;
	} else if (tp->pending == TCP_TIME_PROBE0) {
		timer_active	= 4;
		timer_expires	= tp->timeout;
	} else if (timer_pending(&sp->sk_timer)) {
		timer_active	= 2;
		timer_expires	= sp->sk_timer.expires;
	} else {
		timer_active	= 0;
		timer_expires	= jiffies;
	}

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
		i, src, srcp, dest, destp, sp->sk_state,
		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
		timer_active,
		jiffies_to_clock_t(timer_expires - jiffies),
		tp->retransmits,
		sock_i_uid(sp),
		tp->probes_out,
		sock_i_ino(sp),
		atomic_read(&sp->sk_refcnt), sp,
		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
		tp->snd_cwnd,
		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
}
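/* Sample row (illustrative; byte order shown is for a little-endian
 * machine, an assumption for this example):
 *
 *   0: 0100007F:0016 00000000:0000 0A ...
 *
 * "0100007F:0016" is 127.0.0.1:22 -- the address is the raw 32-bit
 * value printed with %08X, so it reads byte-reversed on little-endian
 * hardware, while the port has already been converted with ntohs();
 * "0A" is sk_state, here TCP_LISTEN (10).
 */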

static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
{
	unsigned int dest, src;
	__u16 destp, srcp;
	int ttd = tw->tw_ttd - jiffies;

	if (ttd < 0)
		ttd = 0;

	dest  = tw->tw_daddr;
	src   = tw->tw_rcv_saddr;
	destp = ntohs(tw->tw_dport);
	srcp  = ntohs(tw->tw_sport);

	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
		atomic_read(&tw->tw_refcnt), tw);
}

#define TMPSZ 150

static int tcp4_seq_show(struct seq_file *seq, void *v)
{
	struct tcp_iter_state *st;
	char tmpbuf[TMPSZ + 1];

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "%-*s\n", TMPSZ - 1,
			   "  sl  local_address rem_address   st tx_queue "
			   "rx_queue tr tm->when retrnsmt   uid  timeout "
			   "inode");
		goto out;
	}
	st = seq->private;

	switch (st->state) {
	case TCP_SEQ_STATE_LISTENING:
	case TCP_SEQ_STATE_ESTABLISHED:
		get_tcp4_sock(v, tmpbuf, st->num);
		break;
	case TCP_SEQ_STATE_OPENREQ:
		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
		break;
	case TCP_SEQ_STATE_TIME_WAIT:
		get_timewait4_sock(v, tmpbuf, st->num);
		break;
	}
	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
out:
	return 0;
}
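/*
 * Minimal user-space reader (an illustrative sketch, not kernel code):
 * parses the header and the fixed-width rows emitted above.  Field
 * names and widths come straight from the sprintf formats in this
 * file; everything else here is assumption for exposition.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;
	fgets(line, sizeof(line), f);	/* skip the header row */
	while (fgets(line, sizeof(line), f)) {
		unsigned int src, dst, srcp, dstp, state;

		if (sscanf(line, "%*d: %8X:%4X %8X:%4X %2X",
			   &src, &srcp, &dst, &dstp, &state) == 5)
			printf("state %02X %08X:%04X -> %08X:%04X\n",
			       state, src, srcp, dst, dstp);
	}
	fclose(f);
	return 0;
}
#endif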

static struct file_operations tcp4_seq_fops;
static struct tcp_seq_afinfo tcp4_seq_afinfo = {
	.owner		= THIS_MODULE,
	.name		= "tcp",
	.family		= AF_INET,
	.seq_show	= tcp4_seq_show,
	.seq_fops	= &tcp4_seq_fops,
};

int __init tcp4_proc_init(void)
{
	return tcp_proc_register(&tcp4_seq_afinfo);
}

void tcp4_proc_exit(void)
{
	tcp_proc_unregister(&tcp4_seq_afinfo);
}
#endif /* CONFIG_PROC_FS */

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= tcp_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.sendmsg		= tcp_sendmsg,
	.recvmsg		= tcp_recvmsg,
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= tcp_v4_hash,
	.unhash			= tcp_unhash,
	.get_port		= tcp_v4_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.rsk_prot		= &tcp_request_sock_ops,
};
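/*
 * Wiring sketch (illustrative): af_inet.c maps SOCK_STREAM/IPPROTO_TCP
 * to this proto at boot, so socket(AF_INET, SOCK_STREAM, 0) yields a
 * sock with sk->sk_prot == &tcp_prot, and connect(2), for example,
 * resolves to tcp_v4_connect() through the table above.
 */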


void __init tcp_v4_init(struct net_proto_family *ops)
{
	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
	if (err < 0)
		panic("Failed to create the TCP control socket.\n");
	tcp_socket->sk->sk_allocation = GFP_ATOMIC;
	inet_sk(tcp_socket->sk)->uc_ttl = -1;

	/* Unhash it so that IP input processing does not even see it;
	 * we do not wish this socket to see incoming packets.
	 */
	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
}

EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_bind_hash);
EXPORT_SYMBOL(inet_bind_bucket_create);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_inherit_port);
EXPORT_SYMBOL(tcp_listen_wlock);
EXPORT_SYMBOL(tcp_port_rover);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_put_port);
EXPORT_SYMBOL(tcp_unhash);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_local_port_range);
EXPORT_SYMBOL(sysctl_tcp_low_latency);
EXPORT_SYMBOL(sysctl_tcp_tw_reuse);