/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly,
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154",
  "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154",
  "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154",
  "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
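/*
 * Roughly: with a ~256-byte struct sk_buff, SK_WMEM_MAX works out to
 * (256 + 256) * 256 = 128 KB, i.e. enough charge for about 256 queued
 * packets per socket whatever sizeof(struct sk_buff) happens to be.
 */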

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
				current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
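	/*
	 * Convert to jiffies, rounding any fractional tick up so that a
	 * small positive timeout can never be truncated to zero.
	 */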
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk, int flag)
{
	if (sock_flag(sk, flag)) {
		sock_reset_flag(sk, flag);
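		/*
		 * Drop the global timestamp reference only when no
		 * timestamping flag is left set on this socket.
		 */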
		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
			net_disable_timestamp();
		}
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
	   the number of warnings when compiling with -W --ANK
	 */
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

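	/*
	 * Queue under the lock and latch sk_drops into skb->dropcount at
	 * the same time, so a SO_RXQ_OVFL reader sees a drop counter that
	 * is consistent with this packet's position in the queue.
	 */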
	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else
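		/*
		 * The owner is running in process context; park the skb on
		 * the backlog, to be replayed when release_sock() runs.
		 */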
		sk_add_backlog(sk, skb);
	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk->sk_dst_cache;

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	if (devname[0] == '\0') {
		index = 0;
	} else {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
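		/*
		 * dev may be stale once the RCU section ends; it is only
		 * tested for NULL below, and the ifindex was sampled while
		 * the lookup was still valid.
		 */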
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this. BSD doesn't, and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
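		/*
		 * Double the request to allow for the struct sk_buff and
		 * bookkeeping overhead; see the SO_RCVBUF comment below.
		 */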
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this. BSD doesn't, and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
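			/*
			 * On 32-bit, l_linger * HZ could overflow the
			 * unsigned long sk_lingertime, so clamp oversized
			 * values to the maximum timeout first.
			 */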
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       SOCK_TIMESTAMPING_RX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
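		/* GCC's "?:" keeps val when non-zero; 0 selects the default of one byte */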
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		if (valbool)
			sock_set_flag(sk, SOCK_RXQ_OVFL);
		else
			sock_reset_flag(sk, SOCK_RXQ_OVFL);
		break;
	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERCRED:
		if (len > sizeof(sk->sk_peercred))
			len = sizeof(sk->sk_peercred);
		if (copy_to_user(optval, &sk->sk_peercred, len))
			return -EFAULT;
		goto lenout;

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
		     sizeof(osk->sk_node) + sizeof(osk->sk_refcnt) +
		     sizeof(osk->sk_tx_queue_mapping));
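	/*
	 * sk_node, sk_refcnt and sk_tx_queue_mapping are deliberately not
	 * copied: the memcpy below starts at sk_copy_start, which the
	 * BUILD_BUG_ON above pins directly behind those three fields.
	 */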
	memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			/*
			 * caches using SLAB_DESTROY_BY_RCU should leave
			 * sk_node.next unmodified. Special care is taken
			 * when initializing object to zero.
			 */
			if (offsetof(struct sock, sk_node.next) != 0)
				memset(sk, 0, offsetof(struct sock, sk_node.next));
			memset(&sk->sk_node.pprev, 0,
			       prot->obj_size - offsetof(struct sock,
							 sk_node.pprev));
		}
	}
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
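		/*
		 * sk_wmem_alloc is biased to one so the socket stays alive
		 * while packets are in flight; sk_free() drops that bias and
		 * sock_wfree() frees the socket when it reaches zero.
		 */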
		atomic_set(&sk->sk_wmem_alloc, 1);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		rcu_assign_pointer(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can tell whether
	 * some packets are still in some tx queue.
	 * If not zero, sock_wfree() will call __sk_free(sk) later
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * The last sock_put should drop a reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking a reference to the stopping
 * namespace is not an option.
 * Take a reference to the socket to remove it from the hash _alive_ and
 * after that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child was always incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_sleep	 = NULL;

		if (newsk->sk_prot->sockets_allocated)
			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (sk_can_gso(sk)) {
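		/*
		 * A non-zero dst->header_len means extra protocol headers
		 * (e.g. a tunnel) that GSO cannot rebuild per segment, so
		 * turn GSO off; otherwise advertise what software GSO needs.
		 */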
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
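	/*
	 * Scale the default buffer limits with system memory: with 4 KB
	 * pages, <= 16 MB of RAM gets small buffers and >= 512 MB raises
	 * the rmem/wmem ceilings.
	 */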
Jan Beulich44813742009-09-21 17:03:05 -07001234 if (totalram_pages <= 4096) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001235 sysctl_wmem_max = 32767;
1236 sysctl_rmem_max = 32767;
1237 sysctl_wmem_default = 32767;
1238 sysctl_rmem_default = 32767;
Jan Beulich44813742009-09-21 17:03:05 -07001239 } else if (totalram_pages >= 131072) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001240 sysctl_wmem_max = 131071;
1241 sysctl_rmem_max = 131071;
1242 }
1243}
1244
1245/*
1246 * Simple resource managers for sockets.
1247 */
1248
1249
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001250/*
1251 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001252 */
1253void sock_wfree(struct sk_buff *skb)
1254{
1255 struct sock *sk = skb->sk;
Eric Dumazetd99927f2009-09-24 10:49:24 +00001256 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001257
Eric Dumazetd99927f2009-09-24 10:49:24 +00001258 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1259 /*
1260 * Keep a reference on sk_wmem_alloc, this will be released
1261 * after sk_write_space() call
1262 */
1263 atomic_sub(len - 1, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001264 sk->sk_write_space(sk);
Eric Dumazetd99927f2009-09-24 10:49:24 +00001265 len = 1;
1266 }
Eric Dumazet2b85a342009-06-11 02:55:43 -07001267 /*
Eric Dumazetd99927f2009-09-24 10:49:24 +00001268 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1269 * could not do because of in-flight packets
Eric Dumazet2b85a342009-06-11 02:55:43 -07001270 */
Eric Dumazetd99927f2009-09-24 10:49:24 +00001271 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001272 __sk_free(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001273}
Eric Dumazet2a915252009-05-27 11:30:05 +00001274EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001275
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001276/*
1277 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001278 */
1279void sock_rfree(struct sk_buff *skb)
1280{
1281 struct sock *sk = skb->sk;
1282
1283 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001284 sk_mem_uncharge(skb->sk, skb->truesize);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001285}
Eric Dumazet2a915252009-05-27 11:30:05 +00001286EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001287
1288
1289int sock_i_uid(struct sock *sk)
1290{
1291 int uid;
1292
1293 read_lock(&sk->sk_callback_lock);
1294 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1295 read_unlock(&sk->sk_callback_lock);
1296 return uid;
1297}
Eric Dumazet2a915252009-05-27 11:30:05 +00001298EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001299
1300unsigned long sock_i_ino(struct sock *sk)
1301{
1302 unsigned long ino;
1303
1304 read_lock(&sk->sk_callback_lock);
1305 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1306 read_unlock(&sk->sk_callback_lock);
1307 return ino;
1308}
Eric Dumazet2a915252009-05-27 11:30:05 +00001309EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001310
1311/*
1312 * Allocate a skb from the socket's send buffer.
1313 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001314struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001315 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001316{
1317 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001318 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001319 if (skb) {
1320 skb_set_owner_w(skb, sk);
1321 return skb;
1322 }
1323 }
1324 return NULL;
1325}
Eric Dumazet2a915252009-05-27 11:30:05 +00001326EXPORT_SYMBOL(sock_wmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001327
1328/*
1329 * Allocate a skb from the socket's receive buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001330 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001331struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001332 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001333{
1334 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1335 struct sk_buff *skb = alloc_skb(size, priority);
1336 if (skb) {
1337 skb_set_owner_r(skb, sk);
1338 return skb;
1339 }
1340 }
1341 return NULL;
1342}
1343
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001344/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001345 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001346 */
Al Virodd0fc662005-10-07 07:46:04 +01001347void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001348{
1349 if ((unsigned)size <= sysctl_optmem_max &&
1350 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1351 void *mem;
1352 /* Do the add first, to avoid a race in case
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001353 * kmalloc sleeps.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001354 */
1355 atomic_add(size, &sk->sk_omem_alloc);
1356 mem = kmalloc(size, priority);
1357 if (mem)
1358 return mem;
1359 atomic_sub(size, &sk->sk_omem_alloc);
1360 }
1361 return NULL;
1362}
Eric Dumazet2a915252009-05-27 11:30:05 +00001363EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001364
1365/*
1366 * Free an option memory block.
1367 */
1368void sock_kfree_s(struct sock *sk, void *mem, int size)
1369{
1370 kfree(mem);
1371 atomic_sub(size, &sk->sk_omem_alloc);
1372}
Eric Dumazet2a915252009-05-27 11:30:05 +00001373EXPORT_SYMBOL(sock_kfree_s);
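
/*
 * Usage sketch (illustrative only, not from this file): option memory is
 * accounted against sk_omem_alloc, so a caller must pass the same size to
 * sock_kfree_s() that it passed to sock_kmalloc(), e.g. in a hypothetical
 * setsockopt handler:
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */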
Linus Torvalds1da177e2005-04-16 15:20:36 -07001374
1375/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1376 I think, these locks should be removed for datagram sockets.
1377 */
Eric Dumazet2a915252009-05-27 11:30:05 +00001378static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001379{
1380 DEFINE_WAIT(wait);
1381
1382 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1383 for (;;) {
1384 if (!timeo)
1385 break;
1386 if (signal_pending(current))
1387 break;
1388 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1389 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1390 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1391 break;
1392 if (sk->sk_shutdown & SEND_SHUTDOWN)
1393 break;
1394 if (sk->sk_err)
1395 break;
1396 timeo = schedule_timeout(timeo);
1397 }
1398 finish_wait(sk->sk_sleep, &wait);
1399 return timeo;
1400}
1401
1402
1403/*
1404 * Generic send/receive buffer handlers
1405 */
1406
Herbert Xu4cc7f682009-02-04 16:55:54 -08001407struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1408 unsigned long data_len, int noblock,
1409 int *errcode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001410{
1411 struct sk_buff *skb;
Al Viro7d877f32005-10-21 03:20:43 -04001412 gfp_t gfp_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001413 long timeo;
1414 int err;
1415
1416 gfp_mask = sk->sk_allocation;
1417 if (gfp_mask & __GFP_WAIT)
1418 gfp_mask |= __GFP_REPEAT;
1419
1420 timeo = sock_sndtimeo(sk, noblock);
1421 while (1) {
1422 err = sock_error(sk);
1423 if (err != 0)
1424 goto failure;
1425
1426 err = -EPIPE;
1427 if (sk->sk_shutdown & SEND_SHUTDOWN)
1428 goto failure;
1429
1430 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Larry Woodmandb38c1792006-11-03 16:05:45 -08001431 skb = alloc_skb(header_len, gfp_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001432 if (skb) {
1433 int npages;
1434 int i;
1435
1436 /* No pages, we're done... */
1437 if (!data_len)
1438 break;
1439
1440 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1441 skb->truesize += data_len;
1442 skb_shinfo(skb)->nr_frags = npages;
1443 for (i = 0; i < npages; i++) {
1444 struct page *page;
1445 skb_frag_t *frag;
1446
1447 page = alloc_pages(sk->sk_allocation, 0);
1448 if (!page) {
1449 err = -ENOBUFS;
1450 skb_shinfo(skb)->nr_frags = i;
1451 kfree_skb(skb);
1452 goto failure;
1453 }
1454
1455 frag = &skb_shinfo(skb)->frags[i];
1456 frag->page = page;
1457 frag->page_offset = 0;
1458 frag->size = (data_len >= PAGE_SIZE ?
1459 PAGE_SIZE :
1460 data_len);
1461 data_len -= PAGE_SIZE;
1462 }
1463
1464 /* Full success... */
1465 break;
1466 }
1467 err = -ENOBUFS;
1468 goto failure;
1469 }
1470 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1471 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1472 err = -EAGAIN;
1473 if (!timeo)
1474 goto failure;
1475 if (signal_pending(current))
1476 goto interrupted;
1477 timeo = sock_wait_for_wmem(sk, timeo);
1478 }
1479
1480 skb_set_owner_w(skb, sk);
1481 return skb;
1482
1483interrupted:
1484 err = sock_intr_errno(timeo);
1485failure:
1486 *errcode = err;
1487 return NULL;
1488}
Herbert Xu4cc7f682009-02-04 16:55:54 -08001489EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001490
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001491struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001492 int noblock, int *errcode)
1493{
1494 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1495}
Eric Dumazet2a915252009-05-27 11:30:05 +00001496EXPORT_SYMBOL(sock_alloc_send_skb);
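
/*
 * Usage sketch (illustrative only, not from this file): a datagram
 * sendmsg() implementation typically lets this helper do the sndbuf
 * accounting, blocking and signal handling in one place; "reserve" here
 * is a hypothetical headroom value:
 *
 *	skb = sock_alloc_send_skb(sk, len + reserve,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 */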
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497
1498static void __lock_sock(struct sock *sk)
1499{
1500 DEFINE_WAIT(wait);
1501
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001502 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001503 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1504 TASK_UNINTERRUPTIBLE);
1505 spin_unlock_bh(&sk->sk_lock.slock);
1506 schedule();
1507 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001508 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001509 break;
1510 }
1511 finish_wait(&sk->sk_lock.wq, &wait);
1512}
1513
1514static void __release_sock(struct sock *sk)
1515{
1516 struct sk_buff *skb = sk->sk_backlog.head;
1517
1518 do {
1519 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1520 bh_unlock_sock(sk);
1521
1522 do {
1523 struct sk_buff *next = skb->next;
1524
1525 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07001526 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527
1528 /*
1529 * We are in process context here with softirqs
1530 * disabled, so use cond_resched_softirq() to preempt.
1531 * This is safe to do because we've taken the backlog
1532 * queue private.
1533 */
1534 cond_resched_softirq();
1535
1536 skb = next;
1537 } while (skb != NULL);
1538
1539 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001540 } while ((skb = sk->sk_backlog.head) != NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541}
1542
1543/**
1544 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001545 * @sk: sock to wait on
1546 * @timeo: for how long
Linus Torvalds1da177e2005-04-16 15:20:36 -07001547 *
1548 * Socket state, including sk->sk_err, is changed only under the
1549 * socket lock, hence we may omit checks after joining the wait queue.
1550 * We check the receive queue before schedule() only as an optimization;
1551 * it is very likely that release_sock() added new data.
1552 */
1553int sk_wait_data(struct sock *sk, long *timeo)
1554{
1555 int rc;
1556 DEFINE_WAIT(wait);
1557
1558 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1559 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1560 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1561 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1562 finish_wait(sk->sk_sleep, &wait);
1563 return rc;
1564}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001565EXPORT_SYMBOL(sk_wait_data);
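
/*
 * Usage sketch (illustrative only, not from this file): a recvmsg()
 * implementation calls this with the socket lock held, retrying until
 * data arrives or the timeout/signal path fires:
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */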
1566
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001567/**
1568 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1569 * @sk: socket
1570 * @size: memory size to allocate
1571 * @kind: allocation type
1572 *
1573 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1574 * rmem allocation. This function assumes that protocols which have
1575 * memory_pressure use sk_wmem_queued as write buffer accounting.
1576 */
1577int __sk_mem_schedule(struct sock *sk, int size, int kind)
1578{
1579 struct proto *prot = sk->sk_prot;
1580 int amt = sk_mem_pages(size);
1581 int allocated;
1582
1583 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1584 allocated = atomic_add_return(amt, prot->memory_allocated);
1585
1586 /* Under limit. */
1587 if (allocated <= prot->sysctl_mem[0]) {
1588 if (prot->memory_pressure && *prot->memory_pressure)
1589 *prot->memory_pressure = 0;
1590 return 1;
1591 }
1592
1593 /* Under pressure. */
1594 if (allocated > prot->sysctl_mem[1])
1595 if (prot->enter_memory_pressure)
Pavel Emelyanov5c52ba12008-07-16 20:28:10 -07001596 prot->enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001597
1598 /* Over hard limit. */
1599 if (allocated > prot->sysctl_mem[2])
1600 goto suppress_allocation;
1601
1602 /* guarantee minimum buffer size under pressure */
1603 if (kind == SK_MEM_RECV) {
1604 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1605 return 1;
1606 } else { /* SK_MEM_SEND */
1607 if (sk->sk_type == SOCK_STREAM) {
1608 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1609 return 1;
1610 } else if (atomic_read(&sk->sk_wmem_alloc) <
1611 prot->sysctl_wmem[0])
1612 return 1;
1613 }
1614
1615 if (prot->memory_pressure) {
Eric Dumazet17483762008-11-25 21:16:35 -08001616 int alloc;
1617
1618 if (!*prot->memory_pressure)
1619 return 1;
1620 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1621 if (prot->sysctl_mem[2] > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001622 sk_mem_pages(sk->sk_wmem_queued +
1623 atomic_read(&sk->sk_rmem_alloc) +
1624 sk->sk_forward_alloc))
1625 return 1;
1626 }
1627
1628suppress_allocation:
1629
1630 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1631 sk_stream_moderate_sndbuf(sk);
1632
1633 /* Fail only if socket is _under_ its sndbuf.
1634 * In this case we cannot block, so we have to fail.
1635 */
1636 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1637 return 1;
1638 }
1639
1640 /* Alas. Undo changes. */
1641 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1642 atomic_sub(amt, prot->memory_allocated);
1643 return 0;
1644}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001645EXPORT_SYMBOL(__sk_mem_schedule);
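
/*
 * Usage sketch (illustrative only, not from this file): protocols
 * normally reach this through the sk_wmem_schedule()/sk_rmem_schedule()
 * inlines, which consume sk_forward_alloc first and fall back to this
 * slow path, e.g. on the receive side:
 *
 *	if (!sk_rmem_schedule(sk, skb->truesize))
 *		goto drop;
 *	skb_set_owner_r(skb, sk);
 */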
1646
1647/**
1648 * __sk_mem_reclaim - reclaim memory_allocated
1649 * @sk: socket
1650 */
1651void __sk_mem_reclaim(struct sock *sk)
1652{
1653 struct proto *prot = sk->sk_prot;
1654
Eric Dumazet680a5a52007-12-31 15:00:50 -08001655 atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001656 prot->memory_allocated);
1657 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1658
1659 if (prot->memory_pressure && *prot->memory_pressure &&
1660 (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1661 *prot->memory_pressure = 0;
1662}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001663EXPORT_SYMBOL(__sk_mem_reclaim);
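
/*
 * Usage sketch (illustrative only, not from this file): callers normally
 * use the sk_mem_reclaim() inline, which takes this path only once at
 * least one full quantum of forward-allocated memory can be returned:
 *
 *	if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
 *		__sk_mem_reclaim(sk);
 */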
1664
1665
Linus Torvalds1da177e2005-04-16 15:20:36 -07001666/*
1667 * Set of default routines for initialising struct proto_ops when
1668 * the protocol does not support a particular function. In certain
1669 * cases where it makes no sense for a protocol to have a "do nothing"
1670 * function, some default processing is provided.
1671 */
1672
1673int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1674{
1675 return -EOPNOTSUPP;
1676}
Eric Dumazet2a915252009-05-27 11:30:05 +00001677EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001678
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001679int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001680 int len, int flags)
1681{
1682 return -EOPNOTSUPP;
1683}
Eric Dumazet2a915252009-05-27 11:30:05 +00001684EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001685
1686int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1687{
1688 return -EOPNOTSUPP;
1689}
Eric Dumazet2a915252009-05-27 11:30:05 +00001690EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001691
1692int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1693{
1694 return -EOPNOTSUPP;
1695}
Eric Dumazet2a915252009-05-27 11:30:05 +00001696EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001698int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001699 int *len, int peer)
1700{
1701 return -EOPNOTSUPP;
1702}
Eric Dumazet2a915252009-05-27 11:30:05 +00001703EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704
Eric Dumazet2a915252009-05-27 11:30:05 +00001705unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001706{
1707 return 0;
1708}
Eric Dumazet2a915252009-05-27 11:30:05 +00001709EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001710
1711int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1712{
1713 return -EOPNOTSUPP;
1714}
Eric Dumazet2a915252009-05-27 11:30:05 +00001715EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001716
1717int sock_no_listen(struct socket *sock, int backlog)
1718{
1719 return -EOPNOTSUPP;
1720}
Eric Dumazet2a915252009-05-27 11:30:05 +00001721EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001722
1723int sock_no_shutdown(struct socket *sock, int how)
1724{
1725 return -EOPNOTSUPP;
1726}
Eric Dumazet2a915252009-05-27 11:30:05 +00001727EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001728
1729int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07001730 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731{
1732 return -EOPNOTSUPP;
1733}
Eric Dumazet2a915252009-05-27 11:30:05 +00001734EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001735
1736int sock_no_getsockopt(struct socket *sock, int level, int optname,
1737 char __user *optval, int __user *optlen)
1738{
1739 return -EOPNOTSUPP;
1740}
Eric Dumazet2a915252009-05-27 11:30:05 +00001741EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742
1743int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1744 size_t len)
1745{
1746 return -EOPNOTSUPP;
1747}
Eric Dumazet2a915252009-05-27 11:30:05 +00001748EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001749
1750int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1751 size_t len, int flags)
1752{
1753 return -EOPNOTSUPP;
1754}
Eric Dumazet2a915252009-05-27 11:30:05 +00001755EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756
1757int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1758{
1759 /* Mirror missing mmap method error code */
1760 return -ENODEV;
1761}
Eric Dumazet2a915252009-05-27 11:30:05 +00001762EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001763
1764ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1765{
1766 ssize_t res;
1767 struct msghdr msg = {.msg_flags = flags};
1768 struct kvec iov;
1769 char *kaddr = kmap(page);
1770 iov.iov_base = kaddr + offset;
1771 iov.iov_len = size;
1772 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1773 kunmap(page);
1774 return res;
1775}
Eric Dumazet2a915252009-05-27 11:30:05 +00001776EXPORT_SYMBOL(sock_no_sendpage);
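
/*
 * Usage sketch (illustrative only, not from this file): a protocol that
 * does not implement an operation wires the matching stub into its
 * proto_ops; the family and ops names below are hypothetical:
 *
 *	static const struct proto_ops hypothetical_dgram_ops = {
 *		.family		= PF_HYPOTHETICAL,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};
 */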
Linus Torvalds1da177e2005-04-16 15:20:36 -07001777
1778/*
1779 * Default Socket Callbacks
1780 */
1781
1782static void sock_def_wakeup(struct sock *sk)
1783{
1784 read_lock(&sk->sk_callback_lock);
Jiri Olsaa57de0b2009-07-08 12:09:13 +00001785 if (sk_has_sleeper(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786 wake_up_interruptible_all(sk->sk_sleep);
1787 read_unlock(&sk->sk_callback_lock);
1788}
1789
1790static void sock_def_error_report(struct sock *sk)
1791{
1792 read_lock(&sk->sk_callback_lock);
Jiri Olsaa57de0b2009-07-08 12:09:13 +00001793 if (sk_has_sleeper(sk))
Davide Libenzi37e55402009-03-31 15:24:21 -07001794 wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001795 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001796 read_unlock(&sk->sk_callback_lock);
1797}
1798
1799static void sock_def_readable(struct sock *sk, int len)
1800{
1801 read_lock(&sk->sk_callback_lock);
Jiri Olsaa57de0b2009-07-08 12:09:13 +00001802 if (sk_has_sleeper(sk))
Davide Libenzi37e55402009-03-31 15:24:21 -07001803 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1804 POLLRDNORM | POLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001805 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001806 read_unlock(&sk->sk_callback_lock);
1807}
1808
1809static void sock_def_write_space(struct sock *sk)
1810{
1811 read_lock(&sk->sk_callback_lock);
1812
1813 /* Do not wake up a writer until he can make "significant"
1814 * progress. --DaveM
1815 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001816 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Jiri Olsaa57de0b2009-07-08 12:09:13 +00001817 if (sk_has_sleeper(sk))
Davide Libenzi37e55402009-03-31 15:24:21 -07001818 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1819 POLLWRNORM | POLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001820
1821 /* Should agree with poll, otherwise some programs break */
1822 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001823 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824 }
1825
1826 read_unlock(&sk->sk_callback_lock);
1827}
1828
1829static void sock_def_destruct(struct sock *sk)
1830{
Jesper Juhla51482b2005-11-08 09:41:34 -08001831 kfree(sk->sk_protinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001832}
1833
1834void sk_send_sigurg(struct sock *sk)
1835{
1836 if (sk->sk_socket && sk->sk_socket->file)
1837 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001838 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001839}
Eric Dumazet2a915252009-05-27 11:30:05 +00001840EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001841
1842void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1843 unsigned long expires)
1844{
1845 if (!mod_timer(timer, expires))
1846 sock_hold(sk);
1847}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001848EXPORT_SYMBOL(sk_reset_timer);
1849
1850void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1851{
1852 if (timer_pending(timer) && del_timer(timer))
1853 __sock_put(sk);
1854}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001855EXPORT_SYMBOL(sk_stop_timer);
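
/*
 * Usage sketch (illustrative only, not from this file): these helpers
 * pin a socket reference while a timer is pending, so arming and
 * disarming stay balanced:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */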
1856
1857void sock_init_data(struct socket *sock, struct sock *sk)
1858{
1859 skb_queue_head_init(&sk->sk_receive_queue);
1860 skb_queue_head_init(&sk->sk_write_queue);
1861 skb_queue_head_init(&sk->sk_error_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07001862#ifdef CONFIG_NET_DMA
1863 skb_queue_head_init(&sk->sk_async_wait_queue);
1864#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001865
1866 sk->sk_send_head = NULL;
1867
1868 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001869
Linus Torvalds1da177e2005-04-16 15:20:36 -07001870 sk->sk_allocation = GFP_KERNEL;
1871 sk->sk_rcvbuf = sysctl_rmem_default;
1872 sk->sk_sndbuf = sysctl_wmem_default;
1873 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07001874 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001875
1876 sock_set_flag(sk, SOCK_ZAPPED);
1877
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001878 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001879 sk->sk_type = sock->type;
1880 sk->sk_sleep = &sock->wait;
1881 sock->sk = sk;
1882 } else
1883 sk->sk_sleep = NULL;
1884
1885 rwlock_init(&sk->sk_dst_lock);
1886 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef02007-07-19 01:49:00 -07001887 lockdep_set_class_and_name(&sk->sk_callback_lock,
1888 af_callback_keys + sk->sk_family,
1889 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001890
1891 sk->sk_state_change = sock_def_wakeup;
1892 sk->sk_data_ready = sock_def_readable;
1893 sk->sk_write_space = sock_def_write_space;
1894 sk->sk_error_report = sock_def_error_report;
1895 sk->sk_destruct = sock_def_destruct;
1896
1897 sk->sk_sndmsg_page = NULL;
1898 sk->sk_sndmsg_off = 0;
1899
1900 sk->sk_peercred.pid = 0;
1901 sk->sk_peercred.uid = -1;
1902 sk->sk_peercred.gid = -1;
1903 sk->sk_write_pending = 0;
1904 sk->sk_rcvlowat = 1;
1905 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
1906 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
1907
Eric Dumazetf37f0af2008-04-13 21:39:26 -07001908 sk->sk_stamp = ktime_set(-1L, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001909
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001910 /*
1911 * Before updating sk_refcnt, we must commit prior changes to memory
1912 * (Documentation/RCU/rculist_nulls.txt for details)
1913 */
1914 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001915 atomic_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08001916 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001917}
Eric Dumazet2a915252009-05-27 11:30:05 +00001918EXPORT_SYMBOL(sock_init_data);
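
/*
 * Usage sketch (illustrative only, not from this file): a protocol's
 * create routine typically allocates the sock and lets sock_init_data()
 * install the defaults before overriding what it needs; the names below
 * are hypothetical:
 *
 *	sk = sk_alloc(net, PF_HYPOTHETICAL, GFP_KERNEL, &hypothetical_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_destruct = hypothetical_destruct;
 */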
Linus Torvalds1da177e2005-04-16 15:20:36 -07001919
Harvey Harrisonb5606c22008-02-13 15:03:16 -08001920void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001921{
1922 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001923 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02001924 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001925 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02001926 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001927 spin_unlock(&sk->sk_lock.slock);
1928 /*
1929 * The sk_lock has mutex_lock() semantics here:
1930 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001931 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001932 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08001934EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001935
Harvey Harrisonb5606c22008-02-13 15:03:16 -08001936void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001937{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001938 /*
1939 * The sk_lock has mutex_unlock() semantics:
1940 */
1941 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1942
1943 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001944 if (sk->sk_backlog.tail)
1945 __release_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02001946 sk->sk_lock.owned = 0;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001947 if (waitqueue_active(&sk->sk_lock.wq))
1948 wake_up(&sk->sk_lock.wq);
1949 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001950}
1951EXPORT_SYMBOL(release_sock);
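
/*
 * Usage sketch (illustrative only, not from this file): process context
 * brackets socket state changes with this pair; packets arriving from
 * softirq context meanwhile queue on the backlog, and release_sock()
 * replays them via __release_sock():
 *
 *	lock_sock(sk);
 *	... modify socket state ...
 *	release_sock(sk);
 */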
1952
1953int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001954{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001955 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001956 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00001957 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001958 tv = ktime_to_timeval(sk->sk_stamp);
1959 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001960 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001961 if (tv.tv_sec == 0) {
1962 sk->sk_stamp = ktime_get_real();
1963 tv = ktime_to_timeval(sk->sk_stamp);
1964 }
1965 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001966}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001967EXPORT_SYMBOL(sock_get_timestamp);
1968
Eric Dumazetae40eb12007-03-18 17:33:16 -07001969int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1970{
1971 struct timespec ts;
1972 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00001973 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07001974 ts = ktime_to_timespec(sk->sk_stamp);
1975 if (ts.tv_sec == -1)
1976 return -ENOENT;
1977 if (ts.tv_sec == 0) {
1978 sk->sk_stamp = ktime_get_real();
1979 ts = ktime_to_timespec(sk->sk_stamp);
1980 }
1981 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1982}
1983EXPORT_SYMBOL(sock_get_timestampns);
1984
Patrick Ohly20d49472009-02-12 05:03:38 +00001985void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001986{
Patrick Ohly20d49472009-02-12 05:03:38 +00001987 if (!sock_flag(sk, flag)) {
1988 sock_set_flag(sk, flag);
1989 /*
1990 * We just set one of the two flags that require net
1991 * time stamping, but time stamping might already have
1992 * been on because of the other one.
1993 */
1994 if (!sock_flag(sk,
1995 flag == SOCK_TIMESTAMP ?
1996 SOCK_TIMESTAMPING_RX_SOFTWARE :
1997 SOCK_TIMESTAMP))
1998 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001999 }
2000}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002001
2002/*
2003 * Get a socket option on a socket.
2004 *
2005 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2006 * asynchronous errors should be reported by getsockopt. We assume
2007 * this means if you specify SO_ERROR (otherwise what's the point of it).
2008 */
2009int sock_common_getsockopt(struct socket *sock, int level, int optname,
2010 char __user *optval, int __user *optlen)
2011{
2012 struct sock *sk = sock->sk;
2013
2014 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2015}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002016EXPORT_SYMBOL(sock_common_getsockopt);
2017
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002018#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002019int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2020 char __user *optval, int __user *optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002021{
2022 struct sock *sk = sock->sk;
2023
Johannes Berg1e51f952007-03-06 13:44:06 -08002024 if (sk->sk_prot->compat_getsockopt != NULL)
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002025 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2026 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002027 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2028}
2029EXPORT_SYMBOL(compat_sock_common_getsockopt);
2030#endif
2031
Linus Torvalds1da177e2005-04-16 15:20:36 -07002032int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2033 struct msghdr *msg, size_t size, int flags)
2034{
2035 struct sock *sk = sock->sk;
2036 int addr_len = 0;
2037 int err;
2038
2039 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2040 flags & ~MSG_DONTWAIT, &addr_len);
2041 if (err >= 0)
2042 msg->msg_namelen = addr_len;
2043 return err;
2044}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002045EXPORT_SYMBOL(sock_common_recvmsg);
2046
2047/*
2048 * Set socket options on an inet socket.
2049 */
2050int sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002051 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002052{
2053 struct sock *sk = sock->sk;
2054
2055 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2056}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002057EXPORT_SYMBOL(sock_common_setsockopt);
2058
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002059#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002060int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002061 char __user *optval, unsigned int optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002062{
2063 struct sock *sk = sock->sk;
2064
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002065 if (sk->sk_prot->compat_setsockopt != NULL)
2066 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2067 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002068 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2069}
2070EXPORT_SYMBOL(compat_sock_common_setsockopt);
2071#endif
2072
Linus Torvalds1da177e2005-04-16 15:20:36 -07002073void sk_common_release(struct sock *sk)
2074{
2075 if (sk->sk_prot->destroy)
2076 sk->sk_prot->destroy(sk);
2077
2078 /*
2079 * Observation: when sk_common_release() is called, processes have
2080 * no access to the socket, but the network stack still does.
2081 * Step one, detach it from networking:
2082 *
2083 * A. Remove from hash tables.
2084 */
2085
2086 sk->sk_prot->unhash(sk);
2087
2088 /*
2089 * At this point the socket cannot receive new packets, but some may
2090 * still be in flight, because some CPU is running the receiver and did
2091 * a hash table lookup before we unhashed the socket. They will reach
2092 * the receive queue and be purged by the socket destructor.
2093 *
2094 * Also we still have packets pending on the receive queue, and probably
2095 * our own packets waiting in device queues. sock_destroy will drain the
2096 * receive queue, but transmitted packets will delay socket destruction
2097 * until the last reference is released.
2098 */
2099
2100 sock_orphan(sk);
2101
2102 xfrm_sk_free_policy(sk);
2103
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07002104 sk_refcnt_debug_release(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002105 sock_put(sk);
2106}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002107EXPORT_SYMBOL(sk_common_release);
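
/*
 * Usage sketch (illustrative only, not from this file): simple datagram
 * protocols can point their proto ->close at a thin wrapper around this
 * helper; the wrapper name is hypothetical:
 *
 *	static void hypothetical_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */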
2108
2109static DEFINE_RWLOCK(proto_list_lock);
2110static LIST_HEAD(proto_list);
2111
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002112#ifdef CONFIG_PROC_FS
2113#define PROTO_INUSE_NR 64 /* should be enough for the first time */
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002114struct prot_inuse {
2115 int val[PROTO_INUSE_NR];
2116};
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002117
2118static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002119
2120#ifdef CONFIG_NET_NS
2121void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2122{
2123 int cpu = smp_processor_id();
2124 per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2125}
2126EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2127
2128int sock_prot_inuse_get(struct net *net, struct proto *prot)
2129{
2130 int cpu, idx = prot->inuse_idx;
2131 int res = 0;
2132
2133 for_each_possible_cpu(cpu)
2134 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2135
2136 return res >= 0 ? res : 0;
2137}
2138EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2139
2140static int sock_inuse_init_net(struct net *net)
2141{
2142 net->core.inuse = alloc_percpu(struct prot_inuse);
2143 return net->core.inuse ? 0 : -ENOMEM;
2144}
2145
2146static void sock_inuse_exit_net(struct net *net)
2147{
2148 free_percpu(net->core.inuse);
2149}
2150
2151static struct pernet_operations net_inuse_ops = {
2152 .init = sock_inuse_init_net,
2153 .exit = sock_inuse_exit_net,
2154};
2155
2156static __init int net_inuse_init(void)
2157{
2158 if (register_pernet_subsys(&net_inuse_ops))
2159 panic("Cannot initialize net inuse counters");
2160
2161 return 0;
2162}
2163
2164core_initcall(net_inuse_init);
2165#else
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002166static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2167
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002168void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002169{
2170 __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2171}
2172EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2173
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002174int sock_prot_inuse_get(struct net *net, struct proto *prot)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002175{
2176 int cpu, idx = prot->inuse_idx;
2177 int res = 0;
2178
2179 for_each_possible_cpu(cpu)
2180 res += per_cpu(prot_inuse, cpu).val[idx];
2181
2182 return res >= 0 ? res : 0;
2183}
2184EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002185#endif
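
/*
 * Usage sketch (illustrative only, not from this file): protocols bump
 * the per-cpu counter from their hash callback:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *
 * and decrement it again on unhash:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 *
 * /proc readers then sum the per-cpu values via sock_prot_inuse_get().
 */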
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002186
2187static void assign_proto_idx(struct proto *prot)
2188{
2189 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2190
2191 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2192 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2193 return;
2194 }
2195
2196 set_bit(prot->inuse_idx, proto_inuse_idx);
2197}
2198
2199static void release_proto_idx(struct proto *prot)
2200{
2201 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2202 clear_bit(prot->inuse_idx, proto_inuse_idx);
2203}
2204#else
2205static inline void assign_proto_idx(struct proto *prot)
2206{
2207}
2208
2209static inline void release_proto_idx(struct proto *prot)
2210{
2211}
2212#endif
2213
Linus Torvalds1da177e2005-04-16 15:20:36 -07002214int proto_register(struct proto *prot, int alloc_slab)
2215{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002216 if (alloc_slab) {
2217 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
Eric Dumazet271b72c2008-10-29 02:11:14 -07002218 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2219 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002220
2221 if (prot->slab == NULL) {
2222 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2223 prot->name);
Pavel Emelyanov60e76632008-03-28 16:39:10 -07002224 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002226
2227 if (prot->rsk_prot != NULL) {
2228 static const char mask[] = "request_sock_%s";
2229
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002230 prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2231 if (prot->rsk_prot->slab_name == NULL)
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002232 goto out_free_sock_slab;
2233
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002234 sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2235 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002236 prot->rsk_prot->obj_size, 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09002237 SLAB_HWCACHE_ALIGN, NULL);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002238
2239 if (prot->rsk_prot->slab == NULL) {
2240 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2241 prot->name);
2242 goto out_free_request_sock_slab_name;
2243 }
2244 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002245
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002246 if (prot->twsk_prot != NULL) {
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002247 static const char mask[] = "tw_sock_%s";
2248
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002249 prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002250
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002251 if (prot->twsk_prot->twsk_slab_name == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002252 goto out_free_request_sock_slab;
2253
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002254 sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002255 prot->twsk_prot->twsk_slab =
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002256 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002257 prot->twsk_prot->twsk_obj_size,
Eric Dumazet3ab5aee2008-11-16 19:40:17 -08002258 0,
2259 SLAB_HWCACHE_ALIGN |
2260 prot->slab_flags,
Paul Mundt20c2df82007-07-20 10:11:58 +09002261 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002262 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002263 goto out_free_timewait_sock_slab_name;
2264 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002265 }
2266
Arnaldo Carvalho de Melo2a278052005-04-16 15:24:09 -07002267 write_lock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002268 list_add(&prot->node, &proto_list);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002269 assign_proto_idx(prot);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002270 write_unlock(&proto_list_lock);
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002271 return 0;
2272
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002273out_free_timewait_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002274 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002275out_free_request_sock_slab:
2276 if (prot->rsk_prot && prot->rsk_prot->slab) {
2277 kmem_cache_destroy(prot->rsk_prot->slab);
2278 prot->rsk_prot->slab = NULL;
2279 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002280out_free_request_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002281 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002282out_free_sock_slab:
2283 kmem_cache_destroy(prot->slab);
2284 prot->slab = NULL;
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002285out:
2286 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002287}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002288EXPORT_SYMBOL(proto_register);
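
/*
 * Usage sketch (illustrative only, not from this file): a protocol
 * module registers its struct proto at init time and unregisters it on
 * exit; the proto and sock structure names below are hypothetical:
 *
 *	static struct proto hypothetical_proto = {
 *		.name		= "HYPOTHETICAL",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct hypothetical_sock),
 *	};
 *
 *	err = proto_register(&hypothetical_proto, 1);
 *	...
 *	proto_unregister(&hypothetical_proto);
 */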
2289
2290void proto_unregister(struct proto *prot)
2291{
2292 write_lock(&proto_list_lock);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002293 release_proto_idx(prot);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07002294 list_del(&prot->node);
2295 write_unlock(&proto_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002296
2297 if (prot->slab != NULL) {
2298 kmem_cache_destroy(prot->slab);
2299 prot->slab = NULL;
2300 }
2301
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002302 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002303 kmem_cache_destroy(prot->rsk_prot->slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002304 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002305 prot->rsk_prot->slab = NULL;
2306 }
2307
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002308 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002309 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002310 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002311 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002312 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002314EXPORT_SYMBOL(proto_unregister);
2315
2316#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazet9a429c42008-01-01 21:58:02 -08002318 __acquires(proto_list_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002319{
2320 read_lock(&proto_list_lock);
Pavel Emelianov60f04382007-07-09 13:15:14 -07002321 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002322}
2323
2324static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2325{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002326 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327}
2328
2329static void proto_seq_stop(struct seq_file *seq, void *v)
Eric Dumazet9a429c42008-01-01 21:58:02 -08002330 __releases(proto_list_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002331{
2332 read_unlock(&proto_list_lock);
2333}
2334
2335static char proto_method_implemented(const void *method)
2336{
2337 return method == NULL ? 'n' : 'y';
2338}
2339
2340static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2341{
2342 seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s "
2343 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2344 proto->name,
2345 proto->obj_size,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002346 sock_prot_inuse_get(seq_file_net(seq), proto),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002347 proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2348 proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2349 proto->max_header,
2350 proto->slab == NULL ? "no" : "yes",
2351 module_name(proto->owner),
2352 proto_method_implemented(proto->close),
2353 proto_method_implemented(proto->connect),
2354 proto_method_implemented(proto->disconnect),
2355 proto_method_implemented(proto->accept),
2356 proto_method_implemented(proto->ioctl),
2357 proto_method_implemented(proto->init),
2358 proto_method_implemented(proto->destroy),
2359 proto_method_implemented(proto->shutdown),
2360 proto_method_implemented(proto->setsockopt),
2361 proto_method_implemented(proto->getsockopt),
2362 proto_method_implemented(proto->sendmsg),
2363 proto_method_implemented(proto->recvmsg),
2364 proto_method_implemented(proto->sendpage),
2365 proto_method_implemented(proto->bind),
2366 proto_method_implemented(proto->backlog_rcv),
2367 proto_method_implemented(proto->hash),
2368 proto_method_implemented(proto->unhash),
2369 proto_method_implemented(proto->get_port),
2370 proto_method_implemented(proto->enter_memory_pressure));
2371}
2372
2373static int proto_seq_show(struct seq_file *seq, void *v)
2374{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002375 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002376 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2377 "protocol",
2378 "size",
2379 "sockets",
2380 "memory",
2381 "press",
2382 "maxhdr",
2383 "slab",
2384 "module",
2385 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2386 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07002387 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002388 return 0;
2389}
2390
Stephen Hemmingerf6908082007-03-12 14:34:29 -07002391static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002392 .start = proto_seq_start,
2393 .next = proto_seq_next,
2394 .stop = proto_seq_stop,
2395 .show = proto_seq_show,
2396};
2397
2398static int proto_seq_open(struct inode *inode, struct file *file)
2399{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002400 return seq_open_net(inode, file, &proto_seq_ops,
2401 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002402}
2403
Arjan van de Ven9a321442007-02-12 00:55:35 -08002404static const struct file_operations proto_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002405 .owner = THIS_MODULE,
2406 .open = proto_seq_open,
2407 .read = seq_read,
2408 .llseek = seq_lseek,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002409 .release = seq_release_net,
2410};
2411
2412static __net_init int proto_init_net(struct net *net)
2413{
2414 if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2415 return -ENOMEM;
2416
2417 return 0;
2418}
2419
2420static __net_exit void proto_exit_net(struct net *net)
2421{
2422 proc_net_remove(net, "protocols");
2423}
2424
2425
2426static __net_initdata struct pernet_operations proto_net_ops = {
2427 .init = proto_init_net,
2428 .exit = proto_exit_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429};
2430
2431static int __init proto_init(void)
2432{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002433 return register_pernet_subsys(&proto_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002434}
2435
2436subsys_initcall(proto_init);
2437
2438#endif /* PROC_FS */