[NET]: Detect hardware rx checksum faults correctly
[linux-2.6.git] / net / core / datagram.c
1 /*
2  *      SUCS NET3:
3  *
4  *      Generic datagram handling routines. These are generic for all
5  *      protocols. Possibly a generic IP version on top of these would
6  *      make sense. Not tonight however 8-).
7  *      This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
8  *      NetROM layer all have identical poll code and mostly
9  *      identical recvmsg() code. So we share it here. The poll was
10  *      shared before but buried in udp.c so I moved it.
11  *
12  *      Authors:        Alan Cox <alan@redhat.com>. (datagram_poll() from old
13  *                                                   udp.c code)
14  *
15  *      Fixes:
16  *              Alan Cox        :       NULL return from skb_peek_copy()
17  *                                      understood
18  *              Alan Cox        :       Rewrote skb_read_datagram to avoid the
19  *                                      skb_peek_copy stuff.
20  *              Alan Cox        :       Added support for SOCK_SEQPACKET.
21  *                                      IPX can no longer use the SO_TYPE hack
22  *                                      but AX.25 now works right, and SPX is
23  *                                      feasible.
24  *              Alan Cox        :       Fixed write poll of non IP protocol
25  *                                      crash.
26  *              Florian  La Roche:      Changed for my new skbuff handling.
27  *              Darryl Miles    :       Fixed non-blocking SOCK_SEQPACKET.
28  *              Linus Torvalds  :       BSD semantic fixes.
29  *              Alan Cox        :       Datagram iovec handling
30  *              Darryl Miles    :       Fixed non-blocking SOCK_STREAM.
31  *              Alan Cox        :       POSIXisms
32  *              Pete Wyckoff    :       Unconnected accept() fix.
33  *
34  */
35
36 #include <linux/module.h>
37 #include <linux/types.h>
38 #include <linux/kernel.h>
39 #include <asm/uaccess.h>
40 #include <asm/system.h>
41 #include <linux/mm.h>
42 #include <linux/interrupt.h>
43 #include <linux/errno.h>
44 #include <linux/sched.h>
45 #include <linux/inet.h>
46 #include <linux/netdevice.h>
47 #include <linux/rtnetlink.h>
48 #include <linux/poll.h>
49 #include <linux/highmem.h>
50
51 #include <net/protocol.h>
52 #include <linux/skbuff.h>
53
54 #include <net/checksum.h>
55 #include <net/sock.h>
56 #include <net/tcp_states.h>
57
58 /*
59  *      Is a socket 'connection oriented' ?
60  */
61 static inline int connection_based(struct sock *sk)
62 {
63         return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
64 }
65
66 /*
67  * Wait for a packet..
68  */
69 static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
70 {
71         int error;
72         DEFINE_WAIT(wait);
73
74         prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
75
76         /* Socket errors? */
77         error = sock_error(sk);
78         if (error)
79                 goto out_err;
80
81         if (!skb_queue_empty(&sk->sk_receive_queue))
82                 goto out;
83
84         /* Socket shut down? */
85         if (sk->sk_shutdown & RCV_SHUTDOWN)
86                 goto out_noerr;
87
88         /* Sequenced packets can come disconnected.
89          * If so we report the problem
90          */
91         error = -ENOTCONN;
92         if (connection_based(sk) &&
93             !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
94                 goto out_err;
95
96         /* handle signals */
97         if (signal_pending(current))
98                 goto interrupted;
99
100         error = 0;
101         *timeo_p = schedule_timeout(*timeo_p);
102 out:
103         finish_wait(sk->sk_sleep, &wait);
104         return error;
105 interrupted:
106         error = sock_intr_errno(*timeo_p);
107 out_err:
108         *err = error;
109         goto out;
110 out_noerr:
111         *err = 0;
112         error = 1;
113         goto out;
114 }
115
116 /**
117  *      skb_recv_datagram - Receive a datagram skbuff
118  *      @sk: socket
119  *      @flags: MSG_ flags
120  *      @noblock: blocking operation?
121  *      @err: error code returned
122  *
123  *      Get a datagram skbuff, understands the peeking, nonblocking wakeups
124  *      and possible races. This replaces identical code in packet, raw and
125  *      udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
126  *      the long standing peek and read race for datagram sockets. If you
127  *      alter this routine remember it must be re-entrant.
128  *
129  *      This function will lock the socket if a skb is returned, so the caller
130  *      needs to unlock the socket in that case (usually by calling
131  *      skb_free_datagram)
132  *
133  *      * It does not lock socket since today. This function is
134  *      * free of race conditions. This measure should/can improve
135  *      * significantly datagram socket latencies at high loads,
136  *      * when data copying to user space takes lots of time.
137  *      * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
138  *      *  8) Great win.)
139  *      *                                           --ANK (980729)
140  *
141  *      The order of the tests when we find no data waiting are specified
142  *      quite explicitly by POSIX 1003.1g, don't change them without having
143  *      the standard around please.
144  */
145 struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
146                                   int noblock, int *err)
147 {
148         struct sk_buff *skb;
149         long timeo;
150         /*
151          * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
152          */
153         int error = sock_error(sk);
154
155         if (error)
156                 goto no_packet;
157
158         timeo = sock_rcvtimeo(sk, noblock);
159
160         do {
161                 /* Again only user level code calls this function, so nothing
162                  * interrupt level will suddenly eat the receive_queue.
163                  *
164                  * Look at current nfs client by the way...
165                  * However, this function was corrent in any case. 8)
166                  */
167                 if (flags & MSG_PEEK) {
168                         unsigned long cpu_flags;
169
170                         spin_lock_irqsave(&sk->sk_receive_queue.lock,
171                                           cpu_flags);
172                         skb = skb_peek(&sk->sk_receive_queue);
173                         if (skb)
174                                 atomic_inc(&skb->users);
175                         spin_unlock_irqrestore(&sk->sk_receive_queue.lock,
176                                                cpu_flags);
177                 } else
178                         skb = skb_dequeue(&sk->sk_receive_queue);
179
180                 if (skb)
181                         return skb;
182
183                 /* User doesn't want to wait */
184                 error = -EAGAIN;
185                 if (!timeo)
186                         goto no_packet;
187
188         } while (!wait_for_packet(sk, err, &timeo));
189
190         return NULL;
191
192 no_packet:
193         *err = error;
194         return NULL;
195 }
196
/**
 *	skb_free_datagram - Release a datagram skbuff
 *	@sk: socket (not used by this implementation)
 *	@skb: buffer to release
 *
 *	Hands @skb to kfree_skb().
 */
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
}
201
202 /**
203  *      skb_copy_datagram_iovec - Copy a datagram to an iovec.
204  *      @skb: buffer to copy
205  *      @offset: offset in the buffer to start copying from
206  *      @to: io vector to copy to
207  *      @len: amount of data to copy from buffer to iovec
208  *
209  *      Note: the iovec is modified during the copy.
210  */
211 int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
212                             struct iovec *to, int len)
213 {
214         int i, err, fraglen, end = 0;
215         struct sk_buff *next = skb_shinfo(skb)->frag_list;
216
217         if (!len)
218                 return 0;
219
220 next_skb:
221         fraglen = skb_headlen(skb);
222         i = -1;
223
224         while (1) {
225                 int start = end;
226
227                 if ((end += fraglen) > offset) {
228                         int copy = end - offset, o = offset - start;
229
230                         if (copy > len)
231                                 copy = len;
232                         if (i == -1)
233                                 err = memcpy_toiovec(to, skb->data + o, copy);
234                         else {
235                                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
236                                 struct page *page = frag->page;
237                                 void *p = kmap(page) + frag->page_offset + o;
238                                 err = memcpy_toiovec(to, p, copy);
239                                 kunmap(page);
240                         }
241                         if (err)
242                                 goto fault;
243                         if (!(len -= copy))
244                                 return 0;
245                         offset += copy;
246                 }
247                 if (++i >= skb_shinfo(skb)->nr_frags)
248                         break;
249                 fraglen = skb_shinfo(skb)->frags[i].size;
250         }
251         if (next) {
252                 skb = next;
253                 BUG_ON(skb_shinfo(skb)->frag_list);
254                 next = skb->next;
255                 goto next_skb;
256         }
257 fault:
258         return -EFAULT;
259 }
260
261 static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
262                                       u8 __user *to, int len,
263                                       unsigned int *csump)
264 {
265         int start = skb_headlen(skb);
266         int pos = 0;
267         int i, copy = start - offset;
268
269         /* Copy header. */
270         if (copy > 0) {
271                 int err = 0;
272                 if (copy > len)
273                         copy = len;
274                 *csump = csum_and_copy_to_user(skb->data + offset, to, copy,
275                                                *csump, &err);
276                 if (err)
277                         goto fault;
278                 if ((len -= copy) == 0)
279                         return 0;
280                 offset += copy;
281                 to += copy;
282                 pos = copy;
283         }
284
285         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
286                 int end;
287
288                 BUG_TRAP(start <= offset + len);
289
290                 end = start + skb_shinfo(skb)->frags[i].size;
291                 if ((copy = end - offset) > 0) {
292                         unsigned int csum2;
293                         int err = 0;
294                         u8  *vaddr;
295                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
296                         struct page *page = frag->page;
297
298                         if (copy > len)
299                                 copy = len;
300                         vaddr = kmap(page);
301                         csum2 = csum_and_copy_to_user(vaddr +
302                                                         frag->page_offset +
303                                                         offset - start,
304                                                       to, copy, 0, &err);
305                         kunmap(page);
306                         if (err)
307                                 goto fault;
308                         *csump = csum_block_add(*csump, csum2, pos);
309                         if (!(len -= copy))
310                                 return 0;
311                         offset += copy;
312                         to += copy;
313                         pos += copy;
314                 }
315                 start = end;
316         }
317
318         if (skb_shinfo(skb)->frag_list) {
319                 struct sk_buff *list = skb_shinfo(skb)->frag_list;
320
321                 for (; list; list=list->next) {
322                         int end;
323
324                         BUG_TRAP(start <= offset + len);
325
326                         end = start + list->len;
327                         if ((copy = end - offset) > 0) {
328                                 unsigned int csum2 = 0;
329                                 if (copy > len)
330                                         copy = len;
331                                 if (skb_copy_and_csum_datagram(list,
332                                                                offset - start,
333                                                                to, copy,
334                                                                &csum2))
335                                         goto fault;
336                                 *csump = csum_block_add(*csump, csum2, pos);
337                                 if ((len -= copy) == 0)
338                                         return 0;
339                                 offset += copy;
340                                 to += copy;
341                                 pos += copy;
342                         }
343                         start = end;
344                 }
345         }
346         if (!len)
347                 return 0;
348
349 fault:
350         return -EFAULT;
351 }
352
353 unsigned int __skb_checksum_complete(struct sk_buff *skb)
354 {
355         unsigned int sum;
356
357         sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
358         if (likely(!sum)) {
359                 if (unlikely(skb->ip_summed == CHECKSUM_HW))
360                         netdev_rx_csum_fault(skb->dev);
361                 skb->ip_summed = CHECKSUM_UNNECESSARY;
362         }
363         return sum;
364 }
365 EXPORT_SYMBOL(__skb_checksum_complete);
366
367 /**
368  *      skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
369  *      @skb: skbuff
370  *      @hlen: hardware length
371  *      @iov: io vector
372  * 
373  *      Caller _must_ check that skb will fit to this iovec.
374  *
375  *      Returns: 0       - success.
376  *               -EINVAL - checksum failure.
377  *               -EFAULT - fault during copy. Beware, in this case iovec
378  *                         can be modified!
379  */
380 int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
381                                      int hlen, struct iovec *iov)
382 {
383         unsigned int csum;
384         int chunk = skb->len - hlen;
385
386         /* Skip filled elements.
387          * Pretty silly, look at memcpy_toiovec, though 8)
388          */
389         while (!iov->iov_len)
390                 iov++;
391
392         if (iov->iov_len < chunk) {
393                 if (__skb_checksum_complete(skb))
394                         goto csum_error;
395                 if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
396                         goto fault;
397         } else {
398                 csum = csum_partial(skb->data, hlen, skb->csum);
399                 if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
400                                                chunk, &csum))
401                         goto fault;
402                 if ((unsigned short)csum_fold(csum))
403                         goto csum_error;
404                 if (unlikely(skb->ip_summed == CHECKSUM_HW))
405                         netdev_rx_csum_fault(skb->dev);
406                 iov->iov_len -= chunk;
407                 iov->iov_base += chunk;
408         }
409         return 0;
410 csum_error:
411         return -EINVAL;
412 fault:
413         return -EFAULT;
414 }
415
416 /**
417  *      datagram_poll - generic datagram poll
418  *      @file: file struct
419  *      @sock: socket
420  *      @wait: poll table
421  *
422  *      Datagram poll: Again totally generic. This also handles
423  *      sequenced packet sockets providing the socket receive queue
424  *      is only ever holding data ready to receive.
425  *
426  *      Note: when you _don't_ use this routine for this protocol,
427  *      and you use a different write policy from sock_writeable()
428  *      then please supply your own write_space callback.
429  */
430 unsigned int datagram_poll(struct file *file, struct socket *sock,
431                            poll_table *wait)
432 {
433         struct sock *sk = sock->sk;
434         unsigned int mask;
435
436         poll_wait(file, sk->sk_sleep, wait);
437         mask = 0;
438
439         /* exceptional events? */
440         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
441                 mask |= POLLERR;
442         if (sk->sk_shutdown == SHUTDOWN_MASK)
443                 mask |= POLLHUP;
444
445         /* readable? */
446         if (!skb_queue_empty(&sk->sk_receive_queue) ||
447             (sk->sk_shutdown & RCV_SHUTDOWN))
448                 mask |= POLLIN | POLLRDNORM;
449
450         /* Connection-based need to check for termination and startup */
451         if (connection_based(sk)) {
452                 if (sk->sk_state == TCP_CLOSE)
453                         mask |= POLLHUP;
454                 /* connection hasn't started yet? */
455                 if (sk->sk_state == TCP_SYN_SENT)
456                         return mask;
457         }
458
459         /* writable? */
460         if (sock_writeable(sk))
461                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
462         else
463                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
464
465         return mask;
466 }
467
468 EXPORT_SYMBOL(datagram_poll);
469 EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
470 EXPORT_SYMBOL(skb_copy_datagram_iovec);
471 EXPORT_SYMBOL(skb_free_datagram);
472 EXPORT_SYMBOL(skb_recv_datagram);