packet: Add GSO/csum offload support.
[linux-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 #include <linux/if_vlan.h>
83 #include <linux/virtio_net.h>
84
85 #ifdef CONFIG_INET
86 #include <net/inet_common.h>
87 #endif
88
89 /*
90    Assumptions:
91    - If a device has no dev->hard_header routine, it adds and removes the ll
92      header itself. In this case the ll header is invisible outside of the
93      device, but higher levels still should reserve dev->hard_header_len.
94      Some devices are clever enough to reallocate the skb when the header
95      will not fit into the reserved space (tunnel); other ones are silly
96      (PPP).
97    - A packet socket receives packets with the ll header pulled,
98      so SOCK_RAW has to push it back.
99
100 On receive:
101 -----------
102
103 Incoming, dev->hard_header!=NULL
104    mac_header -> ll header
105    data       -> data
106
107 Outgoing, dev->hard_header!=NULL
108    mac_header -> ll header
109    data       -> ll header
110
111 Incoming, dev->hard_header==NULL
112    mac_header -> UNKNOWN position. It very likely points to the ll
113                  header. PPP does this, which is wrong, because it introduces
114                  asymmetry between the rx and tx paths.
115    data       -> data
116
117 Outgoing, dev->hard_header==NULL
118    mac_header -> data. ll header is still not built!
119    data       -> data
120
121 Summary
122   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
123
124
125 On transmit:
126 ------------
127
128 dev->hard_header != NULL
129    mac_header -> ll header
130    data       -> ll header
131
132 dev->hard_header == NULL (ll header is added by device, we cannot control it)
133    mac_header -> data
134    data       -> data
135
136    We should set nh.raw on output to the correct position;
137    the packet classifier depends on it.
138  */
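/*
 * Illustrative userspace sketch (editor's example, not part of this file;
 * assumes the standard AF_PACKET API). It shows the ll-header visibility
 * difference described above: SOCK_RAW delivers frames starting at the
 * link-layer header, SOCK_DGRAM delivers them with the ll header pulled:
 *
 *	int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *	int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *
 *	n = recv(raw, buf, sizeof(buf), 0);
 *	// on ethernet, buf now starts with the destination MAC
 *	n = recv(dgram, buf, sizeof(buf), 0);
 *	// buf now starts at the network header (e.g. the IP header)
 */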
139
140 /* Private packet socket structures. */
141
142 struct packet_mclist {
143         struct packet_mclist    *next;
144         int                     ifindex;
145         int                     count;
146         unsigned short          type;
147         unsigned short          alen;
148         unsigned char           addr[MAX_ADDR_LEN];
149 };
150 /* identical to struct packet_mreq except it has
151  * a longer address field.
152  */
153 struct packet_mreq_max {
154         int             mr_ifindex;
155         unsigned short  mr_type;
156         unsigned short  mr_alen;
157         unsigned char   mr_address[MAX_ADDR_LEN];
158 };
159
160 #ifdef CONFIG_PACKET_MMAP
161 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162                 int closing, int tx_ring);
163
164 struct packet_ring_buffer {
165         char                    **pg_vec;
166         unsigned int            head;
167         unsigned int            frames_per_block;
168         unsigned int            frame_size;
169         unsigned int            frame_max;
170
171         unsigned int            pg_vec_order;
172         unsigned int            pg_vec_pages;
173         unsigned int            pg_vec_len;
174
175         atomic_t                pending;
176 };
177
178 struct packet_sock;
179 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
180 #endif
181
182 static void packet_flush_mclist(struct sock *sk);
183
184 struct packet_sock {
185         /* struct sock has to be the first member of packet_sock */
186         struct sock             sk;
187         struct tpacket_stats    stats;
188 #ifdef CONFIG_PACKET_MMAP
189         struct packet_ring_buffer       rx_ring;
190         struct packet_ring_buffer       tx_ring;
191         int                     copy_thresh;
192 #endif
193         spinlock_t              bind_lock;
194         struct mutex            pg_vec_lock;
195         unsigned int            running:1,      /* prot_hook is attached*/
196                                 auxdata:1,
197                                 origdev:1,
198                                 has_vnet_hdr:1;
199         int                     ifindex;        /* bound device         */
200         __be16                  num;
201         struct packet_mclist    *mclist;
202 #ifdef CONFIG_PACKET_MMAP
203         atomic_t                mapped;
204         enum tpacket_versions   tp_version;
205         unsigned int            tp_hdrlen;
206         unsigned int            tp_reserve;
207         unsigned int            tp_loss:1;
208 #endif
209         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
210 };
211
212 struct packet_skb_cb {
213         unsigned int origlen;
214         union {
215                 struct sockaddr_pkt pkt;
216                 struct sockaddr_ll ll;
217         } sa;
218 };
219
220 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
221
222 #ifdef CONFIG_PACKET_MMAP
223
224 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
225 {
226         union {
227                 struct tpacket_hdr *h1;
228                 struct tpacket2_hdr *h2;
229                 void *raw;
230         } h;
231
232         h.raw = frame;
233         switch (po->tp_version) {
234         case TPACKET_V1:
235                 h.h1->tp_status = status;
236                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
237                 break;
238         case TPACKET_V2:
239                 h.h2->tp_status = status;
240                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
241                 break;
242         default:
243                 pr_err("TPACKET version not supported\n");
244                 BUG();
245         }
246
247         smp_wmb();
248 }
249
250 static int __packet_get_status(struct packet_sock *po, void *frame)
251 {
252         union {
253                 struct tpacket_hdr *h1;
254                 struct tpacket2_hdr *h2;
255                 void *raw;
256         } h;
257
258         smp_rmb();
259
260         h.raw = frame;
261         switch (po->tp_version) {
262         case TPACKET_V1:
263                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
264                 return h.h1->tp_status;
265         case TPACKET_V2:
266                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
267                 return h.h2->tp_status;
268         default:
269                 pr_err("TPACKET version not supported\n");
270                 BUG();
271                 return 0;
272         }
273 }
274
275 static void *packet_lookup_frame(struct packet_sock *po,
276                 struct packet_ring_buffer *rb,
277                 unsigned int position,
278                 int status)
279 {
280         unsigned int pg_vec_pos, frame_offset;
281         union {
282                 struct tpacket_hdr *h1;
283                 struct tpacket2_hdr *h2;
284                 void *raw;
285         } h;
286
287         pg_vec_pos = position / rb->frames_per_block;
288         frame_offset = position % rb->frames_per_block;
289
290         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
291
292         if (status != __packet_get_status(po, h.raw))
293                 return NULL;
294
295         return h.raw;
296 }
297
298 static inline void *packet_current_frame(struct packet_sock *po,
299                 struct packet_ring_buffer *rb,
300                 int status)
301 {
302         return packet_lookup_frame(po, rb, rb->head, status);
303 }
304
305 static inline void *packet_previous_frame(struct packet_sock *po,
306                 struct packet_ring_buffer *rb,
307                 int status)
308 {
309         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
310         return packet_lookup_frame(po, rb, previous, status);
311 }
312
313 static inline void packet_increment_head(struct packet_ring_buffer *buff)
314 {
315         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
316 }
317
318 #endif
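/*
 * Illustrative userspace sketch (editor's example; the names req, ring and
 * i are assumptions). A mapped ring is addressed from userspace with the
 * same geometry packet_lookup_frame() uses above: frame i lives in block
 * i / frames_per_block at offset (i % frames_per_block) * tp_frame_size:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_frame_size = 2048,
 *		.tp_block_nr   = 64,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	unsigned int fpb = req.tp_block_size / req.tp_frame_size;
 *	void *frame = ring + (i / fpb) * req.tp_block_size
 *			   + (i % fpb) * req.tp_frame_size;
 */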
319
320 static inline struct packet_sock *pkt_sk(struct sock *sk)
321 {
322         return (struct packet_sock *)sk;
323 }
324
325 static void packet_sock_destruct(struct sock *sk)
326 {
327         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
328         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
329
330         if (!sock_flag(sk, SOCK_DEAD)) {
331                 pr_err("Attempt to release alive packet socket: %p\n", sk);
332                 return;
333         }
334
335         sk_refcnt_debug_dec(sk);
336 }
337
338
339 static const struct proto_ops packet_ops;
340
341 static const struct proto_ops packet_ops_spkt;
342
343 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
344                            struct packet_type *pt, struct net_device *orig_dev)
345 {
346         struct sock *sk;
347         struct sockaddr_pkt *spkt;
348
349         /*
350          *      When we registered the protocol we saved the socket in the data
351          *      field for just this event.
352          */
353
354         sk = pt->af_packet_priv;
355
356         /*
357          *      Yank back the headers [hope the device set this
358          *      right or kerboom...]
359          *
360          *      Incoming packets have the ll header pulled;
361          *      push it back.
362          *
363          *      For outgoing ones skb->data == skb_mac_header(skb),
364          *      so this procedure is a no-op.
365          */
366
367         if (skb->pkt_type == PACKET_LOOPBACK)
368                 goto out;
369
370         if (!net_eq(dev_net(dev), sock_net(sk)))
371                 goto out;
372
373         skb = skb_share_check(skb, GFP_ATOMIC);
374         if (skb == NULL)
375                 goto oom;
376
377         /* drop any routing info */
378         skb_dst_drop(skb);
379
380         /* drop conntrack reference */
381         nf_reset(skb);
382
383         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
384
385         skb_push(skb, skb->data - skb_mac_header(skb));
386
387         /*
388          *      The SOCK_PACKET socket receives _all_ frames.
389          */
390
391         spkt->spkt_family = dev->type;
392         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
393         spkt->spkt_protocol = skb->protocol;
394
395         /*
396          *      Charge the memory to the socket. This is done specifically
397          *      to prevent sockets from using up all the memory.
398          */
399
400         if (sock_queue_rcv_skb(sk, skb) == 0)
401                 return 0;
402
403 out:
404         kfree_skb(skb);
405 oom:
406         return 0;
407 }
408
409
410 /*
411  *      Output a raw packet to a device layer. This bypasses all the other
412  *      protocol layers and you must therefore supply it with a complete frame
413  */
414
415 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
416                                struct msghdr *msg, size_t len)
417 {
418         struct sock *sk = sock->sk;
419         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
420         struct sk_buff *skb = NULL;
421         struct net_device *dev;
422         __be16 proto = 0;
423         int err;
424
425         /*
426          *      Get and verify the address.
427          */
428
429         if (saddr) {
430                 if (msg->msg_namelen < sizeof(struct sockaddr))
431                         return -EINVAL;
432                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
433                         proto = saddr->spkt_protocol;
434         } else
435                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
436
437         /*
438          *      Find the device first to size check it
439          */
440
441         saddr->spkt_device[13] = 0;
442 retry:
443         rcu_read_lock();
444         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
445         err = -ENODEV;
446         if (dev == NULL)
447                 goto out_unlock;
448
449         err = -ENETDOWN;
450         if (!(dev->flags & IFF_UP))
451                 goto out_unlock;
452
453         /*
454          * You may not queue a frame bigger than the mtu. This is the lowest level
455          * raw protocol and you must do your own fragmentation at this level.
456          */
457
458         err = -EMSGSIZE;
459         if (len > dev->mtu + dev->hard_header_len)
460                 goto out_unlock;
461
462         if (!skb) {
463                 size_t reserved = LL_RESERVED_SPACE(dev);
464                 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
465
466                 rcu_read_unlock();
467                 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
468                 if (skb == NULL)
469                         return -ENOBUFS;
470                 /* FIXME: Save some space for broken drivers that write a hard
471                  * header at transmission time by themselves. PPP is the notable
472                  * one here. This should really be fixed at the driver level.
473                  */
474                 skb_reserve(skb, reserved);
475                 skb_reset_network_header(skb);
476
477                 /* Try to align data part correctly */
478                 if (hhlen) {
479                         skb->data -= hhlen;
480                         skb->tail -= hhlen;
481                         if (len < hhlen)
482                                 skb_reset_network_header(skb);
483                 }
484                 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
485                 if (err)
486                         goto out_free;
487                 goto retry;
488         }
489
490
491         skb->protocol = proto;
492         skb->dev = dev;
493         skb->priority = sk->sk_priority;
494         skb->mark = sk->sk_mark;
495
496         dev_queue_xmit(skb);
497         rcu_read_unlock();
498         return len;
499
500 out_unlock:
501         rcu_read_unlock();
502 out_free:
503         kfree_skb(skb);
504         return err;
505 }
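/*
 * Illustrative userspace sketch (editor's example; "eth0", frame and
 * frame_len are placeholders). As the comment above says, the caller must
 * supply a complete frame; the device is named via sockaddr_pkt:
 *
 *	int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *
 *	strncpy(spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */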
506
507 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
508                                       unsigned int res)
509 {
510         struct sk_filter *filter;
511
512         rcu_read_lock_bh();
513         filter = rcu_dereference(sk->sk_filter);
514         if (filter != NULL)
515                 res = sk_run_filter(skb, filter->insns, filter->len);
516         rcu_read_unlock_bh();
517
518         return res;
519 }
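/*
 * Illustrative userspace sketch (editor's example; the filter program is
 * arbitrary). The filter consulted by run_filter() above is installed with
 * SO_ATTACH_FILTER; a return of 0 drops the packet, a smaller non-zero
 * return truncates the snapshot to that many bytes:
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),	// accept, snap to 64K
 *	};
 *	struct sock_fprog prog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */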
520
521 /*
522    This function does lazy skb cloning in the hope that most packets
523    are discarded by BPF.
524
525    Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
526    and skb->cb are mangled. It works because (and until) packets
527    arriving here are owned by the current CPU. Output packets are cloned
528    by dev_queue_xmit_nit() and input packets are processed by net_bh
529    sequentially, so if we return the skb to its original state on exit,
530    we will not harm anyone.
531  */
532
533 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
534                       struct packet_type *pt, struct net_device *orig_dev)
535 {
536         struct sock *sk;
537         struct sockaddr_ll *sll;
538         struct packet_sock *po;
539         u8 *skb_head = skb->data;
540         int skb_len = skb->len;
541         unsigned int snaplen, res;
542
543         if (skb->pkt_type == PACKET_LOOPBACK)
544                 goto drop;
545
546         sk = pt->af_packet_priv;
547         po = pkt_sk(sk);
548
549         if (!net_eq(dev_net(dev), sock_net(sk)))
550                 goto drop;
551
552         skb->dev = dev;
553
554         if (dev->header_ops) {
555                 /* The device has an explicit notion of ll header,
556                    exported to higher levels.
557
558                    Otherwise, the device hides the details of its frame
559                    structure, so that the corresponding packet head is
560                    never delivered to the user.
561                  */
562                 if (sk->sk_type != SOCK_DGRAM)
563                         skb_push(skb, skb->data - skb_mac_header(skb));
564                 else if (skb->pkt_type == PACKET_OUTGOING) {
565                         /* Special case: outgoing packets have ll header at head */
566                         skb_pull(skb, skb_network_offset(skb));
567                 }
568         }
569
570         snaplen = skb->len;
571
572         res = run_filter(skb, sk, snaplen);
573         if (!res)
574                 goto drop_n_restore;
575         if (snaplen > res)
576                 snaplen = res;
577
578         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
579             (unsigned)sk->sk_rcvbuf)
580                 goto drop_n_acct;
581
582         if (skb_shared(skb)) {
583                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
584                 if (nskb == NULL)
585                         goto drop_n_acct;
586
587                 if (skb_head != skb->data) {
588                         skb->data = skb_head;
589                         skb->len = skb_len;
590                 }
591                 kfree_skb(skb);
592                 skb = nskb;
593         }
594
595         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
596                      sizeof(skb->cb));
597
598         sll = &PACKET_SKB_CB(skb)->sa.ll;
599         sll->sll_family = AF_PACKET;
600         sll->sll_hatype = dev->type;
601         sll->sll_protocol = skb->protocol;
602         sll->sll_pkttype = skb->pkt_type;
603         if (unlikely(po->origdev))
604                 sll->sll_ifindex = orig_dev->ifindex;
605         else
606                 sll->sll_ifindex = dev->ifindex;
607
608         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
609
610         PACKET_SKB_CB(skb)->origlen = skb->len;
611
612         if (pskb_trim(skb, snaplen))
613                 goto drop_n_acct;
614
615         skb_set_owner_r(skb, sk);
616         skb->dev = NULL;
617         skb_dst_drop(skb);
618
619         /* drop conntrack reference */
620         nf_reset(skb);
621
622         spin_lock(&sk->sk_receive_queue.lock);
623         po->stats.tp_packets++;
624         skb->dropcount = atomic_read(&sk->sk_drops);
625         __skb_queue_tail(&sk->sk_receive_queue, skb);
626         spin_unlock(&sk->sk_receive_queue.lock);
627         sk->sk_data_ready(sk, skb->len);
628         return 0;
629
630 drop_n_acct:
631         po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
632
633 drop_n_restore:
634         if (skb_head != skb->data && skb_shared(skb)) {
635                 skb->data = skb_head;
636                 skb->len = skb_len;
637         }
638 drop:
639         consume_skb(skb);
640         return 0;
641 }
642
643 #ifdef CONFIG_PACKET_MMAP
644 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
645                        struct packet_type *pt, struct net_device *orig_dev)
646 {
647         struct sock *sk;
648         struct packet_sock *po;
649         struct sockaddr_ll *sll;
650         union {
651                 struct tpacket_hdr *h1;
652                 struct tpacket2_hdr *h2;
653                 void *raw;
654         } h;
655         u8 *skb_head = skb->data;
656         int skb_len = skb->len;
657         unsigned int snaplen, res;
658         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
659         unsigned short macoff, netoff, hdrlen;
660         struct sk_buff *copy_skb = NULL;
661         struct timeval tv;
662         struct timespec ts;
663
664         if (skb->pkt_type == PACKET_LOOPBACK)
665                 goto drop;
666
667         sk = pt->af_packet_priv;
668         po = pkt_sk(sk);
669
670         if (!net_eq(dev_net(dev), sock_net(sk)))
671                 goto drop;
672
673         if (dev->header_ops) {
674                 if (sk->sk_type != SOCK_DGRAM)
675                         skb_push(skb, skb->data - skb_mac_header(skb));
676                 else if (skb->pkt_type == PACKET_OUTGOING) {
677                         /* Special case: outgoing packets have ll header at head */
678                         skb_pull(skb, skb_network_offset(skb));
679                 }
680         }
681
682         if (skb->ip_summed == CHECKSUM_PARTIAL)
683                 status |= TP_STATUS_CSUMNOTREADY;
684
685         snaplen = skb->len;
686
687         res = run_filter(skb, sk, snaplen);
688         if (!res)
689                 goto drop_n_restore;
690         if (snaplen > res)
691                 snaplen = res;
692
693         if (sk->sk_type == SOCK_DGRAM) {
694                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
695                                   po->tp_reserve;
696         } else {
697                 unsigned maclen = skb_network_offset(skb);
698                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
699                                        (maclen < 16 ? 16 : maclen)) +
700                         po->tp_reserve;
701                 macoff = netoff - maclen;
702         }
703
704         if (macoff + snaplen > po->rx_ring.frame_size) {
705                 if (po->copy_thresh &&
706                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
707                     (unsigned)sk->sk_rcvbuf) {
708                         if (skb_shared(skb)) {
709                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
710                         } else {
711                                 copy_skb = skb_get(skb);
712                                 skb_head = skb->data;
713                         }
714                         if (copy_skb)
715                                 skb_set_owner_r(copy_skb, sk);
716                 }
717                 snaplen = po->rx_ring.frame_size - macoff;
718                 if ((int)snaplen < 0)
719                         snaplen = 0;
720         }
721
722         spin_lock(&sk->sk_receive_queue.lock);
723         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
724         if (!h.raw)
725                 goto ring_is_full;
726         packet_increment_head(&po->rx_ring);
727         po->stats.tp_packets++;
728         if (copy_skb) {
729                 status |= TP_STATUS_COPY;
730                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
731         }
732         if (!po->stats.tp_drops)
733                 status &= ~TP_STATUS_LOSING;
734         spin_unlock(&sk->sk_receive_queue.lock);
735
736         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
737
738         switch (po->tp_version) {
739         case TPACKET_V1:
740                 h.h1->tp_len = skb->len;
741                 h.h1->tp_snaplen = snaplen;
742                 h.h1->tp_mac = macoff;
743                 h.h1->tp_net = netoff;
744                 if (skb->tstamp.tv64)
745                         tv = ktime_to_timeval(skb->tstamp);
746                 else
747                         do_gettimeofday(&tv);
748                 h.h1->tp_sec = tv.tv_sec;
749                 h.h1->tp_usec = tv.tv_usec;
750                 hdrlen = sizeof(*h.h1);
751                 break;
752         case TPACKET_V2:
753                 h.h2->tp_len = skb->len;
754                 h.h2->tp_snaplen = snaplen;
755                 h.h2->tp_mac = macoff;
756                 h.h2->tp_net = netoff;
757                 if (skb->tstamp.tv64)
758                         ts = ktime_to_timespec(skb->tstamp);
759                 else
760                         getnstimeofday(&ts);
761                 h.h2->tp_sec = ts.tv_sec;
762                 h.h2->tp_nsec = ts.tv_nsec;
763                 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
764                 hdrlen = sizeof(*h.h2);
765                 break;
766         default:
767                 BUG();
768         }
769
770         sll = h.raw + TPACKET_ALIGN(hdrlen);
771         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
772         sll->sll_family = AF_PACKET;
773         sll->sll_hatype = dev->type;
774         sll->sll_protocol = skb->protocol;
775         sll->sll_pkttype = skb->pkt_type;
776         if (unlikely(po->origdev))
777                 sll->sll_ifindex = orig_dev->ifindex;
778         else
779                 sll->sll_ifindex = dev->ifindex;
780
781         __packet_set_status(po, h.raw, status);
782         smp_mb();
783         {
784                 struct page *p_start, *p_end;
785                 u8 *h_end = h.raw + macoff + snaplen - 1;
786
787                 p_start = virt_to_page(h.raw);
788                 p_end = virt_to_page(h_end);
789                 while (p_start <= p_end) {
790                         flush_dcache_page(p_start);
791                         p_start++;
792                 }
793         }
794
795         sk->sk_data_ready(sk, 0);
796
797 drop_n_restore:
798         if (skb_head != skb->data && skb_shared(skb)) {
799                 skb->data = skb_head;
800                 skb->len = skb_len;
801         }
802 drop:
803         kfree_skb(skb);
804         return 0;
805
806 ring_is_full:
807         po->stats.tp_drops++;
808         spin_unlock(&sk->sk_receive_queue.lock);
809
810         sk->sk_data_ready(sk, 0);
811         kfree_skb(copy_skb);
812         goto drop_n_restore;
813 }
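/*
 * Illustrative userspace sketch (editor's example; frame and handle() are
 * placeholders). Consuming the RX ring filled by tpacket_rcv() above is a
 * status handshake on each TPACKET_V1 frame header:
 *
 *	struct tpacket_hdr *hdr = frame;
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);
 *	handle((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the slot back
 */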
814
815 static void tpacket_destruct_skb(struct sk_buff *skb)
816 {
817         struct packet_sock *po = pkt_sk(skb->sk);
818         void *ph;
819
820         BUG_ON(skb == NULL);
821
822         if (likely(po->tx_ring.pg_vec)) {
823                 ph = skb_shinfo(skb)->destructor_arg;
824                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
825                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
826                 atomic_dec(&po->tx_ring.pending);
827                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
828         }
829
830         sock_wfree(skb);
831 }
832
833 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
834                 void *frame, struct net_device *dev, int size_max,
835                 __be16 proto, unsigned char *addr)
836 {
837         union {
838                 struct tpacket_hdr *h1;
839                 struct tpacket2_hdr *h2;
840                 void *raw;
841         } ph;
842         int to_write, offset, len, tp_len, nr_frags, len_max;
843         struct socket *sock = po->sk.sk_socket;
844         struct page *page;
845         void *data;
846         int err;
847
848         ph.raw = frame;
849
850         skb->protocol = proto;
851         skb->dev = dev;
852         skb->priority = po->sk.sk_priority;
853         skb->mark = po->sk.sk_mark;
854         skb_shinfo(skb)->destructor_arg = ph.raw;
855
856         switch (po->tp_version) {
857         case TPACKET_V2:
858                 tp_len = ph.h2->tp_len;
859                 break;
860         default:
861                 tp_len = ph.h1->tp_len;
862                 break;
863         }
864         if (unlikely(tp_len > size_max)) {
865                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
866                 return -EMSGSIZE;
867         }
868
869         skb_reserve(skb, LL_RESERVED_SPACE(dev));
870         skb_reset_network_header(skb);
871
872         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
873         to_write = tp_len;
874
875         if (sock->type == SOCK_DGRAM) {
876                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
877                                 NULL, tp_len);
878                 if (unlikely(err < 0))
879                         return -EINVAL;
880         } else if (dev->hard_header_len) {
881                 /* net device doesn't like empty head */
882                 if (unlikely(tp_len <= dev->hard_header_len)) {
883                         pr_err("packet size is too short (%d <= %d)\n",
884                                tp_len, dev->hard_header_len);
885                         return -EINVAL;
886                 }
887
888                 skb_push(skb, dev->hard_header_len);
889                 err = skb_store_bits(skb, 0, data,
890                                 dev->hard_header_len);
891                 if (unlikely(err))
892                         return err;
893
894                 data += dev->hard_header_len;
895                 to_write -= dev->hard_header_len;
896         }
897
898         err = -EFAULT;
899         page = virt_to_page(data);
900         offset = offset_in_page(data);
901         len_max = PAGE_SIZE - offset;
902         len = ((to_write > len_max) ? len_max : to_write);
903
904         skb->data_len = to_write;
905         skb->len += to_write;
906         skb->truesize += to_write;
907         atomic_add(to_write, &po->sk.sk_wmem_alloc);
908
909         while (likely(to_write)) {
910                 nr_frags = skb_shinfo(skb)->nr_frags;
911
912                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
913                         pr_err("Packet exceeds the number of skb frags (%lu)\n",
914                                MAX_SKB_FRAGS);
915                         return -EFAULT;
916                 }
917
918                 flush_dcache_page(page);
919                 get_page(page);
920                 skb_fill_page_desc(skb,
921                                 nr_frags,
922                                 page++, offset, len);
923                 to_write -= len;
924                 offset = 0;
925                 len_max = PAGE_SIZE;
926                 len = ((to_write > len_max) ? len_max : to_write);
927         }
928
929         return tp_len;
930 }
931
932 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
933 {
934         struct socket *sock;
935         struct sk_buff *skb;
936         struct net_device *dev;
937         __be16 proto;
938         int ifindex, err, reserve = 0;
939         void *ph;
940         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
941         int tp_len, size_max;
942         unsigned char *addr;
943         int len_sum = 0;
944         int status = 0;
945
946         sock = po->sk.sk_socket;
947
948         mutex_lock(&po->pg_vec_lock);
949
950         err = -EBUSY;
951         if (saddr == NULL) {
952                 ifindex = po->ifindex;
953                 proto   = po->num;
954                 addr    = NULL;
955         } else {
956                 err = -EINVAL;
957                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
958                         goto out;
959                 if (msg->msg_namelen < (saddr->sll_halen
960                                         + offsetof(struct sockaddr_ll,
961                                                 sll_addr)))
962                         goto out;
963                 ifindex = saddr->sll_ifindex;
964                 proto   = saddr->sll_protocol;
965                 addr    = saddr->sll_addr;
966         }
967
968         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
969         err = -ENXIO;
970         if (unlikely(dev == NULL))
971                 goto out;
972
973         reserve = dev->hard_header_len;
974
975         err = -ENETDOWN;
976         if (unlikely(!(dev->flags & IFF_UP)))
977                 goto out_put;
978
979         size_max = po->tx_ring.frame_size
980                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
981
982         if (size_max > dev->mtu + reserve)
983                 size_max = dev->mtu + reserve;
984
985         do {
986                 ph = packet_current_frame(po, &po->tx_ring,
987                                 TP_STATUS_SEND_REQUEST);
988
989                 if (unlikely(ph == NULL)) {
990                         schedule();
991                         continue;
992                 }
993
994                 status = TP_STATUS_SEND_REQUEST;
995                 skb = sock_alloc_send_skb(&po->sk,
996                                 LL_ALLOCATED_SPACE(dev)
997                                 + sizeof(struct sockaddr_ll),
998                                 0, &err);
999
1000                 if (unlikely(skb == NULL))
1001                         goto out_status;
1002
1003                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1004                                 addr);
1005
1006                 if (unlikely(tp_len < 0)) {
1007                         if (po->tp_loss) {
1008                                 __packet_set_status(po, ph,
1009                                                 TP_STATUS_AVAILABLE);
1010                                 packet_increment_head(&po->tx_ring);
1011                                 kfree_skb(skb);
1012                                 continue;
1013                         } else {
1014                                 status = TP_STATUS_WRONG_FORMAT;
1015                                 err = tp_len;
1016                                 goto out_status;
1017                         }
1018                 }
1019
1020                 skb->destructor = tpacket_destruct_skb;
1021                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1022                 atomic_inc(&po->tx_ring.pending);
1023
1024                 status = TP_STATUS_SEND_REQUEST;
1025                 err = dev_queue_xmit(skb);
1026                 if (unlikely(err > 0)) {
1027                         err = net_xmit_errno(err);
1028                         if (err && __packet_get_status(po, ph) ==
1029                                    TP_STATUS_AVAILABLE) {
1030                                 /* skb was destructed already */
1031                                 skb = NULL;
1032                                 goto out_status;
1033                         }
1034                         /*
1035                          * skb was dropped but not destructed yet;
1036                          * let's treat it like congestion or err < 0
1037                          */
1038                         err = 0;
1039                 }
1040                 packet_increment_head(&po->tx_ring);
1041                 len_sum += tp_len;
1042         } while (likely((ph != NULL) ||
1043                         ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1044                          (atomic_read(&po->tx_ring.pending))))
1045                 );
1046
1047         err = len_sum;
1048         goto out_put;
1049
1050 out_status:
1051         __packet_set_status(po, ph, status);
1052         kfree_skb(skb);
1053 out_put:
1054         dev_put(dev);
1055 out:
1056         mutex_unlock(&po->pg_vec_lock);
1057         return err;
1058 }
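/*
 * Illustrative userspace sketch (editor's example; frame, pkt and pkt_len
 * are placeholders). Feeding the TX ring drained by tpacket_snd() above,
 * for TPACKET_V1 (packet data starts right after the aligned frame header):
 *
 *	struct tpacket_hdr *hdr = frame;	// a TP_STATUS_AVAILABLE slot
 *
 *	memcpy((char *)hdr + TPACKET_HDRLEN - sizeof(struct sockaddr_ll),
 *	       pkt, pkt_len);
 *	hdr->tp_len = pkt_len;
 *	hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *	send(fd, NULL, 0, 0);			// kick the kernel
 */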
1059 #endif
1060
1061 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1062                                                size_t reserve, size_t len,
1063                                                size_t linear, int noblock,
1064                                                int *err)
1065 {
1066         struct sk_buff *skb;
1067
1068         /* Under a page?  Don't bother with paged skb. */
1069         if (prepad + len < PAGE_SIZE || !linear)
1070                 linear = len;
1071
1072         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1073                                    err);
1074         if (!skb)
1075                 return NULL;
1076
1077         skb_reserve(skb, reserve);
1078         skb_put(skb, linear);
1079         skb->data_len = len - linear;
1080         skb->len += len - linear;
1081
1082         return skb;
1083 }
1084
1085 static int packet_snd(struct socket *sock,
1086                           struct msghdr *msg, size_t len)
1087 {
1088         struct sock *sk = sock->sk;
1089         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1090         struct sk_buff *skb;
1091         struct net_device *dev;
1092         __be16 proto;
1093         unsigned char *addr;
1094         int ifindex, err, reserve = 0;
1095         struct virtio_net_hdr vnet_hdr = { 0 };
1096         int offset = 0;
1097         int vnet_hdr_len;
1098         struct packet_sock *po = pkt_sk(sk);
1099         unsigned short gso_type = 0;
1100
1101         /*
1102          *      Get and verify the address.
1103          */
1104
1105         if (saddr == NULL) {
1106                 ifindex = po->ifindex;
1107                 proto   = po->num;
1108                 addr    = NULL;
1109         } else {
1110                 err = -EINVAL;
1111                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1112                         goto out;
1113                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1114                         goto out;
1115                 ifindex = saddr->sll_ifindex;
1116                 proto   = saddr->sll_protocol;
1117                 addr    = saddr->sll_addr;
1118         }
1119
1120
1121         dev = dev_get_by_index(sock_net(sk), ifindex);
1122         err = -ENXIO;
1123         if (dev == NULL)
1124                 goto out_unlock;
1125         if (sock->type == SOCK_RAW)
1126                 reserve = dev->hard_header_len;
1127
1128         err = -ENETDOWN;
1129         if (!(dev->flags & IFF_UP))
1130                 goto out_unlock;
1131
1132         if (po->has_vnet_hdr) {
1133                 vnet_hdr_len = sizeof(vnet_hdr);
1134
1135                 err = -EINVAL;
1136                 if (len < vnet_hdr_len)
1137                         goto out_unlock;
1138
1139                 len -= vnet_hdr_len;
1140
1141                 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1142                                        vnet_hdr_len);
1143                 if (err < 0)
1144                         goto out_unlock;
1145
1146                 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1147                     (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1148                       vnet_hdr.hdr_len))
1149                         vnet_hdr.hdr_len = vnet_hdr.csum_start +
1150                                                  vnet_hdr.csum_offset + 2;
1151
1152                 err = -EINVAL;
1153                 if (vnet_hdr.hdr_len > len)
1154                         goto out_unlock;
1155
1156                 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1157                         switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1158                         case VIRTIO_NET_HDR_GSO_TCPV4:
1159                                 gso_type = SKB_GSO_TCPV4;
1160                                 break;
1161                         case VIRTIO_NET_HDR_GSO_TCPV6:
1162                                 gso_type = SKB_GSO_TCPV6;
1163                                 break;
1164                         case VIRTIO_NET_HDR_GSO_UDP:
1165                                 gso_type = SKB_GSO_UDP;
1166                                 break;
1167                         default:
1168                                 goto out_unlock;
1169                         }
1170
1171                         if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1172                                 gso_type |= SKB_GSO_TCP_ECN;
1173
1174                         if (vnet_hdr.gso_size == 0)
1175                                 goto out_unlock;
1176
1177                 }
1178         }
1179
1180         err = -EMSGSIZE;
1181         if (!gso_type && (len > dev->mtu+reserve))
1182                 goto out_unlock;
1183
1184         err = -ENOBUFS;
1185         skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1186                                LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1187                                msg->msg_flags & MSG_DONTWAIT, &err);
1188         if (skb == NULL)
1189                 goto out_unlock;
1190
1191         skb_set_network_header(skb, reserve);
1192
1193         err = -EINVAL;
1194         if (sock->type == SOCK_DGRAM &&
1195             (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1196                 goto out_free;
1197
1198         /* Returns -EFAULT on error */
1199         err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1200         if (err)
1201                 goto out_free;
1202
1203         skb->protocol = proto;
1204         skb->dev = dev;
1205         skb->priority = sk->sk_priority;
1206         skb->mark = sk->sk_mark;
1207
1208         if (po->has_vnet_hdr) {
1209                 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1210                         if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1211                                                   vnet_hdr.csum_offset)) {
1212                                 err = -EINVAL;
1213                                 goto out_free;
1214                         }
1215                 }
1216
1217                 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1218                 skb_shinfo(skb)->gso_type = gso_type;
1219
1220                 /* Header must be checked, and gso_segs computed. */
1221                 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1222                 skb_shinfo(skb)->gso_segs = 0;
1223
1224                 len += vnet_hdr_len;
1225         }
1226
1227         /*
1228          *      Now send it
1229          */
1230
1231         err = dev_queue_xmit(skb);
1232         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1233                 goto out_unlock;
1234
1235         dev_put(dev);
1236
1237         return len;
1238
1239 out_free:
1240         kfree_skb(skb);
1241 out_unlock:
1242         if (dev)
1243                 dev_put(dev);
1244 out:
1245         return err;
1246 }
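/*
 * Illustrative userspace sketch (editor's example; ip_hlen, tcp_hlen,
 * frame and frame_len are placeholders). With PACKET_VNET_HDR enabled,
 * the GSO/csum path above expects each packet to be prefixed by a
 * struct virtio_net_hdr describing its offload state:
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &on, sizeof(on));
 *
 *	struct virtio_net_hdr vh = {
 *		.flags	     = VIRTIO_NET_HDR_F_NEEDS_CSUM,
 *		.gso_type    = VIRTIO_NET_HDR_GSO_TCPV4,
 *		.gso_size    = 1448,
 *		.hdr_len     = ETH_HLEN + ip_hlen + tcp_hlen,
 *		.csum_start  = ETH_HLEN + ip_hlen,
 *		.csum_offset = offsetof(struct tcphdr, check),
 *	};
 *	struct iovec iov[2] = {
 *		{ &vh, sizeof(vh) },
 *		{ frame, frame_len },
 *	};
 *	writev(fd, iov, 2);
 */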
1247
1248 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1249                 struct msghdr *msg, size_t len)
1250 {
1251 #ifdef CONFIG_PACKET_MMAP
1252         struct sock *sk = sock->sk;
1253         struct packet_sock *po = pkt_sk(sk);
1254         if (po->tx_ring.pg_vec)
1255                 return tpacket_snd(po, msg);
1256         else
1257 #endif
1258                 return packet_snd(sock, msg, len);
1259 }
1260
1261 /*
1262  *      Close a PACKET socket. This is fairly simple. We immediately go
1263  *      to 'closed' state and remove our protocol entry in the device list.
1264  */
1265
1266 static int packet_release(struct socket *sock)
1267 {
1268         struct sock *sk = sock->sk;
1269         struct packet_sock *po;
1270         struct net *net;
1271 #ifdef CONFIG_PACKET_MMAP
1272         struct tpacket_req req;
1273 #endif
1274
1275         if (!sk)
1276                 return 0;
1277
1278         net = sock_net(sk);
1279         po = pkt_sk(sk);
1280
1281         write_lock_bh(&net->packet.sklist_lock);
1282         sk_del_node_init(sk);
1283         sock_prot_inuse_add(net, sk->sk_prot, -1);
1284         write_unlock_bh(&net->packet.sklist_lock);
1285
1286         /*
1287          *      Unhook packet receive handler.
1288          */
1289
1290         if (po->running) {
1291                 /*
1292                  *      Remove the protocol hook
1293                  */
1294                 dev_remove_pack(&po->prot_hook);
1295                 po->running = 0;
1296                 po->num = 0;
1297                 __sock_put(sk);
1298         }
1299
1300         packet_flush_mclist(sk);
1301
1302 #ifdef CONFIG_PACKET_MMAP
1303         memset(&req, 0, sizeof(req));
1304
1305         if (po->rx_ring.pg_vec)
1306                 packet_set_ring(sk, &req, 1, 0);
1307
1308         if (po->tx_ring.pg_vec)
1309                 packet_set_ring(sk, &req, 1, 1);
1310 #endif
1311
1312         /*
1313          *      Now the socket is dead. No more input will appear.
1314          */
1315
1316         sock_orphan(sk);
1317         sock->sk = NULL;
1318
1319         /* Purge queues */
1320
1321         skb_queue_purge(&sk->sk_receive_queue);
1322         sk_refcnt_debug_release(sk);
1323
1324         sock_put(sk);
1325         return 0;
1326 }
1327
1328 /*
1329  *      Attach a packet hook.
1330  */
1331
1332 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1333 {
1334         struct packet_sock *po = pkt_sk(sk);
1335         /*
1336          *      Detach an existing hook if present.
1337          */
1338
1339         lock_sock(sk);
1340
1341         spin_lock(&po->bind_lock);
1342         if (po->running) {
1343                 __sock_put(sk);
1344                 po->running = 0;
1345                 po->num = 0;
1346                 spin_unlock(&po->bind_lock);
1347                 dev_remove_pack(&po->prot_hook);
1348                 spin_lock(&po->bind_lock);
1349         }
1350
1351         po->num = protocol;
1352         po->prot_hook.type = protocol;
1353         po->prot_hook.dev = dev;
1354
1355         po->ifindex = dev ? dev->ifindex : 0;
1356
1357         if (protocol == 0)
1358                 goto out_unlock;
1359
1360         if (!dev || (dev->flags & IFF_UP)) {
1361                 dev_add_pack(&po->prot_hook);
1362                 sock_hold(sk);
1363                 po->running = 1;
1364         } else {
1365                 sk->sk_err = ENETDOWN;
1366                 if (!sock_flag(sk, SOCK_DEAD))
1367                         sk->sk_error_report(sk);
1368         }
1369
1370 out_unlock:
1371         spin_unlock(&po->bind_lock);
1372         release_sock(sk);
1373         return 0;
1374 }
1375
1376 /*
1377  *      Bind a packet socket to a device
1378  */
1379
1380 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1381                             int addr_len)
1382 {
1383         struct sock *sk = sock->sk;
1384         char name[15];
1385         struct net_device *dev;
1386         int err = -ENODEV;
1387
1388         /*
1389          *      Check legality
1390          */
1391
1392         if (addr_len != sizeof(struct sockaddr))
1393                 return -EINVAL;
1394         strlcpy(name, uaddr->sa_data, sizeof(name));
1395
1396         dev = dev_get_by_name(sock_net(sk), name);
1397         if (dev) {
1398                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1399                 dev_put(dev);
1400         }
1401         return err;
1402 }
1403
1404 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1405 {
1406         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1407         struct sock *sk = sock->sk;
1408         struct net_device *dev = NULL;
1409         int err;
1410
1411
1412         /*
1413          *      Check legality
1414          */
1415
1416         if (addr_len < sizeof(struct sockaddr_ll))
1417                 return -EINVAL;
1418         if (sll->sll_family != AF_PACKET)
1419                 return -EINVAL;
1420
1421         if (sll->sll_ifindex) {
1422                 err = -ENODEV;
1423                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1424                 if (dev == NULL)
1425                         goto out;
1426         }
1427         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1428         if (dev)
1429                 dev_put(dev);
1430
1431 out:
1432         return err;
1433 }
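/*
 * Illustrative userspace sketch (editor's example; "eth0" is a
 * placeholder). Binding to one device and protocol via sockaddr_ll:
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */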
1434
1435 static struct proto packet_proto = {
1436         .name     = "PACKET",
1437         .owner    = THIS_MODULE,
1438         .obj_size = sizeof(struct packet_sock),
1439 };
1440
1441 /*
1442  *      Create a packet socket.
1443  */
1444
1445 static int packet_create(struct net *net, struct socket *sock, int protocol,
1446                          int kern)
1447 {
1448         struct sock *sk;
1449         struct packet_sock *po;
1450         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1451         int err;
1452
1453         if (!capable(CAP_NET_RAW))
1454                 return -EPERM;
1455         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1456             sock->type != SOCK_PACKET)
1457                 return -ESOCKTNOSUPPORT;
1458
1459         sock->state = SS_UNCONNECTED;
1460
1461         err = -ENOBUFS;
1462         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1463         if (sk == NULL)
1464                 goto out;
1465
1466         sock->ops = &packet_ops;
1467         if (sock->type == SOCK_PACKET)
1468                 sock->ops = &packet_ops_spkt;
1469
1470         sock_init_data(sock, sk);
1471
1472         po = pkt_sk(sk);
1473         sk->sk_family = PF_PACKET;
1474         po->num = proto;
1475
1476         sk->sk_destruct = packet_sock_destruct;
1477         sk_refcnt_debug_inc(sk);
1478
1479         /*
1480          *      Attach a protocol block
1481          */
1482
1483         spin_lock_init(&po->bind_lock);
1484         mutex_init(&po->pg_vec_lock);
1485         po->prot_hook.func = packet_rcv;
1486
1487         if (sock->type == SOCK_PACKET)
1488                 po->prot_hook.func = packet_rcv_spkt;
1489
1490         po->prot_hook.af_packet_priv = sk;
1491
1492         if (proto) {
1493                 po->prot_hook.type = proto;
1494                 dev_add_pack(&po->prot_hook);
1495                 sock_hold(sk);
1496                 po->running = 1;
1497         }
1498
1499         write_lock_bh(&net->packet.sklist_lock);
1500         sk_add_node(sk, &net->packet.sklist);
1501         sock_prot_inuse_add(net, &packet_proto, 1);
1502         write_unlock_bh(&net->packet.sklist_lock);
1503         return 0;
1504 out:
1505         return err;
1506 }
1507
1508 /*
1509  *      Pull a packet from our receive queue and hand it to the user.
1510  *      If necessary we block.
1511  */
1512
1513 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1514                           struct msghdr *msg, size_t len, int flags)
1515 {
1516         struct sock *sk = sock->sk;
1517         struct sk_buff *skb;
1518         int copied, err;
1519         struct sockaddr_ll *sll;
1520         int vnet_hdr_len = 0;
1521
1522         err = -EINVAL;
1523         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1524                 goto out;
1525
1526 #if 0
1527         /* What error should we return now? EUNATTACH? */
1528         if (pkt_sk(sk)->ifindex < 0)
1529                 return -ENODEV;
1530 #endif
1531
1532         /*
1533          *      Call the generic datagram receiver. This handles all sorts
1534          *      of horrible races and re-entrancy so we can forget about it
1535          *      in the protocol layers.
1536          *
1537          *      Now it will return ENETDOWN if the device has just gone down,
1538          *      but then it will block.
1539          */
1540
1541         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1542
1543         /*
1544          *      An error occurred, so return it. Because skb_recv_datagram()
1545          *      handles the blocking, we don't need to see or worry about
1546          *      blocking retries.
1547          */
1548
1549         if (skb == NULL)
1550                 goto out;
1551
1552         if (pkt_sk(sk)->has_vnet_hdr) {
1553                 struct virtio_net_hdr vnet_hdr = { 0 };
1554
1555                 err = -EINVAL;
1556                 vnet_hdr_len = sizeof(vnet_hdr);
1557                 if ((len -= vnet_hdr_len) < 0)
1558                         goto out_free;
1559
1560                 if (skb_is_gso(skb)) {
1561                         struct skb_shared_info *sinfo = skb_shinfo(skb);
1562
1563                         /* This is a hint as to how much should be linear. */
1564                         vnet_hdr.hdr_len = skb_headlen(skb);
1565                         vnet_hdr.gso_size = sinfo->gso_size;
1566                         if (sinfo->gso_type & SKB_GSO_TCPV4)
1567                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1568                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
1569                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1570                         else if (sinfo->gso_type & SKB_GSO_UDP)
1571                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1572                         else if (sinfo->gso_type & SKB_GSO_FCOE)
1573                                 goto out_free;
1574                         else
1575                                 BUG();
1576                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1577                                 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1578                 } else
1579                         vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1580
1581                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1582                         vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1583                         vnet_hdr.csum_start = skb->csum_start -
1584                                                         skb_headroom(skb);
1585                         vnet_hdr.csum_offset = skb->csum_offset;
1586                 } /* else everything is zero */
1587
1588                 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1589                                      vnet_hdr_len);
1590                 if (err < 0)
1591                         goto out_free;
1592         }
1593
        /*
         *      Set the address length now; the address itself is copied
         *      out below, and only if the caller supplied msg_name.
         */
1598
1599         sll = &PACKET_SKB_CB(skb)->sa.ll;
1600         if (sock->type == SOCK_PACKET)
1601                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1602         else
1603                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1604
        /*
         *      You lose any data beyond the buffer you gave. If this worries
         *      a user program, it can ask the device for its MTU anyway.
         */
1609
1610         copied = skb->len;
1611         if (copied > len) {
1612                 copied = len;
1613                 msg->msg_flags |= MSG_TRUNC;
1614         }
1615
1616         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1617         if (err)
1618                 goto out_free;
1619
1620         sock_recv_ts_and_drops(msg, sk, skb);
1621
1622         if (msg->msg_name)
1623                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1624                        msg->msg_namelen);
1625
1626         if (pkt_sk(sk)->auxdata) {
1627                 struct tpacket_auxdata aux;
1628
1629                 aux.tp_status = TP_STATUS_USER;
1630                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1631                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1632                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1633                 aux.tp_snaplen = skb->len;
1634                 aux.tp_mac = 0;
1635                 aux.tp_net = skb_network_offset(skb);
1636                 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1637
1638                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1639         }
1640
1641         /*
1642          *      Free or return the buffer as appropriate. Again this
1643          *      hides all the races and re-entrancy issues from us.
1644          */
        err = vnet_hdr_len + ((flags & MSG_TRUNC) ? skb->len : copied);
1646
1647 out_free:
1648         skb_free_datagram(sk, skb);
1649 out:
1650         return err;
1651 }
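
/*
 * Illustrative userspace sketch, not part of this file: with
 * PACKET_VNET_HDR enabled, every packet read with recvmsg() is
 * prefixed by a struct virtio_net_hdr carrying the GSO/csum state
 * filled in above. fd is a hypothetical SOCK_RAW packet socket.
 *
 *      #include <linux/virtio_net.h>
 *
 *      char buf[sizeof(struct virtio_net_hdr) + 65536];
 *      struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *      struct msghdr m = { .msg_iov = &iov, .msg_iovlen = 1 };
 *      ssize_t n = recvmsg(fd, &m, 0);
 *      struct virtio_net_hdr *vh = (struct virtio_net_hdr *)buf;
 *      int csum_pending = n >= (ssize_t)sizeof(*vh) &&
 *                         (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM);
 *
 * When csum_pending is set, the checksum still has to be computed
 * from vh->csum_start and vh->csum_offset, exactly as exported above.
 */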
1652
1653 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1654                                int *uaddr_len, int peer)
1655 {
1656         struct net_device *dev;
1657         struct sock *sk = sock->sk;
1658
1659         if (peer)
1660                 return -EOPNOTSUPP;
1661
1662         uaddr->sa_family = AF_PACKET;
1663         rcu_read_lock();
1664         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
        /* sa_data is only 14 bytes long; never write past it */
        if (dev)
                strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
        else
                memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
1669         rcu_read_unlock();
1670         *uaddr_len = sizeof(*uaddr);
1671
1672         return 0;
1673 }
1674
1675 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1676                           int *uaddr_len, int peer)
1677 {
1678         struct net_device *dev;
1679         struct sock *sk = sock->sk;
1680         struct packet_sock *po = pkt_sk(sk);
1681         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1682
1683         if (peer)
1684                 return -EOPNOTSUPP;
1685
1686         sll->sll_family = AF_PACKET;
1687         sll->sll_ifindex = po->ifindex;
1688         sll->sll_protocol = po->num;
1689         rcu_read_lock();
1690         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1691         if (dev) {
1692                 sll->sll_hatype = dev->type;
1693                 sll->sll_halen = dev->addr_len;
1694                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1695         } else {
1696                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1697                 sll->sll_halen = 0;
1698         }
1699         rcu_read_unlock();
1700         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1701
1702         return 0;
1703 }
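
/*
 * Illustrative userspace sketch (hypothetical): getsockname() on a
 * bound packet socket returns the struct sockaddr_ll assembled above,
 * including the device hardware address.
 *
 *      struct sockaddr_ll sll;
 *      socklen_t alen = sizeof(sll);
 *      if (getsockname(fd, (struct sockaddr *)&sll, &alen) == 0)
 *              printf("ifindex %d hatype %hu halen %u\n",
 *                     sll.sll_ifindex, sll.sll_hatype, sll.sll_halen);
 */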
1704
/* what > 0 joins/enables, what < 0 leaves/disables */
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
                         int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (what > 0)
                        return dev_mc_add(dev, i->addr, i->alen, 0);
                else
                        return dev_mc_delete(dev, i->addr, i->alen, 0);
        case PACKET_MR_PROMISC:
                return dev_set_promiscuity(dev, what);
        case PACKET_MR_ALLMULTI:
                return dev_set_allmulti(dev, what);
        case PACKET_MR_UNICAST:
                if (what > 0)
                        return dev_unicast_add(dev, i->addr);
                else
                        return dev_unicast_delete(dev, i->addr);
        default:
                break;
        }
1730         return 0;
1731 }
1732
1733 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1734 {
1735         for ( ; i; i = i->next) {
1736                 if (i->ifindex == dev->ifindex)
1737                         packet_dev_mc(dev, i, what);
1738         }
1739 }
1740
1741 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1742 {
1743         struct packet_sock *po = pkt_sk(sk);
1744         struct packet_mclist *ml, *i;
1745         struct net_device *dev;
1746         int err;
1747
1748         rtnl_lock();
1749
1750         err = -ENODEV;
1751         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1752         if (!dev)
1753                 goto done;
1754
1755         err = -EINVAL;
1756         if (mreq->mr_alen > dev->addr_len)
1757                 goto done;
1758
1759         err = -ENOBUFS;
1760         i = kmalloc(sizeof(*i), GFP_KERNEL);
1761         if (i == NULL)
1762                 goto done;
1763
1764         err = 0;
1765         for (ml = po->mclist; ml; ml = ml->next) {
1766                 if (ml->ifindex == mreq->mr_ifindex &&
1767                     ml->type == mreq->mr_type &&
1768                     ml->alen == mreq->mr_alen &&
1769                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1770                         ml->count++;
1771                         /* Free the new element ... */
1772                         kfree(i);
1773                         goto done;
1774                 }
1775         }
1776
1777         i->type = mreq->mr_type;
1778         i->ifindex = mreq->mr_ifindex;
1779         i->alen = mreq->mr_alen;
1780         memcpy(i->addr, mreq->mr_address, i->alen);
1781         i->count = 1;
1782         i->next = po->mclist;
1783         po->mclist = i;
1784         err = packet_dev_mc(dev, i, 1);
1785         if (err) {
1786                 po->mclist = i->next;
1787                 kfree(i);
1788         }
1789
1790 done:
1791         rtnl_unlock();
1792         return err;
1793 }
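
/*
 * Illustrative userspace sketch (hypothetical): memberships are taken
 * through the refcounted list above instead of toggling interface
 * flags directly, so they are dropped automatically when the socket
 * closes.
 *
 *      struct packet_mreq mreq = {
 *              .mr_ifindex = ifindex,
 *              .mr_type = PACKET_MR_PROMISC,
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *                 &mreq, sizeof(mreq));
 *
 * ifindex here is a hypothetical interface index, e.g. from
 * if_nametoindex().
 */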
1794
1795 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1796 {
1797         struct packet_mclist *ml, **mlp;
1798
1799         rtnl_lock();
1800
1801         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1802                 if (ml->ifindex == mreq->mr_ifindex &&
1803                     ml->type == mreq->mr_type &&
1804                     ml->alen == mreq->mr_alen &&
1805                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1806                         if (--ml->count == 0) {
1807                                 struct net_device *dev;
1808                                 *mlp = ml->next;
1809                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1810                                 if (dev)
1811                                         packet_dev_mc(dev, ml, -1);
1812                                 kfree(ml);
1813                         }
1814                         rtnl_unlock();
1815                         return 0;
1816                 }
1817         }
1818         rtnl_unlock();
1819         return -EADDRNOTAVAIL;
1820 }
1821
1822 static void packet_flush_mclist(struct sock *sk)
1823 {
1824         struct packet_sock *po = pkt_sk(sk);
1825         struct packet_mclist *ml;
1826
1827         if (!po->mclist)
1828                 return;
1829
1830         rtnl_lock();
1831         while ((ml = po->mclist) != NULL) {
1832                 struct net_device *dev;
1833
1834                 po->mclist = ml->next;
1835                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1836                 if (dev != NULL)
1837                         packet_dev_mc(dev, ml, -1);
1838                 kfree(ml);
1839         }
1840         rtnl_unlock();
1841 }
1842
1843 static int
1844 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1845 {
1846         struct sock *sk = sock->sk;
1847         struct packet_sock *po = pkt_sk(sk);
1848         int ret;
1849
1850         if (level != SOL_PACKET)
1851                 return -ENOPROTOOPT;
1852
1853         switch (optname) {
1854         case PACKET_ADD_MEMBERSHIP:
1855         case PACKET_DROP_MEMBERSHIP:
1856         {
1857                 struct packet_mreq_max mreq;
1858                 int len = optlen;
1859                 memset(&mreq, 0, sizeof(mreq));
1860                 if (len < sizeof(struct packet_mreq))
1861                         return -EINVAL;
1862                 if (len > sizeof(mreq))
1863                         len = sizeof(mreq);
1864                 if (copy_from_user(&mreq, optval, len))
1865                         return -EFAULT;
1866                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1867                         return -EINVAL;
1868                 if (optname == PACKET_ADD_MEMBERSHIP)
1869                         ret = packet_mc_add(sk, &mreq);
1870                 else
1871                         ret = packet_mc_drop(sk, &mreq);
1872                 return ret;
1873         }
1874
1875 #ifdef CONFIG_PACKET_MMAP
1876         case PACKET_RX_RING:
1877         case PACKET_TX_RING:
1878         {
1879                 struct tpacket_req req;
1880
1881                 if (optlen < sizeof(req))
1882                         return -EINVAL;
1883                 if (pkt_sk(sk)->has_vnet_hdr)
1884                         return -EINVAL;
1885                 if (copy_from_user(&req, optval, sizeof(req)))
1886                         return -EFAULT;
1887                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1888         }
1889         case PACKET_COPY_THRESH:
1890         {
1891                 int val;
1892
1893                 if (optlen != sizeof(val))
1894                         return -EINVAL;
1895                 if (copy_from_user(&val, optval, sizeof(val)))
1896                         return -EFAULT;
1897
1898                 pkt_sk(sk)->copy_thresh = val;
1899                 return 0;
1900         }
1901         case PACKET_VERSION:
1902         {
1903                 int val;
1904
1905                 if (optlen != sizeof(val))
1906                         return -EINVAL;
1907                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1908                         return -EBUSY;
1909                 if (copy_from_user(&val, optval, sizeof(val)))
1910                         return -EFAULT;
1911                 switch (val) {
1912                 case TPACKET_V1:
1913                 case TPACKET_V2:
1914                         po->tp_version = val;
1915                         return 0;
1916                 default:
1917                         return -EINVAL;
1918                 }
1919         }
1920         case PACKET_RESERVE:
1921         {
1922                 unsigned int val;
1923
1924                 if (optlen != sizeof(val))
1925                         return -EINVAL;
1926                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1927                         return -EBUSY;
1928                 if (copy_from_user(&val, optval, sizeof(val)))
1929                         return -EFAULT;
1930                 po->tp_reserve = val;
1931                 return 0;
1932         }
1933         case PACKET_LOSS:
1934         {
1935                 unsigned int val;
1936
1937                 if (optlen != sizeof(val))
1938                         return -EINVAL;
1939                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1940                         return -EBUSY;
1941                 if (copy_from_user(&val, optval, sizeof(val)))
1942                         return -EFAULT;
1943                 po->tp_loss = !!val;
1944                 return 0;
1945         }
1946 #endif
1947         case PACKET_AUXDATA:
1948         {
1949                 int val;
1950
1951                 if (optlen < sizeof(val))
1952                         return -EINVAL;
1953                 if (copy_from_user(&val, optval, sizeof(val)))
1954                         return -EFAULT;
1955
1956                 po->auxdata = !!val;
1957                 return 0;
1958         }
1959         case PACKET_ORIGDEV:
1960         {
1961                 int val;
1962
1963                 if (optlen < sizeof(val))
1964                         return -EINVAL;
1965                 if (copy_from_user(&val, optval, sizeof(val)))
1966                         return -EFAULT;
1967
1968                 po->origdev = !!val;
1969                 return 0;
1970         }
1971         case PACKET_VNET_HDR:
1972         {
1973                 int val;
1974
1975                 if (sock->type != SOCK_RAW)
1976                         return -EINVAL;
1977                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1978                         return -EBUSY;
1979                 if (optlen < sizeof(val))
1980                         return -EINVAL;
1981                 if (copy_from_user(&val, optval, sizeof(val)))
1982                         return -EFAULT;
1983
1984                 po->has_vnet_hdr = !!val;
1985                 return 0;
1986         }
1987         default:
1988                 return -ENOPROTOOPT;
1989         }
1990 }
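
/*
 * Illustrative userspace sketch (hypothetical): the boolean options
 * above take a plain int. Note the ordering constraints enforced
 * above: PACKET_VNET_HDR is only accepted on SOCK_RAW sockets and
 * cannot be flipped once a ring is in place.
 *
 *      int one = 1;
 *      setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *      setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one));
 */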
1991
1992 static int packet_getsockopt(struct socket *sock, int level, int optname,
1993                              char __user *optval, int __user *optlen)
1994 {
1995         int len;
1996         int val;
1997         struct sock *sk = sock->sk;
1998         struct packet_sock *po = pkt_sk(sk);
1999         void *data;
2000         struct tpacket_stats st;
2001
2002         if (level != SOL_PACKET)
2003                 return -ENOPROTOOPT;
2004
2005         if (get_user(len, optlen))
2006                 return -EFAULT;
2007
2008         if (len < 0)
2009                 return -EINVAL;
2010
2011         switch (optname) {
2012         case PACKET_STATISTICS:
2013                 if (len > sizeof(struct tpacket_stats))
2014                         len = sizeof(struct tpacket_stats);
2015                 spin_lock_bh(&sk->sk_receive_queue.lock);
2016                 st = po->stats;
2017                 memset(&po->stats, 0, sizeof(st));
2018                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2019                 st.tp_packets += st.tp_drops;
2020
2021                 data = &st;
2022                 break;
2023         case PACKET_AUXDATA:
2024                 if (len > sizeof(int))
2025                         len = sizeof(int);
2026                 val = po->auxdata;
2027
2028                 data = &val;
2029                 break;
2030         case PACKET_ORIGDEV:
2031                 if (len > sizeof(int))
2032                         len = sizeof(int);
2033                 val = po->origdev;
2034
2035                 data = &val;
2036                 break;
2037         case PACKET_VNET_HDR:
2038                 if (len > sizeof(int))
2039                         len = sizeof(int);
2040                 val = po->has_vnet_hdr;
2041
2042                 data = &val;
2043                 break;
2044 #ifdef CONFIG_PACKET_MMAP
2045         case PACKET_VERSION:
2046                 if (len > sizeof(int))
2047                         len = sizeof(int);
2048                 val = po->tp_version;
2049                 data = &val;
2050                 break;
        case PACKET_HDRLEN:
                if (len > sizeof(int))
                        len = sizeof(int);
                /* A short copy would leave val partly uninitialized
                 * before the switch below.
                 */
                if (len < sizeof(int))
                        return -EINVAL;
                if (copy_from_user(&val, optval, len))
                        return -EFAULT;
2056                 switch (val) {
2057                 case TPACKET_V1:
2058                         val = sizeof(struct tpacket_hdr);
2059                         break;
2060                 case TPACKET_V2:
2061                         val = sizeof(struct tpacket2_hdr);
2062                         break;
2063                 default:
2064                         return -EINVAL;
2065                 }
2066                 data = &val;
2067                 break;
2068         case PACKET_RESERVE:
2069                 if (len > sizeof(unsigned int))
2070                         len = sizeof(unsigned int);
2071                 val = po->tp_reserve;
2072                 data = &val;
2073                 break;
2074         case PACKET_LOSS:
2075                 if (len > sizeof(unsigned int))
2076                         len = sizeof(unsigned int);
2077                 val = po->tp_loss;
2078                 data = &val;
2079                 break;
2080 #endif
2081         default:
2082                 return -ENOPROTOOPT;
2083         }
2084
2085         if (put_user(len, optlen))
2086                 return -EFAULT;
2087         if (copy_to_user(optval, data, len))
2088                 return -EFAULT;
2089         return 0;
2090 }
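
/*
 * Illustrative userspace sketch (hypothetical): PACKET_STATISTICS
 * returns and resets the counters snapshotted above; note that
 * tp_packets already has tp_drops added in.
 *
 *      struct tpacket_stats st;
 *      socklen_t len = sizeof(st);
 *      if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *              printf("%u packets (incl. %u drops)\n",
 *                     st.tp_packets, st.tp_drops);
 */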
2091
2092
2093 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2094 {
2095         struct sock *sk;
2096         struct hlist_node *node;
2097         struct net_device *dev = data;
2098         struct net *net = dev_net(dev);
2099
2100         read_lock(&net->packet.sklist_lock);
2101         sk_for_each(sk, node, &net->packet.sklist) {
2102                 struct packet_sock *po = pkt_sk(sk);
2103
2104                 switch (msg) {
2105                 case NETDEV_UNREGISTER:
2106                         if (po->mclist)
2107                                 packet_dev_mclist(dev, po->mclist, -1);
2108                         /* fallthrough */
2109
2110                 case NETDEV_DOWN:
2111                         if (dev->ifindex == po->ifindex) {
2112                                 spin_lock(&po->bind_lock);
2113                                 if (po->running) {
2114                                         __dev_remove_pack(&po->prot_hook);
2115                                         __sock_put(sk);
2116                                         po->running = 0;
2117                                         sk->sk_err = ENETDOWN;
2118                                         if (!sock_flag(sk, SOCK_DEAD))
2119                                                 sk->sk_error_report(sk);
2120                                 }
2121                                 if (msg == NETDEV_UNREGISTER) {
2122                                         po->ifindex = -1;
2123                                         po->prot_hook.dev = NULL;
2124                                 }
2125                                 spin_unlock(&po->bind_lock);
2126                         }
2127                         break;
2128                 case NETDEV_UP:
2129                         spin_lock(&po->bind_lock);
2130                         if (dev->ifindex == po->ifindex && po->num &&
2131                             !po->running) {
2132                                 dev_add_pack(&po->prot_hook);
2133                                 sock_hold(sk);
2134                                 po->running = 1;
2135                         }
2136                         spin_unlock(&po->bind_lock);
2137                         break;
2138                 }
2139         }
2140         read_unlock(&net->packet.sklist_lock);
2141         return NOTIFY_DONE;
2142 }
2143
2144
2145 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2146                         unsigned long arg)
2147 {
2148         struct sock *sk = sock->sk;
2149
2150         switch (cmd) {
2151         case SIOCOUTQ:
2152         {
2153                 int amount = sk_wmem_alloc_get(sk);
2154
2155                 return put_user(amount, (int __user *)arg);
2156         }
2157         case SIOCINQ:
2158         {
2159                 struct sk_buff *skb;
2160                 int amount = 0;
2161
2162                 spin_lock_bh(&sk->sk_receive_queue.lock);
2163                 skb = skb_peek(&sk->sk_receive_queue);
2164                 if (skb)
2165                         amount = skb->len;
2166                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2167                 return put_user(amount, (int __user *)arg);
2168         }
2169         case SIOCGSTAMP:
2170                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2171         case SIOCGSTAMPNS:
2172                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2173
2174 #ifdef CONFIG_INET
2175         case SIOCADDRT:
2176         case SIOCDELRT:
2177         case SIOCDARP:
2178         case SIOCGARP:
2179         case SIOCSARP:
2180         case SIOCGIFADDR:
2181         case SIOCSIFADDR:
2182         case SIOCGIFBRDADDR:
2183         case SIOCSIFBRDADDR:
2184         case SIOCGIFNETMASK:
2185         case SIOCSIFNETMASK:
2186         case SIOCGIFDSTADDR:
2187         case SIOCSIFDSTADDR:
2188         case SIOCSIFFLAGS:
2189                 if (!net_eq(sock_net(sk), &init_net))
2190                         return -ENOIOCTLCMD;
2191                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2192 #endif
2193
2194         default:
2195                 return -ENOIOCTLCMD;
2196         }
2197         return 0;
2198 }
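
/*
 * Illustrative userspace sketch (hypothetical): SIOCINQ here reports
 * the length of the next queued packet, taken from skb_peek() above,
 * not the total receive-queue backlog.
 *
 *      int next_len = 0;
 *      ioctl(fd, SIOCINQ, &next_len);
 */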
2199
2200 #ifndef CONFIG_PACKET_MMAP
2201 #define packet_mmap sock_no_mmap
2202 #define packet_poll datagram_poll
2203 #else
2204
2205 static unsigned int packet_poll(struct file *file, struct socket *sock,
2206                                 poll_table *wait)
2207 {
2208         struct sock *sk = sock->sk;
2209         struct packet_sock *po = pkt_sk(sk);
2210         unsigned int mask = datagram_poll(file, sock, wait);
2211
2212         spin_lock_bh(&sk->sk_receive_queue.lock);
2213         if (po->rx_ring.pg_vec) {
2214                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2215                         mask |= POLLIN | POLLRDNORM;
2216         }
2217         spin_unlock_bh(&sk->sk_receive_queue.lock);
2218         spin_lock_bh(&sk->sk_write_queue.lock);
2219         if (po->tx_ring.pg_vec) {
2220                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2221                         mask |= POLLOUT | POLLWRNORM;
2222         }
2223         spin_unlock_bh(&sk->sk_write_queue.lock);
2224         return mask;
2225 }
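
/*
 * Illustrative userspace sketch (hypothetical): with a mapped RX ring,
 * poll() is only the wakeup mechanism; the data is read from the ring
 * itself rather than with recv(). frame points at the current ring
 * slot (hypothetical).
 *
 *      struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *      struct tpacket_hdr *hdr = frame;
 *      while (!(hdr->tp_status & TP_STATUS_USER))
 *              poll(&pfd, 1, -1);
 *      ... consume the frame ...
 *      hdr->tp_status = TP_STATUS_KERNEL;
 *
 * Writing TP_STATUS_KERNEL returns the slot to the kernel.
 */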
2226
2227
/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */
2231
2232 static void packet_mm_open(struct vm_area_struct *vma)
2233 {
2234         struct file *file = vma->vm_file;
2235         struct socket *sock = file->private_data;
2236         struct sock *sk = sock->sk;
2237
2238         if (sk)
2239                 atomic_inc(&pkt_sk(sk)->mapped);
2240 }
2241
2242 static void packet_mm_close(struct vm_area_struct *vma)
2243 {
2244         struct file *file = vma->vm_file;
2245         struct socket *sock = file->private_data;
2246         struct sock *sk = sock->sk;
2247
2248         if (sk)
2249                 atomic_dec(&pkt_sk(sk)->mapped);
2250 }
2251
2252 static const struct vm_operations_struct packet_mmap_ops = {
2253         .open   =       packet_mm_open,
2254         .close  =       packet_mm_close,
2255 };
2256
2257 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2258 {
2259         int i;
2260
2261         for (i = 0; i < len; i++) {
2262                 if (likely(pg_vec[i]))
2263                         free_pages((unsigned long) pg_vec[i], order);
2264         }
2265         kfree(pg_vec);
2266 }
2267
2268 static inline char *alloc_one_pg_vec_page(unsigned long order)
2269 {
2270         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2271
2272         return (char *) __get_free_pages(gfp_flags, order);
2273 }
2274
2275 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2276 {
2277         unsigned int block_nr = req->tp_block_nr;
2278         char **pg_vec;
2279         int i;
2280
        /* kcalloc checks the multiplication for overflow; block_nr
         * comes straight from user space.
         */
        pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
2282         if (unlikely(!pg_vec))
2283                 goto out;
2284
2285         for (i = 0; i < block_nr; i++) {
2286                 pg_vec[i] = alloc_one_pg_vec_page(order);
2287                 if (unlikely(!pg_vec[i]))
2288                         goto out_free_pgvec;
2289         }
2290
2291 out:
2292         return pg_vec;
2293
2294 out_free_pgvec:
2295         free_pg_vec(pg_vec, order, block_nr);
2296         pg_vec = NULL;
2297         goto out;
2298 }
2299
2300 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2301                 int closing, int tx_ring)
2302 {
2303         char **pg_vec = NULL;
2304         struct packet_sock *po = pkt_sk(sk);
2305         int was_running, order = 0;
2306         struct packet_ring_buffer *rb;
2307         struct sk_buff_head *rb_queue;
2308         __be16 num;
2309         int err;
2310
2311         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2312         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2313
2314         err = -EBUSY;
2315         if (!closing) {
2316                 if (atomic_read(&po->mapped))
2317                         goto out;
2318                 if (atomic_read(&rb->pending))
2319                         goto out;
2320         }
2321
2322         if (req->tp_block_nr) {
2323                 /* Sanity tests and some calculations */
2324                 err = -EBUSY;
2325                 if (unlikely(rb->pg_vec))
2326                         goto out;
2327
2328                 switch (po->tp_version) {
2329                 case TPACKET_V1:
2330                         po->tp_hdrlen = TPACKET_HDRLEN;
2331                         break;
2332                 case TPACKET_V2:
2333                         po->tp_hdrlen = TPACKET2_HDRLEN;
2334                         break;
2335                 }
2336
2337                 err = -EINVAL;
2338                 if (unlikely((int)req->tp_block_size <= 0))
2339                         goto out;
2340                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2341                         goto out;
2342                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2343                                         po->tp_reserve))
2344                         goto out;
2345                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2346                         goto out;
2347
                rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
2349                 if (unlikely(rb->frames_per_block <= 0))
2350                         goto out;
2351                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2352                                         req->tp_frame_nr))
2353                         goto out;
2354
2355                 err = -ENOMEM;
2356                 order = get_order(req->tp_block_size);
2357                 pg_vec = alloc_pg_vec(req, order);
2358                 if (unlikely(!pg_vec))
2359                         goto out;
        } else {
                /* Done */
                err = -EINVAL;
                if (unlikely(req->tp_frame_nr))
                        goto out;
        }
2367
2368         lock_sock(sk);
2369
2370         /* Detach socket from network */
2371         spin_lock(&po->bind_lock);
2372         was_running = po->running;
2373         num = po->num;
2374         if (was_running) {
2375                 __dev_remove_pack(&po->prot_hook);
2376                 po->num = 0;
2377                 po->running = 0;
2378                 __sock_put(sk);
2379         }
2380         spin_unlock(&po->bind_lock);
2381
2382         synchronize_net();
2383
2384         err = -EBUSY;
2385         mutex_lock(&po->pg_vec_lock);
2386         if (closing || atomic_read(&po->mapped) == 0) {
2387                 err = 0;
/* XC(a, b): assign b to a and evaluate to a's previous value. */
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2389                 spin_lock_bh(&rb_queue->lock);
2390                 pg_vec = XC(rb->pg_vec, pg_vec);
2391                 rb->frame_max = (req->tp_frame_nr - 1);
2392                 rb->head = 0;
2393                 rb->frame_size = req->tp_frame_size;
2394                 spin_unlock_bh(&rb_queue->lock);
2395
2396                 order = XC(rb->pg_vec_order, order);
2397                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2398
                rb->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
2400                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2401                                                 tpacket_rcv : packet_rcv;
2402                 skb_queue_purge(rb_queue);
2403 #undef XC
2404                 if (atomic_read(&po->mapped))
2405                         pr_err("packet_mmap: vma is busy: %d\n",
2406                                atomic_read(&po->mapped));
2407         }
2408         mutex_unlock(&po->pg_vec_lock);
2409
2410         spin_lock(&po->bind_lock);
2411         if (was_running && !po->running) {
2412                 sock_hold(sk);
2413                 po->running = 1;
2414                 po->num = num;
2415                 dev_add_pack(&po->prot_hook);
2416         }
2417         spin_unlock(&po->bind_lock);
2418
2419         release_sock(sk);
2420
2421         if (pg_vec)
2422                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2423 out:
2424         return err;
2425 }
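
/*
 * Illustrative userspace sketch (hypothetical, assuming 4 KiB pages):
 * a request that satisfies the sanity checks above; the block size is
 * page-aligned, the frame size is TPACKET_ALIGNMENT-aligned, and
 * tp_frame_nr equals frames-per-block times tp_block_nr.
 *
 *      struct tpacket_req req = {
 *              .tp_block_size = 4096,
 *              .tp_block_nr = 64,
 *              .tp_frame_size = 2048,
 *              .tp_frame_nr = 64 * (4096 / 2048),
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */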
2426
2427 static int packet_mmap(struct file *file, struct socket *sock,
2428                 struct vm_area_struct *vma)
2429 {
2430         struct sock *sk = sock->sk;
2431         struct packet_sock *po = pkt_sk(sk);
2432         unsigned long size, expected_size;
2433         struct packet_ring_buffer *rb;
2434         unsigned long start;
2435         int err = -EINVAL;
2436         int i;
2437
2438         if (vma->vm_pgoff)
2439                 return -EINVAL;
2440
2441         mutex_lock(&po->pg_vec_lock);
2442
2443         expected_size = 0;
        /* rx_ring and tx_ring are adjacent members of struct packet_sock */
        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2445                 if (rb->pg_vec) {
2446                         expected_size += rb->pg_vec_len
2447                                                 * rb->pg_vec_pages
2448                                                 * PAGE_SIZE;
2449                 }
2450         }
2451
2452         if (expected_size == 0)
2453                 goto out;
2454
2455         size = vma->vm_end - vma->vm_start;
2456         if (size != expected_size)
2457                 goto out;
2458
2459         start = vma->vm_start;
2460         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2461                 if (rb->pg_vec == NULL)
2462                         continue;
2463
2464                 for (i = 0; i < rb->pg_vec_len; i++) {
2465                         struct page *page = virt_to_page(rb->pg_vec[i]);
2466                         int pg_num;
2467
2468                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2469                                         pg_num++, page++) {
2470                                 err = vm_insert_page(vma, start, page);
2471                                 if (unlikely(err))
2472                                         goto out;
2473                                 start += PAGE_SIZE;
2474                         }
2475                 }
2476         }
2477
2478         atomic_inc(&po->mapped);
2479         vma->vm_ops = &packet_mmap_ops;
2480         err = 0;
2481
2482 out:
2483         mutex_unlock(&po->pg_vec_lock);
2484         return err;
2485 }
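
/*
 * Illustrative userspace sketch (hypothetical): the mapping must start
 * at offset 0 and cover the configured ring(s) exactly; with both
 * rings set up, the RX ring comes first in the mapping.
 *
 *      size_t sz = req.tp_block_size * req.tp_block_nr;
 *      void *ring = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *                        MAP_SHARED, fd, 0);
 */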
2486 #endif
2487
2488
2489 static const struct proto_ops packet_ops_spkt = {
2490         .family =       PF_PACKET,
2491         .owner =        THIS_MODULE,
2492         .release =      packet_release,
2493         .bind =         packet_bind_spkt,
2494         .connect =      sock_no_connect,
2495         .socketpair =   sock_no_socketpair,
2496         .accept =       sock_no_accept,
2497         .getname =      packet_getname_spkt,
2498         .poll =         datagram_poll,
2499         .ioctl =        packet_ioctl,
2500         .listen =       sock_no_listen,
2501         .shutdown =     sock_no_shutdown,
2502         .setsockopt =   sock_no_setsockopt,
2503         .getsockopt =   sock_no_getsockopt,
2504         .sendmsg =      packet_sendmsg_spkt,
2505         .recvmsg =      packet_recvmsg,
2506         .mmap =         sock_no_mmap,
2507         .sendpage =     sock_no_sendpage,
2508 };
2509
2510 static const struct proto_ops packet_ops = {
2511         .family =       PF_PACKET,
2512         .owner =        THIS_MODULE,
2513         .release =      packet_release,
2514         .bind =         packet_bind,
2515         .connect =      sock_no_connect,
2516         .socketpair =   sock_no_socketpair,
2517         .accept =       sock_no_accept,
2518         .getname =      packet_getname,
2519         .poll =         packet_poll,
2520         .ioctl =        packet_ioctl,
2521         .listen =       sock_no_listen,
2522         .shutdown =     sock_no_shutdown,
2523         .setsockopt =   packet_setsockopt,
2524         .getsockopt =   packet_getsockopt,
2525         .sendmsg =      packet_sendmsg,
2526         .recvmsg =      packet_recvmsg,
2527         .mmap =         packet_mmap,
2528         .sendpage =     sock_no_sendpage,
2529 };
2530
2531 static const struct net_proto_family packet_family_ops = {
2532         .family =       PF_PACKET,
2533         .create =       packet_create,
2534         .owner  =       THIS_MODULE,
2535 };
2536
2537 static struct notifier_block packet_netdev_notifier = {
2538         .notifier_call =        packet_notifier,
2539 };
2540
2541 #ifdef CONFIG_PROC_FS
2542 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2543 {
2544         struct sock *s;
2545         struct hlist_node *node;
2546
2547         sk_for_each(s, node, &net->packet.sklist) {
2548                 if (!off--)
2549                         return s;
2550         }
2551         return NULL;
2552 }
2553
2554 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2555         __acquires(seq_file_net(seq)->packet.sklist_lock)
2556 {
2557         struct net *net = seq_file_net(seq);
2558         read_lock(&net->packet.sklist_lock);
2559         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2560 }
2561
static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct net *net = seq_file_net(seq);
        ++*pos;
        return (v == SEQ_START_TOKEN)
                ? sk_head(&net->packet.sklist)
                : sk_next((struct sock *)v);
}
2570
2571 static void packet_seq_stop(struct seq_file *seq, void *v)
2572         __releases(seq_file_net(seq)->packet.sklist_lock)
2573 {
2574         struct net *net = seq_file_net(seq);
2575         read_unlock(&net->packet.sklist_lock);
2576 }
2577
2578 static int packet_seq_show(struct seq_file *seq, void *v)
2579 {
2580         if (v == SEQ_START_TOKEN)
2581                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2582         else {
2583                 struct sock *s = v;
2584                 const struct packet_sock *po = pkt_sk(s);
2585
2586                 seq_printf(seq,
2587                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2588                            s,
2589                            atomic_read(&s->sk_refcnt),
2590                            s->sk_type,
2591                            ntohs(po->num),
2592                            po->ifindex,
2593                            po->running,
2594                            atomic_read(&s->sk_rmem_alloc),
2595                            sock_i_uid(s),
2596                            sock_i_ino(s));
2597         }
2598
2599         return 0;
2600 }
2601
2602 static const struct seq_operations packet_seq_ops = {
2603         .start  = packet_seq_start,
2604         .next   = packet_seq_next,
2605         .stop   = packet_seq_stop,
2606         .show   = packet_seq_show,
2607 };
2608
2609 static int packet_seq_open(struct inode *inode, struct file *file)
2610 {
2611         return seq_open_net(inode, file, &packet_seq_ops,
2612                             sizeof(struct seq_net_private));
2613 }
2614
2615 static const struct file_operations packet_seq_fops = {
2616         .owner          = THIS_MODULE,
2617         .open           = packet_seq_open,
2618         .read           = seq_read,
2619         .llseek         = seq_lseek,
2620         .release        = seq_release_net,
2621 };
2622
2623 #endif
2624
2625 static int __net_init packet_net_init(struct net *net)
2626 {
2627         rwlock_init(&net->packet.sklist_lock);
2628         INIT_HLIST_HEAD(&net->packet.sklist);
2629
2630         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2631                 return -ENOMEM;
2632
2633         return 0;
2634 }
2635
2636 static void __net_exit packet_net_exit(struct net *net)
2637 {
2638         proc_net_remove(net, "packet");
2639 }
2640
2641 static struct pernet_operations packet_net_ops = {
2642         .init = packet_net_init,
2643         .exit = packet_net_exit,
2644 };
2645
2646
2647 static void __exit packet_exit(void)
2648 {
2649         unregister_netdevice_notifier(&packet_netdev_notifier);
2650         unregister_pernet_subsys(&packet_net_ops);
2651         sock_unregister(PF_PACKET);
2652         proto_unregister(&packet_proto);
2653 }
2654
2655 static int __init packet_init(void)
2656 {
2657         int rc = proto_register(&packet_proto, 0);
2658
2659         if (rc != 0)
2660                 goto out;
2661
2662         sock_register(&packet_family_ops);
2663         register_pernet_subsys(&packet_net_ops);
2664         register_netdevice_notifier(&packet_netdev_notifier);
2665 out:
2666         return rc;
2667 }
2668
2669 module_init(packet_init);
2670 module_exit(packet_exit);
2671 MODULE_LICENSE("GPL");
2672 MODULE_ALIAS_NETPROTO(PF_PACKET);