3616f27b9d46c08e0f750b47865cc17c3130c5f2
[linux-3.10.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <linux/slab.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82 #include <linux/mutex.h>
83 #include <linux/if_vlan.h>
84 #include <linux/virtio_net.h>
85 #include <linux/errqueue.h>
86 #include <linux/net_tstamp.h>
87
88 #ifdef CONFIG_INET
89 #include <net/inet_common.h>
90 #endif
91
92 /*
93    Assumptions:
94    - if device has no dev->hard_header routine, it adds and removes ll header
95      inside itself. In this case ll header is invisible outside of device,
96      but higher levels still should reserve dev->hard_header_len.
97      Some devices are clever enough to reallocate the skb when the header
98      will not fit into the reserved space (tunnel); others are silly
99      (PPP).
100    - packet socket receives packets with pulled ll header,
101      so that SOCK_RAW should push it back.
102
103 On receive:
104 -----------
105
106 Incoming, dev->hard_header!=NULL
107    mac_header -> ll header
108    data       -> data
109
110 Outgoing, dev->hard_header!=NULL
111    mac_header -> ll header
112    data       -> ll header
113
114 Incoming, dev->hard_header==NULL
115    mac_header -> UNKNOWN position. It very likely points to the ll
116                  header.  PPP does this, which is wrong because it introduces
117                  asymmetry between rx and tx paths.
118    data       -> data
119
120 Outgoing, dev->hard_header==NULL
121    mac_header -> data. ll header is still not built!
122    data       -> data
123
124 Resume
125   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
126
127
128 On transmit:
129 ------------
130
131 dev->hard_header != NULL
132    mac_header -> ll header
133    data       -> ll header
134
135 dev->hard_header == NULL (ll header is added by device, we cannot control it)
136    mac_header -> data
137    data       -> data
138
139    We should set nh.raw on output to the correct position;
140    the packet classifier depends on it.
141  */
142
143 /* Private packet socket structures. */
144
145 struct packet_mclist {
146         struct packet_mclist    *next;
147         int                     ifindex;
148         int                     count;
149         unsigned short          type;
150         unsigned short          alen;
151         unsigned char           addr[MAX_ADDR_LEN];
152 };
153 /* identical to struct packet_mreq except it has
154  * a longer address field.
155  */
156 struct packet_mreq_max {
157         int             mr_ifindex;
158         unsigned short  mr_type;
159         unsigned short  mr_alen;
160         unsigned char   mr_address[MAX_ADDR_LEN];
161 };
162
/* Forward declaration; the definition appears later in this file. */
static int packet_set_ring(struct sock *sk,
			   struct tpacket_req *req,
			   int closing,
			   int tx_ring);
165
166 struct packet_ring_buffer {
167         char                    **pg_vec;
168         unsigned int            head;
169         unsigned int            frames_per_block;
170         unsigned int            frame_size;
171         unsigned int            frame_max;
172
173         unsigned int            pg_vec_order;
174         unsigned int            pg_vec_pages;
175         unsigned int            pg_vec_len;
176
177         atomic_t                pending;
178 };
179
/* Forward declarations for items defined further down in this file. */
struct packet_sock;

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static void packet_flush_mclist(struct sock *sk);
184
185 struct packet_sock {
186         /* struct sock has to be the first member of packet_sock */
187         struct sock             sk;
188         struct tpacket_stats    stats;
189         struct packet_ring_buffer       rx_ring;
190         struct packet_ring_buffer       tx_ring;
191         int                     copy_thresh;
192         spinlock_t              bind_lock;
193         struct mutex            pg_vec_lock;
194         unsigned int            running:1,      /* prot_hook is attached*/
195                                 auxdata:1,
196                                 origdev:1,
197                                 has_vnet_hdr:1;
198         int                     ifindex;        /* bound device         */
199         __be16                  num;
200         struct packet_mclist    *mclist;
201         atomic_t                mapped;
202         enum tpacket_versions   tp_version;
203         unsigned int            tp_hdrlen;
204         unsigned int            tp_reserve;
205         unsigned int            tp_loss:1;
206         unsigned int            tp_tstamp;
207         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
208 };
209
210 struct packet_skb_cb {
211         unsigned int origlen;
212         union {
213                 struct sockaddr_pkt pkt;
214                 struct sockaddr_ll ll;
215         } sa;
216 };
217
218 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
219
220 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
221 {
222         union {
223                 struct tpacket_hdr *h1;
224                 struct tpacket2_hdr *h2;
225                 void *raw;
226         } h;
227
228         h.raw = frame;
229         switch (po->tp_version) {
230         case TPACKET_V1:
231                 h.h1->tp_status = status;
232                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
233                 break;
234         case TPACKET_V2:
235                 h.h2->tp_status = status;
236                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
237                 break;
238         default:
239                 pr_err("TPACKET version not supported\n");
240                 BUG();
241         }
242
243         smp_wmb();
244 }
245
246 static int __packet_get_status(struct packet_sock *po, void *frame)
247 {
248         union {
249                 struct tpacket_hdr *h1;
250                 struct tpacket2_hdr *h2;
251                 void *raw;
252         } h;
253
254         smp_rmb();
255
256         h.raw = frame;
257         switch (po->tp_version) {
258         case TPACKET_V1:
259                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
260                 return h.h1->tp_status;
261         case TPACKET_V2:
262                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
263                 return h.h2->tp_status;
264         default:
265                 pr_err("TPACKET version not supported\n");
266                 BUG();
267                 return 0;
268         }
269 }
270
271 static void *packet_lookup_frame(struct packet_sock *po,
272                 struct packet_ring_buffer *rb,
273                 unsigned int position,
274                 int status)
275 {
276         unsigned int pg_vec_pos, frame_offset;
277         union {
278                 struct tpacket_hdr *h1;
279                 struct tpacket2_hdr *h2;
280                 void *raw;
281         } h;
282
283         pg_vec_pos = position / rb->frames_per_block;
284         frame_offset = position % rb->frames_per_block;
285
286         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
287
288         if (status != __packet_get_status(po, h.raw))
289                 return NULL;
290
291         return h.raw;
292 }
293
294 static inline void *packet_current_frame(struct packet_sock *po,
295                 struct packet_ring_buffer *rb,
296                 int status)
297 {
298         return packet_lookup_frame(po, rb, rb->head, status);
299 }
300
301 static inline void *packet_previous_frame(struct packet_sock *po,
302                 struct packet_ring_buffer *rb,
303                 int status)
304 {
305         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
306         return packet_lookup_frame(po, rb, previous, status);
307 }
308
309 static inline void packet_increment_head(struct packet_ring_buffer *buff)
310 {
311         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
312 }
313
/*
 * Convert a generic socket pointer into the packet-socket structure that
 * embeds it.  A plain cast is enough because struct sock is the first
 * member of struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
318
319 static void packet_sock_destruct(struct sock *sk)
320 {
321         skb_queue_purge(&sk->sk_error_queue);
322
323         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
324         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
325
326         if (!sock_flag(sk, SOCK_DEAD)) {
327                 pr_err("Attempt to release alive packet socket: %p\n", sk);
328                 return;
329         }
330
331         sk_refcnt_debug_dec(sk);
332 }
333
334
335 static const struct proto_ops packet_ops;
336
337 static const struct proto_ops packet_ops_spkt;
338
339 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
340                            struct packet_type *pt, struct net_device *orig_dev)
341 {
342         struct sock *sk;
343         struct sockaddr_pkt *spkt;
344
345         /*
346          *      When we registered the protocol we saved the socket in the data
347          *      field for just this event.
348          */
349
350         sk = pt->af_packet_priv;
351
352         /*
353          *      Yank back the headers [hope the device set this
354          *      right or kerboom...]
355          *
356          *      Incoming packets have ll header pulled,
357          *      push it back.
358          *
359          *      For outgoing ones skb->data == skb_mac_header(skb)
360          *      so that this procedure is noop.
361          */
362
363         if (skb->pkt_type == PACKET_LOOPBACK)
364                 goto out;
365
366         if (!net_eq(dev_net(dev), sock_net(sk)))
367                 goto out;
368
369         skb = skb_share_check(skb, GFP_ATOMIC);
370         if (skb == NULL)
371                 goto oom;
372
373         /* drop any routing info */
374         skb_dst_drop(skb);
375
376         /* drop conntrack reference */
377         nf_reset(skb);
378
379         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
380
381         skb_push(skb, skb->data - skb_mac_header(skb));
382
383         /*
384          *      The SOCK_PACKET socket receives _all_ frames.
385          */
386
387         spkt->spkt_family = dev->type;
388         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
389         spkt->spkt_protocol = skb->protocol;
390
391         /*
392          *      Charge the memory to the socket. This is done specifically
393          *      to prevent sockets using all the memory up.
394          */
395
396         if (sock_queue_rcv_skb(sk, skb) == 0)
397                 return 0;
398
399 out:
400         kfree_skb(skb);
401 oom:
402         return 0;
403 }
404
405
406 /*
407  *      Output a raw packet to a device layer. This bypasses all the other
408  *      protocol layers and you must therefore supply it with a complete frame
409  */
410
411 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
412                                struct msghdr *msg, size_t len)
413 {
414         struct sock *sk = sock->sk;
415         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
416         struct sk_buff *skb = NULL;
417         struct net_device *dev;
418         __be16 proto = 0;
419         int err;
420
421         /*
422          *      Get and verify the address.
423          */
424
425         if (saddr) {
426                 if (msg->msg_namelen < sizeof(struct sockaddr))
427                         return -EINVAL;
428                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
429                         proto = saddr->spkt_protocol;
430         } else
431                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
432
433         /*
434          *      Find the device first to size check it
435          */
436
437         saddr->spkt_device[13] = 0;
438 retry:
439         rcu_read_lock();
440         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
441         err = -ENODEV;
442         if (dev == NULL)
443                 goto out_unlock;
444
445         err = -ENETDOWN;
446         if (!(dev->flags & IFF_UP))
447                 goto out_unlock;
448
449         /*
450          * You may not queue a frame bigger than the mtu. This is the lowest level
451          * raw protocol and you must do your own fragmentation at this level.
452          */
453
454         err = -EMSGSIZE;
455         if (len > dev->mtu + dev->hard_header_len)
456                 goto out_unlock;
457
458         if (!skb) {
459                 size_t reserved = LL_RESERVED_SPACE(dev);
460                 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
461
462                 rcu_read_unlock();
463                 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
464                 if (skb == NULL)
465                         return -ENOBUFS;
466                 /* FIXME: Save some space for broken drivers that write a hard
467                  * header at transmission time by themselves. PPP is the notable
468                  * one here. This should really be fixed at the driver level.
469                  */
470                 skb_reserve(skb, reserved);
471                 skb_reset_network_header(skb);
472
473                 /* Try to align data part correctly */
474                 if (hhlen) {
475                         skb->data -= hhlen;
476                         skb->tail -= hhlen;
477                         if (len < hhlen)
478                                 skb_reset_network_header(skb);
479                 }
480                 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
481                 if (err)
482                         goto out_free;
483                 goto retry;
484         }
485
486
487         skb->protocol = proto;
488         skb->dev = dev;
489         skb->priority = sk->sk_priority;
490         skb->mark = sk->sk_mark;
491         err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
492         if (err < 0)
493                 goto out_unlock;
494
495         dev_queue_xmit(skb);
496         rcu_read_unlock();
497         return len;
498
499 out_unlock:
500         rcu_read_unlock();
501 out_free:
502         kfree_skb(skb);
503         return err;
504 }
505
506 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
507                                       unsigned int res)
508 {
509         struct sk_filter *filter;
510
511         rcu_read_lock_bh();
512         filter = rcu_dereference_bh(sk->sk_filter);
513         if (filter != NULL)
514                 res = sk_run_filter(skb, filter->insns, filter->len);
515         rcu_read_unlock_bh();
516
517         return res;
518 }
519
520 /*
521    This function makes lazy skb cloning in hope that most of packets
522    are discarded by BPF.
523
524    Note tricky part: we DO mangle shared skb! skb->data, skb->len
525    and skb->cb are mangled. It works because (and until) packets
526    falling here are owned by current CPU. Output packets are cloned
527    by dev_queue_xmit_nit(), input packets are processed by net_bh
528    sequencially, so that if we return skb to original state on exit,
529    we will not harm anyone.
530  */
531
532 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
533                       struct packet_type *pt, struct net_device *orig_dev)
534 {
535         struct sock *sk;
536         struct sockaddr_ll *sll;
537         struct packet_sock *po;
538         u8 *skb_head = skb->data;
539         int skb_len = skb->len;
540         unsigned int snaplen, res;
541
542         if (skb->pkt_type == PACKET_LOOPBACK)
543                 goto drop;
544
545         sk = pt->af_packet_priv;
546         po = pkt_sk(sk);
547
548         if (!net_eq(dev_net(dev), sock_net(sk)))
549                 goto drop;
550
551         skb->dev = dev;
552
553         if (dev->header_ops) {
554                 /* The device has an explicit notion of ll header,
555                    exported to higher levels.
556
557                    Otherwise, the device hides datails of it frame
558                    structure, so that corresponding packet head
559                    never delivered to user.
560                  */
561                 if (sk->sk_type != SOCK_DGRAM)
562                         skb_push(skb, skb->data - skb_mac_header(skb));
563                 else if (skb->pkt_type == PACKET_OUTGOING) {
564                         /* Special case: outgoing packets have ll header at head */
565                         skb_pull(skb, skb_network_offset(skb));
566                 }
567         }
568
569         snaplen = skb->len;
570
571         res = run_filter(skb, sk, snaplen);
572         if (!res)
573                 goto drop_n_restore;
574         if (snaplen > res)
575                 snaplen = res;
576
577         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
578             (unsigned)sk->sk_rcvbuf)
579                 goto drop_n_acct;
580
581         if (skb_shared(skb)) {
582                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
583                 if (nskb == NULL)
584                         goto drop_n_acct;
585
586                 if (skb_head != skb->data) {
587                         skb->data = skb_head;
588                         skb->len = skb_len;
589                 }
590                 kfree_skb(skb);
591                 skb = nskb;
592         }
593
594         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
595                      sizeof(skb->cb));
596
597         sll = &PACKET_SKB_CB(skb)->sa.ll;
598         sll->sll_family = AF_PACKET;
599         sll->sll_hatype = dev->type;
600         sll->sll_protocol = skb->protocol;
601         sll->sll_pkttype = skb->pkt_type;
602         if (unlikely(po->origdev))
603                 sll->sll_ifindex = orig_dev->ifindex;
604         else
605                 sll->sll_ifindex = dev->ifindex;
606
607         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
608
609         PACKET_SKB_CB(skb)->origlen = skb->len;
610
611         if (pskb_trim(skb, snaplen))
612                 goto drop_n_acct;
613
614         skb_set_owner_r(skb, sk);
615         skb->dev = NULL;
616         skb_dst_drop(skb);
617
618         /* drop conntrack reference */
619         nf_reset(skb);
620
621         spin_lock(&sk->sk_receive_queue.lock);
622         po->stats.tp_packets++;
623         skb->dropcount = atomic_read(&sk->sk_drops);
624         __skb_queue_tail(&sk->sk_receive_queue, skb);
625         spin_unlock(&sk->sk_receive_queue.lock);
626         sk->sk_data_ready(sk, skb->len);
627         return 0;
628
629 drop_n_acct:
630         po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
631
632 drop_n_restore:
633         if (skb_head != skb->data && skb_shared(skb)) {
634                 skb->data = skb_head;
635                 skb->len = skb_len;
636         }
637 drop:
638         consume_skb(skb);
639         return 0;
640 }
641
642 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
643                        struct packet_type *pt, struct net_device *orig_dev)
644 {
645         struct sock *sk;
646         struct packet_sock *po;
647         struct sockaddr_ll *sll;
648         union {
649                 struct tpacket_hdr *h1;
650                 struct tpacket2_hdr *h2;
651                 void *raw;
652         } h;
653         u8 *skb_head = skb->data;
654         int skb_len = skb->len;
655         unsigned int snaplen, res;
656         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
657         unsigned short macoff, netoff, hdrlen;
658         struct sk_buff *copy_skb = NULL;
659         struct timeval tv;
660         struct timespec ts;
661         struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
662
663         if (skb->pkt_type == PACKET_LOOPBACK)
664                 goto drop;
665
666         sk = pt->af_packet_priv;
667         po = pkt_sk(sk);
668
669         if (!net_eq(dev_net(dev), sock_net(sk)))
670                 goto drop;
671
672         if (dev->header_ops) {
673                 if (sk->sk_type != SOCK_DGRAM)
674                         skb_push(skb, skb->data - skb_mac_header(skb));
675                 else if (skb->pkt_type == PACKET_OUTGOING) {
676                         /* Special case: outgoing packets have ll header at head */
677                         skb_pull(skb, skb_network_offset(skb));
678                 }
679         }
680
681         if (skb->ip_summed == CHECKSUM_PARTIAL)
682                 status |= TP_STATUS_CSUMNOTREADY;
683
684         snaplen = skb->len;
685
686         res = run_filter(skb, sk, snaplen);
687         if (!res)
688                 goto drop_n_restore;
689         if (snaplen > res)
690                 snaplen = res;
691
692         if (sk->sk_type == SOCK_DGRAM) {
693                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
694                                   po->tp_reserve;
695         } else {
696                 unsigned maclen = skb_network_offset(skb);
697                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
698                                        (maclen < 16 ? 16 : maclen)) +
699                         po->tp_reserve;
700                 macoff = netoff - maclen;
701         }
702
703         if (macoff + snaplen > po->rx_ring.frame_size) {
704                 if (po->copy_thresh &&
705                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
706                     (unsigned)sk->sk_rcvbuf) {
707                         if (skb_shared(skb)) {
708                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
709                         } else {
710                                 copy_skb = skb_get(skb);
711                                 skb_head = skb->data;
712                         }
713                         if (copy_skb)
714                                 skb_set_owner_r(copy_skb, sk);
715                 }
716                 snaplen = po->rx_ring.frame_size - macoff;
717                 if ((int)snaplen < 0)
718                         snaplen = 0;
719         }
720
721         spin_lock(&sk->sk_receive_queue.lock);
722         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
723         if (!h.raw)
724                 goto ring_is_full;
725         packet_increment_head(&po->rx_ring);
726         po->stats.tp_packets++;
727         if (copy_skb) {
728                 status |= TP_STATUS_COPY;
729                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
730         }
731         if (!po->stats.tp_drops)
732                 status &= ~TP_STATUS_LOSING;
733         spin_unlock(&sk->sk_receive_queue.lock);
734
735         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
736
737         switch (po->tp_version) {
738         case TPACKET_V1:
739                 h.h1->tp_len = skb->len;
740                 h.h1->tp_snaplen = snaplen;
741                 h.h1->tp_mac = macoff;
742                 h.h1->tp_net = netoff;
743                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
744                                 && shhwtstamps->syststamp.tv64)
745                         tv = ktime_to_timeval(shhwtstamps->syststamp);
746                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
747                                 && shhwtstamps->hwtstamp.tv64)
748                         tv = ktime_to_timeval(shhwtstamps->hwtstamp);
749                 else if (skb->tstamp.tv64)
750                         tv = ktime_to_timeval(skb->tstamp);
751                 else
752                         do_gettimeofday(&tv);
753                 h.h1->tp_sec = tv.tv_sec;
754                 h.h1->tp_usec = tv.tv_usec;
755                 hdrlen = sizeof(*h.h1);
756                 break;
757         case TPACKET_V2:
758                 h.h2->tp_len = skb->len;
759                 h.h2->tp_snaplen = snaplen;
760                 h.h2->tp_mac = macoff;
761                 h.h2->tp_net = netoff;
762                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
763                                 && shhwtstamps->syststamp.tv64)
764                         ts = ktime_to_timespec(shhwtstamps->syststamp);
765                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
766                                 && shhwtstamps->hwtstamp.tv64)
767                         ts = ktime_to_timespec(shhwtstamps->hwtstamp);
768                 else if (skb->tstamp.tv64)
769                         ts = ktime_to_timespec(skb->tstamp);
770                 else
771                         getnstimeofday(&ts);
772                 h.h2->tp_sec = ts.tv_sec;
773                 h.h2->tp_nsec = ts.tv_nsec;
774                 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
775                 hdrlen = sizeof(*h.h2);
776                 break;
777         default:
778                 BUG();
779         }
780
781         sll = h.raw + TPACKET_ALIGN(hdrlen);
782         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
783         sll->sll_family = AF_PACKET;
784         sll->sll_hatype = dev->type;
785         sll->sll_protocol = skb->protocol;
786         sll->sll_pkttype = skb->pkt_type;
787         if (unlikely(po->origdev))
788                 sll->sll_ifindex = orig_dev->ifindex;
789         else
790                 sll->sll_ifindex = dev->ifindex;
791
792         __packet_set_status(po, h.raw, status);
793         smp_mb();
794         {
795                 struct page *p_start, *p_end;
796                 u8 *h_end = h.raw + macoff + snaplen - 1;
797
798                 p_start = virt_to_page(h.raw);
799                 p_end = virt_to_page(h_end);
800                 while (p_start <= p_end) {
801                         flush_dcache_page(p_start);
802                         p_start++;
803                 }
804         }
805
806         sk->sk_data_ready(sk, 0);
807
808 drop_n_restore:
809         if (skb_head != skb->data && skb_shared(skb)) {
810                 skb->data = skb_head;
811                 skb->len = skb_len;
812         }
813 drop:
814         kfree_skb(skb);
815         return 0;
816
817 ring_is_full:
818         po->stats.tp_drops++;
819         spin_unlock(&sk->sk_receive_queue.lock);
820
821         sk->sk_data_ready(sk, 0);
822         kfree_skb(copy_skb);
823         goto drop_n_restore;
824 }
825
826 static void tpacket_destruct_skb(struct sk_buff *skb)
827 {
828         struct packet_sock *po = pkt_sk(skb->sk);
829         void *ph;
830
831         BUG_ON(skb == NULL);
832
833         if (likely(po->tx_ring.pg_vec)) {
834                 ph = skb_shinfo(skb)->destructor_arg;
835                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
836                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
837                 atomic_dec(&po->tx_ring.pending);
838                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
839         }
840
841         sock_wfree(skb);
842 }
843
844 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
845                 void *frame, struct net_device *dev, int size_max,
846                 __be16 proto, unsigned char *addr)
847 {
848         union {
849                 struct tpacket_hdr *h1;
850                 struct tpacket2_hdr *h2;
851                 void *raw;
852         } ph;
853         int to_write, offset, len, tp_len, nr_frags, len_max;
854         struct socket *sock = po->sk.sk_socket;
855         struct page *page;
856         void *data;
857         int err;
858
859         ph.raw = frame;
860
861         skb->protocol = proto;
862         skb->dev = dev;
863         skb->priority = po->sk.sk_priority;
864         skb->mark = po->sk.sk_mark;
865         skb_shinfo(skb)->destructor_arg = ph.raw;
866
867         switch (po->tp_version) {
868         case TPACKET_V2:
869                 tp_len = ph.h2->tp_len;
870                 break;
871         default:
872                 tp_len = ph.h1->tp_len;
873                 break;
874         }
875         if (unlikely(tp_len > size_max)) {
876                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
877                 return -EMSGSIZE;
878         }
879
880         skb_reserve(skb, LL_RESERVED_SPACE(dev));
881         skb_reset_network_header(skb);
882
883         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
884         to_write = tp_len;
885
886         if (sock->type == SOCK_DGRAM) {
887                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
888                                 NULL, tp_len);
889                 if (unlikely(err < 0))
890                         return -EINVAL;
891         } else if (dev->hard_header_len) {
892                 /* net device doesn't like empty head */
893                 if (unlikely(tp_len <= dev->hard_header_len)) {
894                         pr_err("packet size is too short (%d < %d)\n",
895                                tp_len, dev->hard_header_len);
896                         return -EINVAL;
897                 }
898
899                 skb_push(skb, dev->hard_header_len);
900                 err = skb_store_bits(skb, 0, data,
901                                 dev->hard_header_len);
902                 if (unlikely(err))
903                         return err;
904
905                 data += dev->hard_header_len;
906                 to_write -= dev->hard_header_len;
907         }
908
909         err = -EFAULT;
910         page = virt_to_page(data);
911         offset = offset_in_page(data);
912         len_max = PAGE_SIZE - offset;
913         len = ((to_write > len_max) ? len_max : to_write);
914
915         skb->data_len = to_write;
916         skb->len += to_write;
917         skb->truesize += to_write;
918         atomic_add(to_write, &po->sk.sk_wmem_alloc);
919
920         while (likely(to_write)) {
921                 nr_frags = skb_shinfo(skb)->nr_frags;
922
923                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
924                         pr_err("Packet exceed the number of skb frags(%lu)\n",
925                                MAX_SKB_FRAGS);
926                         return -EFAULT;
927                 }
928
929                 flush_dcache_page(page);
930                 get_page(page);
931                 skb_fill_page_desc(skb,
932                                 nr_frags,
933                                 page++, offset, len);
934                 to_write -= len;
935                 offset = 0;
936                 len_max = PAGE_SIZE;
937                 len = ((to_write > len_max) ? len_max : to_write);
938         }
939
940         return tp_len;
941 }
942
943 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
944 {
945         struct socket *sock;
946         struct sk_buff *skb;
947         struct net_device *dev;
948         __be16 proto;
949         int ifindex, err, reserve = 0;
950         void *ph;
951         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
952         int tp_len, size_max;
953         unsigned char *addr;
954         int len_sum = 0;
955         int status = 0;
956
957         sock = po->sk.sk_socket;
958
959         mutex_lock(&po->pg_vec_lock);
960
961         err = -EBUSY;
962         if (saddr == NULL) {
963                 ifindex = po->ifindex;
964                 proto   = po->num;
965                 addr    = NULL;
966         } else {
967                 err = -EINVAL;
968                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
969                         goto out;
970                 if (msg->msg_namelen < (saddr->sll_halen
971                                         + offsetof(struct sockaddr_ll,
972                                                 sll_addr)))
973                         goto out;
974                 ifindex = saddr->sll_ifindex;
975                 proto   = saddr->sll_protocol;
976                 addr    = saddr->sll_addr;
977         }
978
979         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
980         err = -ENXIO;
981         if (unlikely(dev == NULL))
982                 goto out;
983
984         reserve = dev->hard_header_len;
985
986         err = -ENETDOWN;
987         if (unlikely(!(dev->flags & IFF_UP)))
988                 goto out_put;
989
990         size_max = po->tx_ring.frame_size
991                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
992
993         if (size_max > dev->mtu + reserve)
994                 size_max = dev->mtu + reserve;
995
996         do {
997                 ph = packet_current_frame(po, &po->tx_ring,
998                                 TP_STATUS_SEND_REQUEST);
999
1000                 if (unlikely(ph == NULL)) {
1001                         schedule();
1002                         continue;
1003                 }
1004
1005                 status = TP_STATUS_SEND_REQUEST;
1006                 skb = sock_alloc_send_skb(&po->sk,
1007                                 LL_ALLOCATED_SPACE(dev)
1008                                 + sizeof(struct sockaddr_ll),
1009                                 0, &err);
1010
1011                 if (unlikely(skb == NULL))
1012                         goto out_status;
1013
1014                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1015                                 addr);
1016
1017                 if (unlikely(tp_len < 0)) {
1018                         if (po->tp_loss) {
1019                                 __packet_set_status(po, ph,
1020                                                 TP_STATUS_AVAILABLE);
1021                                 packet_increment_head(&po->tx_ring);
1022                                 kfree_skb(skb);
1023                                 continue;
1024                         } else {
1025                                 status = TP_STATUS_WRONG_FORMAT;
1026                                 err = tp_len;
1027                                 goto out_status;
1028                         }
1029                 }
1030
1031                 skb->destructor = tpacket_destruct_skb;
1032                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1033                 atomic_inc(&po->tx_ring.pending);
1034
1035                 status = TP_STATUS_SEND_REQUEST;
1036                 err = dev_queue_xmit(skb);
1037                 if (unlikely(err > 0)) {
1038                         err = net_xmit_errno(err);
1039                         if (err && __packet_get_status(po, ph) ==
1040                                    TP_STATUS_AVAILABLE) {
1041                                 /* skb was destructed already */
1042                                 skb = NULL;
1043                                 goto out_status;
1044                         }
1045                         /*
1046                          * skb was dropped but not destructed yet;
1047                          * let's treat it like congestion or err < 0
1048                          */
1049                         err = 0;
1050                 }
1051                 packet_increment_head(&po->tx_ring);
1052                 len_sum += tp_len;
1053         } while (likely((ph != NULL) ||
1054                         ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1055                          (atomic_read(&po->tx_ring.pending))))
1056                 );
1057
1058         err = len_sum;
1059         goto out_put;
1060
1061 out_status:
1062         __packet_set_status(po, ph, status);
1063         kfree_skb(skb);
1064 out_put:
1065         dev_put(dev);
1066 out:
1067         mutex_unlock(&po->pg_vec_lock);
1068         return err;
1069 }
1070
1071 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1072                                                size_t reserve, size_t len,
1073                                                size_t linear, int noblock,
1074                                                int *err)
1075 {
1076         struct sk_buff *skb;
1077
1078         /* Under a page?  Don't bother with paged skb. */
1079         if (prepad + len < PAGE_SIZE || !linear)
1080                 linear = len;
1081
1082         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1083                                    err);
1084         if (!skb)
1085                 return NULL;
1086
1087         skb_reserve(skb, reserve);
1088         skb_put(skb, linear);
1089         skb->data_len = len - linear;
1090         skb->len += len - linear;
1091
1092         return skb;
1093 }
1094
1095 static int packet_snd(struct socket *sock,
1096                           struct msghdr *msg, size_t len)
1097 {
1098         struct sock *sk = sock->sk;
1099         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1100         struct sk_buff *skb;
1101         struct net_device *dev;
1102         __be16 proto;
1103         unsigned char *addr;
1104         int ifindex, err, reserve = 0;
1105         struct virtio_net_hdr vnet_hdr = { 0 };
1106         int offset = 0;
1107         int vnet_hdr_len;
1108         struct packet_sock *po = pkt_sk(sk);
1109         unsigned short gso_type = 0;
1110
1111         /*
1112          *      Get and verify the address.
1113          */
1114
1115         if (saddr == NULL) {
1116                 ifindex = po->ifindex;
1117                 proto   = po->num;
1118                 addr    = NULL;
1119         } else {
1120                 err = -EINVAL;
1121                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1122                         goto out;
1123                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1124                         goto out;
1125                 ifindex = saddr->sll_ifindex;
1126                 proto   = saddr->sll_protocol;
1127                 addr    = saddr->sll_addr;
1128         }
1129
1130
1131         dev = dev_get_by_index(sock_net(sk), ifindex);
1132         err = -ENXIO;
1133         if (dev == NULL)
1134                 goto out_unlock;
1135         if (sock->type == SOCK_RAW)
1136                 reserve = dev->hard_header_len;
1137
1138         err = -ENETDOWN;
1139         if (!(dev->flags & IFF_UP))
1140                 goto out_unlock;
1141
1142         if (po->has_vnet_hdr) {
1143                 vnet_hdr_len = sizeof(vnet_hdr);
1144
1145                 err = -EINVAL;
1146                 if (len < vnet_hdr_len)
1147                         goto out_unlock;
1148
1149                 len -= vnet_hdr_len;
1150
1151                 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1152                                        vnet_hdr_len);
1153                 if (err < 0)
1154                         goto out_unlock;
1155
1156                 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1157                     (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1158                       vnet_hdr.hdr_len))
1159                         vnet_hdr.hdr_len = vnet_hdr.csum_start +
1160                                                  vnet_hdr.csum_offset + 2;
1161
1162                 err = -EINVAL;
1163                 if (vnet_hdr.hdr_len > len)
1164                         goto out_unlock;
1165
1166                 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1167                         switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1168                         case VIRTIO_NET_HDR_GSO_TCPV4:
1169                                 gso_type = SKB_GSO_TCPV4;
1170                                 break;
1171                         case VIRTIO_NET_HDR_GSO_TCPV6:
1172                                 gso_type = SKB_GSO_TCPV6;
1173                                 break;
1174                         case VIRTIO_NET_HDR_GSO_UDP:
1175                                 gso_type = SKB_GSO_UDP;
1176                                 break;
1177                         default:
1178                                 goto out_unlock;
1179                         }
1180
1181                         if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1182                                 gso_type |= SKB_GSO_TCP_ECN;
1183
1184                         if (vnet_hdr.gso_size == 0)
1185                                 goto out_unlock;
1186
1187                 }
1188         }
1189
1190         err = -EMSGSIZE;
1191         if (!gso_type && (len > dev->mtu+reserve))
1192                 goto out_unlock;
1193
1194         err = -ENOBUFS;
1195         skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1196                                LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1197                                msg->msg_flags & MSG_DONTWAIT, &err);
1198         if (skb == NULL)
1199                 goto out_unlock;
1200
1201         skb_set_network_header(skb, reserve);
1202
1203         err = -EINVAL;
1204         if (sock->type == SOCK_DGRAM &&
1205             (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1206                 goto out_free;
1207
1208         /* Returns -EFAULT on error */
1209         err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1210         if (err)
1211                 goto out_free;
1212         err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1213         if (err < 0)
1214                 goto out_free;
1215
1216         skb->protocol = proto;
1217         skb->dev = dev;
1218         skb->priority = sk->sk_priority;
1219         skb->mark = sk->sk_mark;
1220
1221         if (po->has_vnet_hdr) {
1222                 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1223                         if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1224                                                   vnet_hdr.csum_offset)) {
1225                                 err = -EINVAL;
1226                                 goto out_free;
1227                         }
1228                 }
1229
1230                 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1231                 skb_shinfo(skb)->gso_type = gso_type;
1232
1233                 /* Header must be checked, and gso_segs computed. */
1234                 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1235                 skb_shinfo(skb)->gso_segs = 0;
1236
1237                 len += vnet_hdr_len;
1238         }
1239
1240         /*
1241          *      Now send it
1242          */
1243
1244         err = dev_queue_xmit(skb);
1245         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1246                 goto out_unlock;
1247
1248         dev_put(dev);
1249
1250         return len;
1251
1252 out_free:
1253         kfree_skb(skb);
1254 out_unlock:
1255         if (dev)
1256                 dev_put(dev);
1257 out:
1258         return err;
1259 }
1260
1261 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1262                 struct msghdr *msg, size_t len)
1263 {
1264         struct sock *sk = sock->sk;
1265         struct packet_sock *po = pkt_sk(sk);
1266         if (po->tx_ring.pg_vec)
1267                 return tpacket_snd(po, msg);
1268         else
1269                 return packet_snd(sock, msg, len);
1270 }
1271
1272 /*
1273  *      Close a PACKET socket. This is fairly simple. We immediately go
1274  *      to 'closed' state and remove our protocol entry in the device list.
1275  */
1276
1277 static int packet_release(struct socket *sock)
1278 {
1279         struct sock *sk = sock->sk;
1280         struct packet_sock *po;
1281         struct net *net;
1282         struct tpacket_req req;
1283
1284         if (!sk)
1285                 return 0;
1286
1287         net = sock_net(sk);
1288         po = pkt_sk(sk);
1289
1290         spin_lock_bh(&net->packet.sklist_lock);
1291         sk_del_node_init_rcu(sk);
1292         sock_prot_inuse_add(net, sk->sk_prot, -1);
1293         spin_unlock_bh(&net->packet.sklist_lock);
1294
1295         spin_lock(&po->bind_lock);
1296         if (po->running) {
1297                 /*
1298                  * Remove from protocol table
1299                  */
1300                 po->running = 0;
1301                 po->num = 0;
1302                 __dev_remove_pack(&po->prot_hook);
1303                 __sock_put(sk);
1304         }
1305         spin_unlock(&po->bind_lock);
1306
1307         packet_flush_mclist(sk);
1308
1309         memset(&req, 0, sizeof(req));
1310
1311         if (po->rx_ring.pg_vec)
1312                 packet_set_ring(sk, &req, 1, 0);
1313
1314         if (po->tx_ring.pg_vec)
1315                 packet_set_ring(sk, &req, 1, 1);
1316
1317         synchronize_net();
1318         /*
1319          *      Now the socket is dead. No more input will appear.
1320          */
1321         sock_orphan(sk);
1322         sock->sk = NULL;
1323
1324         /* Purge queues */
1325
1326         skb_queue_purge(&sk->sk_receive_queue);
1327         sk_refcnt_debug_release(sk);
1328
1329         sock_put(sk);
1330         return 0;
1331 }
1332
1333 /*
1334  *      Attach a packet hook.
1335  */
1336
1337 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1338 {
1339         struct packet_sock *po = pkt_sk(sk);
1340         /*
1341          *      Detach an existing hook if present.
1342          */
1343
1344         lock_sock(sk);
1345
1346         spin_lock(&po->bind_lock);
1347         if (po->running) {
1348                 __sock_put(sk);
1349                 po->running = 0;
1350                 po->num = 0;
1351                 spin_unlock(&po->bind_lock);
1352                 dev_remove_pack(&po->prot_hook);
1353                 spin_lock(&po->bind_lock);
1354         }
1355
1356         po->num = protocol;
1357         po->prot_hook.type = protocol;
1358         po->prot_hook.dev = dev;
1359
1360         po->ifindex = dev ? dev->ifindex : 0;
1361
1362         if (protocol == 0)
1363                 goto out_unlock;
1364
1365         if (!dev || (dev->flags & IFF_UP)) {
1366                 dev_add_pack(&po->prot_hook);
1367                 sock_hold(sk);
1368                 po->running = 1;
1369         } else {
1370                 sk->sk_err = ENETDOWN;
1371                 if (!sock_flag(sk, SOCK_DEAD))
1372                         sk->sk_error_report(sk);
1373         }
1374
1375 out_unlock:
1376         spin_unlock(&po->bind_lock);
1377         release_sock(sk);
1378         return 0;
1379 }
1380
1381 /*
1382  *      Bind a packet socket to a device
1383  */
1384
1385 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1386                             int addr_len)
1387 {
1388         struct sock *sk = sock->sk;
1389         char name[15];
1390         struct net_device *dev;
1391         int err = -ENODEV;
1392
1393         /*
1394          *      Check legality
1395          */
1396
1397         if (addr_len != sizeof(struct sockaddr))
1398                 return -EINVAL;
1399         strlcpy(name, uaddr->sa_data, sizeof(name));
1400
1401         dev = dev_get_by_name(sock_net(sk), name);
1402         if (dev) {
1403                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1404                 dev_put(dev);
1405         }
1406         return err;
1407 }
1408
1409 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1410 {
1411         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1412         struct sock *sk = sock->sk;
1413         struct net_device *dev = NULL;
1414         int err;
1415
1416
1417         /*
1418          *      Check legality
1419          */
1420
1421         if (addr_len < sizeof(struct sockaddr_ll))
1422                 return -EINVAL;
1423         if (sll->sll_family != AF_PACKET)
1424                 return -EINVAL;
1425
1426         if (sll->sll_ifindex) {
1427                 err = -ENODEV;
1428                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1429                 if (dev == NULL)
1430                         goto out;
1431         }
1432         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1433         if (dev)
1434                 dev_put(dev);
1435
1436 out:
1437         return err;
1438 }
1439
1440 static struct proto packet_proto = {
1441         .name     = "PACKET",
1442         .owner    = THIS_MODULE,
1443         .obj_size = sizeof(struct packet_sock),
1444 };
1445
1446 /*
1447  *      Create a packet of type SOCK_PACKET.
1448  */
1449
1450 static int packet_create(struct net *net, struct socket *sock, int protocol,
1451                          int kern)
1452 {
1453         struct sock *sk;
1454         struct packet_sock *po;
1455         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1456         int err;
1457
1458         if (!capable(CAP_NET_RAW))
1459                 return -EPERM;
1460         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1461             sock->type != SOCK_PACKET)
1462                 return -ESOCKTNOSUPPORT;
1463
1464         sock->state = SS_UNCONNECTED;
1465
1466         err = -ENOBUFS;
1467         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1468         if (sk == NULL)
1469                 goto out;
1470
1471         sock->ops = &packet_ops;
1472         if (sock->type == SOCK_PACKET)
1473                 sock->ops = &packet_ops_spkt;
1474
1475         sock_init_data(sock, sk);
1476
1477         po = pkt_sk(sk);
1478         sk->sk_family = PF_PACKET;
1479         po->num = proto;
1480
1481         sk->sk_destruct = packet_sock_destruct;
1482         sk_refcnt_debug_inc(sk);
1483
1484         /*
1485          *      Attach a protocol block
1486          */
1487
1488         spin_lock_init(&po->bind_lock);
1489         mutex_init(&po->pg_vec_lock);
1490         po->prot_hook.func = packet_rcv;
1491
1492         if (sock->type == SOCK_PACKET)
1493                 po->prot_hook.func = packet_rcv_spkt;
1494
1495         po->prot_hook.af_packet_priv = sk;
1496
1497         if (proto) {
1498                 po->prot_hook.type = proto;
1499                 dev_add_pack(&po->prot_hook);
1500                 sock_hold(sk);
1501                 po->running = 1;
1502         }
1503
1504         spin_lock_bh(&net->packet.sklist_lock);
1505         sk_add_node_rcu(sk, &net->packet.sklist);
1506         sock_prot_inuse_add(net, &packet_proto, 1);
1507         spin_unlock_bh(&net->packet.sklist_lock);
1508
1509         return 0;
1510 out:
1511         return err;
1512 }
1513
1514 static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
1515 {
1516         struct sock_exterr_skb *serr;
1517         struct sk_buff *skb, *skb2;
1518         int copied, err;
1519
1520         err = -EAGAIN;
1521         skb = skb_dequeue(&sk->sk_error_queue);
1522         if (skb == NULL)
1523                 goto out;
1524
1525         copied = skb->len;
1526         if (copied > len) {
1527                 msg->msg_flags |= MSG_TRUNC;
1528                 copied = len;
1529         }
1530         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1531         if (err)
1532                 goto out_free_skb;
1533
1534         sock_recv_timestamp(msg, sk, skb);
1535
1536         serr = SKB_EXT_ERR(skb);
1537         put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
1538                  sizeof(serr->ee), &serr->ee);
1539
1540         msg->msg_flags |= MSG_ERRQUEUE;
1541         err = copied;
1542
1543         /* Reset and regenerate socket error */
1544         spin_lock_bh(&sk->sk_error_queue.lock);
1545         sk->sk_err = 0;
1546         if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
1547                 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
1548                 spin_unlock_bh(&sk->sk_error_queue.lock);
1549                 sk->sk_error_report(sk);
1550         } else
1551                 spin_unlock_bh(&sk->sk_error_queue.lock);
1552
1553 out_free_skb:
1554         kfree_skb(skb);
1555 out:
1556         return err;
1557 }
1558
1559 /*
1560  *      Pull a packet from our receive queue and hand it to the user.
1561  *      If necessary we block.
1562  */
1563
1564 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1565                           struct msghdr *msg, size_t len, int flags)
1566 {
1567         struct sock *sk = sock->sk;
1568         struct sk_buff *skb;
1569         int copied, err;
1570         struct sockaddr_ll *sll;
1571         int vnet_hdr_len = 0;
1572
1573         err = -EINVAL;
1574         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1575                 goto out;
1576
1577 #if 0
1578         /* What error should we return now? EUNATTACH? */
1579         if (pkt_sk(sk)->ifindex < 0)
1580                 return -ENODEV;
1581 #endif
1582
1583         if (flags & MSG_ERRQUEUE) {
1584                 err = packet_recv_error(sk, msg, len);
1585                 goto out;
1586         }
1587
1588         /*
1589          *      Call the generic datagram receiver. This handles all sorts
1590          *      of horrible races and re-entrancy so we can forget about it
1591          *      in the protocol layers.
1592          *
1593          *      Now it will return ENETDOWN, if device have just gone down,
1594          *      but then it will block.
1595          */
1596
1597         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1598
1599         /*
1600          *      An error occurred so return it. Because skb_recv_datagram()
1601          *      handles the blocking we don't see and worry about blocking
1602          *      retries.
1603          */
1604
1605         if (skb == NULL)
1606                 goto out;
1607
1608         if (pkt_sk(sk)->has_vnet_hdr) {
1609                 struct virtio_net_hdr vnet_hdr = { 0 };
1610
1611                 err = -EINVAL;
1612                 vnet_hdr_len = sizeof(vnet_hdr);
1613                 if ((len -= vnet_hdr_len) < 0)
1614                         goto out_free;
1615
1616                 if (skb_is_gso(skb)) {
1617                         struct skb_shared_info *sinfo = skb_shinfo(skb);
1618
1619                         /* This is a hint as to how much should be linear. */
1620                         vnet_hdr.hdr_len = skb_headlen(skb);
1621                         vnet_hdr.gso_size = sinfo->gso_size;
1622                         if (sinfo->gso_type & SKB_GSO_TCPV4)
1623                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1624                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
1625                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1626                         else if (sinfo->gso_type & SKB_GSO_UDP)
1627                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1628                         else if (sinfo->gso_type & SKB_GSO_FCOE)
1629                                 goto out_free;
1630                         else
1631                                 BUG();
1632                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1633                                 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1634                 } else
1635                         vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1636
1637                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1638                         vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1639                         vnet_hdr.csum_start = skb->csum_start -
1640                                                         skb_headroom(skb);
1641                         vnet_hdr.csum_offset = skb->csum_offset;
1642                 } /* else everything is zero */
1643
1644                 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1645                                      vnet_hdr_len);
1646                 if (err < 0)
1647                         goto out_free;
1648         }
1649
1650         /*
1651          *      If the address length field is there to be filled in, we fill
1652          *      it in now.
1653          */
1654
1655         sll = &PACKET_SKB_CB(skb)->sa.ll;
1656         if (sock->type == SOCK_PACKET)
1657                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1658         else
1659                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1660
1661         /*
1662          *      You lose any data beyond the buffer you gave. If it worries a
1663          *      user program they can ask the device for its MTU anyway.
1664          */
1665
1666         copied = skb->len;
1667         if (copied > len) {
1668                 copied = len;
1669                 msg->msg_flags |= MSG_TRUNC;
1670         }
1671
1672         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1673         if (err)
1674                 goto out_free;
1675
1676         sock_recv_ts_and_drops(msg, sk, skb);
1677
1678         if (msg->msg_name)
1679                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1680                        msg->msg_namelen);
1681
1682         if (pkt_sk(sk)->auxdata) {
1683                 struct tpacket_auxdata aux;
1684
1685                 aux.tp_status = TP_STATUS_USER;
1686                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1687                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1688                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1689                 aux.tp_snaplen = skb->len;
1690                 aux.tp_mac = 0;
1691                 aux.tp_net = skb_network_offset(skb);
1692                 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1693
1694                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1695         }
1696
1697         /*
1698          *      Free or return the buffer as appropriate. Again this
1699          *      hides all the races and re-entrancy issues from us.
1700          */
1701         err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1702
1703 out_free:
1704         skb_free_datagram(sk, skb);
1705 out:
1706         return err;
1707 }
1708
1709 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1710                                int *uaddr_len, int peer)
1711 {
1712         struct net_device *dev;
1713         struct sock *sk = sock->sk;
1714
1715         if (peer)
1716                 return -EOPNOTSUPP;
1717
1718         uaddr->sa_family = AF_PACKET;
1719         rcu_read_lock();
1720         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1721         if (dev)
1722                 strlcpy(uaddr->sa_data, dev->name, 15);
1723         else
1724                 memset(uaddr->sa_data, 0, 14);
1725         rcu_read_unlock();
1726         *uaddr_len = sizeof(*uaddr);
1727
1728         return 0;
1729 }
1730
1731 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1732                           int *uaddr_len, int peer)
1733 {
1734         struct net_device *dev;
1735         struct sock *sk = sock->sk;
1736         struct packet_sock *po = pkt_sk(sk);
1737         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1738
1739         if (peer)
1740                 return -EOPNOTSUPP;
1741
1742         sll->sll_family = AF_PACKET;
1743         sll->sll_ifindex = po->ifindex;
1744         sll->sll_protocol = po->num;
1745         rcu_read_lock();
1746         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1747         if (dev) {
1748                 sll->sll_hatype = dev->type;
1749                 sll->sll_halen = dev->addr_len;
1750                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1751         } else {
1752                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1753                 sll->sll_halen = 0;
1754         }
1755         rcu_read_unlock();
1756         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1757
1758         return 0;
1759 }
1760
1761 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1762                          int what)
1763 {
1764         switch (i->type) {
1765         case PACKET_MR_MULTICAST:
1766                 if (i->alen != dev->addr_len)
1767                         return -EINVAL;
1768                 if (what > 0)
1769                         return dev_mc_add(dev, i->addr);
1770                 else
1771                         return dev_mc_del(dev, i->addr);
1772                 break;
1773         case PACKET_MR_PROMISC:
1774                 return dev_set_promiscuity(dev, what);
1775                 break;
1776         case PACKET_MR_ALLMULTI:
1777                 return dev_set_allmulti(dev, what);
1778                 break;
1779         case PACKET_MR_UNICAST:
1780                 if (i->alen != dev->addr_len)
1781                         return -EINVAL;
1782                 if (what > 0)
1783                         return dev_uc_add(dev, i->addr);
1784                 else
1785                         return dev_uc_del(dev, i->addr);
1786                 break;
1787         default:
1788                 break;
1789         }
1790         return 0;
1791 }
1792
1793 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1794 {
1795         for ( ; i; i = i->next) {
1796                 if (i->ifindex == dev->ifindex)
1797                         packet_dev_mc(dev, i, what);
1798         }
1799 }
1800
1801 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1802 {
1803         struct packet_sock *po = pkt_sk(sk);
1804         struct packet_mclist *ml, *i;
1805         struct net_device *dev;
1806         int err;
1807
1808         rtnl_lock();
1809
1810         err = -ENODEV;
1811         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1812         if (!dev)
1813                 goto done;
1814
1815         err = -EINVAL;
1816         if (mreq->mr_alen > dev->addr_len)
1817                 goto done;
1818
1819         err = -ENOBUFS;
1820         i = kmalloc(sizeof(*i), GFP_KERNEL);
1821         if (i == NULL)
1822                 goto done;
1823
1824         err = 0;
1825         for (ml = po->mclist; ml; ml = ml->next) {
1826                 if (ml->ifindex == mreq->mr_ifindex &&
1827                     ml->type == mreq->mr_type &&
1828                     ml->alen == mreq->mr_alen &&
1829                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1830                         ml->count++;
1831                         /* Free the new element ... */
1832                         kfree(i);
1833                         goto done;
1834                 }
1835         }
1836
1837         i->type = mreq->mr_type;
1838         i->ifindex = mreq->mr_ifindex;
1839         i->alen = mreq->mr_alen;
1840         memcpy(i->addr, mreq->mr_address, i->alen);
1841         i->count = 1;
1842         i->next = po->mclist;
1843         po->mclist = i;
1844         err = packet_dev_mc(dev, i, 1);
1845         if (err) {
1846                 po->mclist = i->next;
1847                 kfree(i);
1848         }
1849
1850 done:
1851         rtnl_unlock();
1852         return err;
1853 }
1854
1855 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1856 {
1857         struct packet_mclist *ml, **mlp;
1858
1859         rtnl_lock();
1860
1861         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1862                 if (ml->ifindex == mreq->mr_ifindex &&
1863                     ml->type == mreq->mr_type &&
1864                     ml->alen == mreq->mr_alen &&
1865                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1866                         if (--ml->count == 0) {
1867                                 struct net_device *dev;
1868                                 *mlp = ml->next;
1869                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1870                                 if (dev)
1871                                         packet_dev_mc(dev, ml, -1);
1872                                 kfree(ml);
1873                         }
1874                         rtnl_unlock();
1875                         return 0;
1876                 }
1877         }
1878         rtnl_unlock();
1879         return -EADDRNOTAVAIL;
1880 }
1881
1882 static void packet_flush_mclist(struct sock *sk)
1883 {
1884         struct packet_sock *po = pkt_sk(sk);
1885         struct packet_mclist *ml;
1886
1887         if (!po->mclist)
1888                 return;
1889
1890         rtnl_lock();
1891         while ((ml = po->mclist) != NULL) {
1892                 struct net_device *dev;
1893
1894                 po->mclist = ml->next;
1895                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1896                 if (dev != NULL)
1897                         packet_dev_mc(dev, ml, -1);
1898                 kfree(ml);
1899         }
1900         rtnl_unlock();
1901 }
1902
1903 static int
1904 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1905 {
1906         struct sock *sk = sock->sk;
1907         struct packet_sock *po = pkt_sk(sk);
1908         int ret;
1909
1910         if (level != SOL_PACKET)
1911                 return -ENOPROTOOPT;
1912
1913         switch (optname) {
1914         case PACKET_ADD_MEMBERSHIP:
1915         case PACKET_DROP_MEMBERSHIP:
1916         {
1917                 struct packet_mreq_max mreq;
1918                 int len = optlen;
1919                 memset(&mreq, 0, sizeof(mreq));
1920                 if (len < sizeof(struct packet_mreq))
1921                         return -EINVAL;
1922                 if (len > sizeof(mreq))
1923                         len = sizeof(mreq);
1924                 if (copy_from_user(&mreq, optval, len))
1925                         return -EFAULT;
1926                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1927                         return -EINVAL;
1928                 if (optname == PACKET_ADD_MEMBERSHIP)
1929                         ret = packet_mc_add(sk, &mreq);
1930                 else
1931                         ret = packet_mc_drop(sk, &mreq);
1932                 return ret;
1933         }
1934
1935         case PACKET_RX_RING:
1936         case PACKET_TX_RING:
1937         {
1938                 struct tpacket_req req;
1939
1940                 if (optlen < sizeof(req))
1941                         return -EINVAL;
1942                 if (pkt_sk(sk)->has_vnet_hdr)
1943                         return -EINVAL;
1944                 if (copy_from_user(&req, optval, sizeof(req)))
1945                         return -EFAULT;
1946                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1947         }
1948         case PACKET_COPY_THRESH:
1949         {
1950                 int val;
1951
1952                 if (optlen != sizeof(val))
1953                         return -EINVAL;
1954                 if (copy_from_user(&val, optval, sizeof(val)))
1955                         return -EFAULT;
1956
1957                 pkt_sk(sk)->copy_thresh = val;
1958                 return 0;
1959         }
1960         case PACKET_VERSION:
1961         {
1962                 int val;
1963
1964                 if (optlen != sizeof(val))
1965                         return -EINVAL;
1966                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1967                         return -EBUSY;
1968                 if (copy_from_user(&val, optval, sizeof(val)))
1969                         return -EFAULT;
1970                 switch (val) {
1971                 case TPACKET_V1:
1972                 case TPACKET_V2:
1973                         po->tp_version = val;
1974                         return 0;
1975                 default:
1976                         return -EINVAL;
1977                 }
1978         }
1979         case PACKET_RESERVE:
1980         {
1981                 unsigned int val;
1982
1983                 if (optlen != sizeof(val))
1984                         return -EINVAL;
1985                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1986                         return -EBUSY;
1987                 if (copy_from_user(&val, optval, sizeof(val)))
1988                         return -EFAULT;
1989                 po->tp_reserve = val;
1990                 return 0;
1991         }
1992         case PACKET_LOSS:
1993         {
1994                 unsigned int val;
1995
1996                 if (optlen != sizeof(val))
1997                         return -EINVAL;
1998                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1999                         return -EBUSY;
2000                 if (copy_from_user(&val, optval, sizeof(val)))
2001                         return -EFAULT;
2002                 po->tp_loss = !!val;
2003                 return 0;
2004         }
2005         case PACKET_AUXDATA:
2006         {
2007                 int val;
2008
2009                 if (optlen < sizeof(val))
2010                         return -EINVAL;
2011                 if (copy_from_user(&val, optval, sizeof(val)))
2012                         return -EFAULT;
2013
2014                 po->auxdata = !!val;
2015                 return 0;
2016         }
2017         case PACKET_ORIGDEV:
2018         {
2019                 int val;
2020
2021                 if (optlen < sizeof(val))
2022                         return -EINVAL;
2023                 if (copy_from_user(&val, optval, sizeof(val)))
2024                         return -EFAULT;
2025
2026                 po->origdev = !!val;
2027                 return 0;
2028         }
2029         case PACKET_VNET_HDR:
2030         {
2031                 int val;
2032
2033                 if (sock->type != SOCK_RAW)
2034                         return -EINVAL;
2035                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2036                         return -EBUSY;
2037                 if (optlen < sizeof(val))
2038                         return -EINVAL;
2039                 if (copy_from_user(&val, optval, sizeof(val)))
2040                         return -EFAULT;
2041
2042                 po->has_vnet_hdr = !!val;
2043                 return 0;
2044         }
2045         case PACKET_TIMESTAMP:
2046         {
2047                 int val;
2048
2049                 if (optlen != sizeof(val))
2050                         return -EINVAL;
2051                 if (copy_from_user(&val, optval, sizeof(val)))
2052                         return -EFAULT;
2053
2054                 po->tp_tstamp = val;
2055                 return 0;
2056         }
2057         default:
2058                 return -ENOPROTOOPT;
2059         }
2060 }
2061
2062 static int packet_getsockopt(struct socket *sock, int level, int optname,
2063                              char __user *optval, int __user *optlen)
2064 {
2065         int len;
2066         int val;
2067         struct sock *sk = sock->sk;
2068         struct packet_sock *po = pkt_sk(sk);
2069         void *data;
2070         struct tpacket_stats st;
2071
2072         if (level != SOL_PACKET)
2073                 return -ENOPROTOOPT;
2074
2075         if (get_user(len, optlen))
2076                 return -EFAULT;
2077
2078         if (len < 0)
2079                 return -EINVAL;
2080
2081         switch (optname) {
2082         case PACKET_STATISTICS:
2083                 if (len > sizeof(struct tpacket_stats))
2084                         len = sizeof(struct tpacket_stats);
2085                 spin_lock_bh(&sk->sk_receive_queue.lock);
2086                 st = po->stats;
2087                 memset(&po->stats, 0, sizeof(st));
2088                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2089                 st.tp_packets += st.tp_drops;
2090
2091                 data = &st;
2092                 break;
2093         case PACKET_AUXDATA:
2094                 if (len > sizeof(int))
2095                         len = sizeof(int);
2096                 val = po->auxdata;
2097
2098                 data = &val;
2099                 break;
2100         case PACKET_ORIGDEV:
2101                 if (len > sizeof(int))
2102                         len = sizeof(int);
2103                 val = po->origdev;
2104
2105                 data = &val;
2106                 break;
2107         case PACKET_VNET_HDR:
2108                 if (len > sizeof(int))
2109                         len = sizeof(int);
2110                 val = po->has_vnet_hdr;
2111
2112                 data = &val;
2113                 break;
2114         case PACKET_VERSION:
2115                 if (len > sizeof(int))
2116                         len = sizeof(int);
2117                 val = po->tp_version;
2118                 data = &val;
2119                 break;
2120         case PACKET_HDRLEN:
2121                 if (len > sizeof(int))
2122                         len = sizeof(int);
2123                 if (copy_from_user(&val, optval, len))
2124                         return -EFAULT;
2125                 switch (val) {
2126                 case TPACKET_V1:
2127                         val = sizeof(struct tpacket_hdr);
2128                         break;
2129                 case TPACKET_V2:
2130                         val = sizeof(struct tpacket2_hdr);
2131                         break;
2132                 default:
2133                         return -EINVAL;
2134                 }
2135                 data = &val;
2136                 break;
2137         case PACKET_RESERVE:
2138                 if (len > sizeof(unsigned int))
2139                         len = sizeof(unsigned int);
2140                 val = po->tp_reserve;
2141                 data = &val;
2142                 break;
2143         case PACKET_LOSS:
2144                 if (len > sizeof(unsigned int))
2145                         len = sizeof(unsigned int);
2146                 val = po->tp_loss;
2147                 data = &val;
2148                 break;
2149         case PACKET_TIMESTAMP:
2150                 if (len > sizeof(int))
2151                         len = sizeof(int);
2152                 val = po->tp_tstamp;
2153                 data = &val;
2154                 break;
2155         default:
2156                 return -ENOPROTOOPT;
2157         }
2158
2159         if (put_user(len, optlen))
2160                 return -EFAULT;
2161         if (copy_to_user(optval, data, len))
2162                 return -EFAULT;
2163         return 0;
2164 }
2165
2166
2167 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2168 {
2169         struct sock *sk;
2170         struct hlist_node *node;
2171         struct net_device *dev = data;
2172         struct net *net = dev_net(dev);
2173
2174         rcu_read_lock();
2175         sk_for_each_rcu(sk, node, &net->packet.sklist) {
2176                 struct packet_sock *po = pkt_sk(sk);
2177
2178                 switch (msg) {
2179                 case NETDEV_UNREGISTER:
2180                         if (po->mclist)
2181                                 packet_dev_mclist(dev, po->mclist, -1);
2182                         /* fallthrough */
2183
2184                 case NETDEV_DOWN:
2185                         if (dev->ifindex == po->ifindex) {
2186                                 spin_lock(&po->bind_lock);
2187                                 if (po->running) {
2188                                         __dev_remove_pack(&po->prot_hook);
2189                                         __sock_put(sk);
2190                                         po->running = 0;
2191                                         sk->sk_err = ENETDOWN;
2192                                         if (!sock_flag(sk, SOCK_DEAD))
2193                                                 sk->sk_error_report(sk);
2194                                 }
2195                                 if (msg == NETDEV_UNREGISTER) {
2196                                         po->ifindex = -1;
2197                                         po->prot_hook.dev = NULL;
2198                                 }
2199                                 spin_unlock(&po->bind_lock);
2200                         }
2201                         break;
2202                 case NETDEV_UP:
2203                         if (dev->ifindex == po->ifindex) {
2204                                 spin_lock(&po->bind_lock);
2205                                 if (po->num && !po->running) {
2206                                         dev_add_pack(&po->prot_hook);
2207                                         sock_hold(sk);
2208                                         po->running = 1;
2209                                 }
2210                                 spin_unlock(&po->bind_lock);
2211                         }
2212                         break;
2213                 }
2214         }
2215         rcu_read_unlock();
2216         return NOTIFY_DONE;
2217 }
2218
2219
2220 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2221                         unsigned long arg)
2222 {
2223         struct sock *sk = sock->sk;
2224
2225         switch (cmd) {
2226         case SIOCOUTQ:
2227         {
2228                 int amount = sk_wmem_alloc_get(sk);
2229
2230                 return put_user(amount, (int __user *)arg);
2231         }
2232         case SIOCINQ:
2233         {
2234                 struct sk_buff *skb;
2235                 int amount = 0;
2236
2237                 spin_lock_bh(&sk->sk_receive_queue.lock);
2238                 skb = skb_peek(&sk->sk_receive_queue);
2239                 if (skb)
2240                         amount = skb->len;
2241                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2242                 return put_user(amount, (int __user *)arg);
2243         }
2244         case SIOCGSTAMP:
2245                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2246         case SIOCGSTAMPNS:
2247                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2248
2249 #ifdef CONFIG_INET
2250         case SIOCADDRT:
2251         case SIOCDELRT:
2252         case SIOCDARP:
2253         case SIOCGARP:
2254         case SIOCSARP:
2255         case SIOCGIFADDR:
2256         case SIOCSIFADDR:
2257         case SIOCGIFBRDADDR:
2258         case SIOCSIFBRDADDR:
2259         case SIOCGIFNETMASK:
2260         case SIOCSIFNETMASK:
2261         case SIOCGIFDSTADDR:
2262         case SIOCSIFDSTADDR:
2263         case SIOCSIFFLAGS:
2264                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2265 #endif
2266
2267         default:
2268                 return -ENOIOCTLCMD;
2269         }
2270         return 0;
2271 }
2272
2273 static unsigned int packet_poll(struct file *file, struct socket *sock,
2274                                 poll_table *wait)
2275 {
2276         struct sock *sk = sock->sk;
2277         struct packet_sock *po = pkt_sk(sk);
2278         unsigned int mask = datagram_poll(file, sock, wait);
2279
2280         spin_lock_bh(&sk->sk_receive_queue.lock);
2281         if (po->rx_ring.pg_vec) {
2282                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2283                         mask |= POLLIN | POLLRDNORM;
2284         }
2285         spin_unlock_bh(&sk->sk_receive_queue.lock);
2286         spin_lock_bh(&sk->sk_write_queue.lock);
2287         if (po->tx_ring.pg_vec) {
2288                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2289                         mask |= POLLOUT | POLLWRNORM;
2290         }
2291         spin_unlock_bh(&sk->sk_write_queue.lock);
2292         return mask;
2293 }
2294
2295
/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */
2299
2300 static void packet_mm_open(struct vm_area_struct *vma)
2301 {
2302         struct file *file = vma->vm_file;
2303         struct socket *sock = file->private_data;
2304         struct sock *sk = sock->sk;
2305
2306         if (sk)
2307                 atomic_inc(&pkt_sk(sk)->mapped);
2308 }
2309
2310 static void packet_mm_close(struct vm_area_struct *vma)
2311 {
2312         struct file *file = vma->vm_file;
2313         struct socket *sock = file->private_data;
2314         struct sock *sk = sock->sk;
2315
2316         if (sk)
2317                 atomic_dec(&pkt_sk(sk)->mapped);
2318 }
2319
2320 static const struct vm_operations_struct packet_mmap_ops = {
2321         .open   =       packet_mm_open,
2322         .close  =       packet_mm_close,
2323 };
2324
/*
 * Release a block vector: every non-NULL block of 2^order pages, then
 * the vector itself.
 *
 * @pg_vec: vector of block addresses
 * @order:  page order each block was allocated with
 * @len:    number of slots in @pg_vec
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	char **slot = pg_vec;
	unsigned int left;

	for (left = len; left; left--, slot++) {
		/* Slots may be empty after a partially failed allocation. */
		if (unlikely(!*slot))
			continue;
		free_pages((unsigned long) *slot, order);
	}
	kfree(pg_vec);
}
2335
2336 static inline char *alloc_one_pg_vec_page(unsigned long order)
2337 {
2338         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2339
2340         return (char *) __get_free_pages(gfp_flags, order);
2341 }
2342
2343 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2344 {
2345         unsigned int block_nr = req->tp_block_nr;
2346         char **pg_vec;
2347         int i;
2348
2349         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2350         if (unlikely(!pg_vec))
2351                 goto out;
2352
2353         for (i = 0; i < block_nr; i++) {
2354                 pg_vec[i] = alloc_one_pg_vec_page(order);
2355                 if (unlikely(!pg_vec[i]))
2356                         goto out_free_pgvec;
2357         }
2358
2359 out:
2360         return pg_vec;
2361
2362 out_free_pgvec:
2363         free_pg_vec(pg_vec, order, block_nr);
2364         pg_vec = NULL;
2365         goto out;
2366 }
2367
2368 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2369                 int closing, int tx_ring)
2370 {
2371         char **pg_vec = NULL;
2372         struct packet_sock *po = pkt_sk(sk);
2373         int was_running, order = 0;
2374         struct packet_ring_buffer *rb;
2375         struct sk_buff_head *rb_queue;
2376         __be16 num;
2377         int err;
2378
2379         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2380         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2381
2382         err = -EBUSY;
2383         if (!closing) {
2384                 if (atomic_read(&po->mapped))
2385                         goto out;
2386                 if (atomic_read(&rb->pending))
2387                         goto out;
2388         }
2389
2390         if (req->tp_block_nr) {
2391                 /* Sanity tests and some calculations */
2392                 err = -EBUSY;
2393                 if (unlikely(rb->pg_vec))
2394                         goto out;
2395
2396                 switch (po->tp_version) {
2397                 case TPACKET_V1:
2398                         po->tp_hdrlen = TPACKET_HDRLEN;
2399                         break;
2400                 case TPACKET_V2:
2401                         po->tp_hdrlen = TPACKET2_HDRLEN;
2402                         break;
2403                 }
2404
2405                 err = -EINVAL;
2406                 if (unlikely((int)req->tp_block_size <= 0))
2407                         goto out;
2408                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2409                         goto out;
2410                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2411                                         po->tp_reserve))
2412                         goto out;
2413                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2414                         goto out;
2415
2416                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2417                 if (unlikely(rb->frames_per_block <= 0))
2418                         goto out;
2419                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2420                                         req->tp_frame_nr))
2421                         goto out;
2422
2423                 err = -ENOMEM;
2424                 order = get_order(req->tp_block_size);
2425                 pg_vec = alloc_pg_vec(req, order);
2426                 if (unlikely(!pg_vec))
2427                         goto out;
2428         }
2429         /* Done */
2430         else {
2431                 err = -EINVAL;
2432                 if (unlikely(req->tp_frame_nr))
2433                         goto out;
2434         }
2435
2436         lock_sock(sk);
2437
2438         /* Detach socket from network */
2439         spin_lock(&po->bind_lock);
2440         was_running = po->running;
2441         num = po->num;
2442         if (was_running) {
2443                 __dev_remove_pack(&po->prot_hook);
2444                 po->num = 0;
2445                 po->running = 0;
2446                 __sock_put(sk);
2447         }
2448         spin_unlock(&po->bind_lock);
2449
2450         synchronize_net();
2451
2452         err = -EBUSY;
2453         mutex_lock(&po->pg_vec_lock);
2454         if (closing || atomic_read(&po->mapped) == 0) {
2455                 err = 0;
2456 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2457                 spin_lock_bh(&rb_queue->lock);
2458                 pg_vec = XC(rb->pg_vec, pg_vec);
2459                 rb->frame_max = (req->tp_frame_nr - 1);
2460                 rb->head = 0;
2461                 rb->frame_size = req->tp_frame_size;
2462                 spin_unlock_bh(&rb_queue->lock);
2463
2464                 order = XC(rb->pg_vec_order, order);
2465                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2466
2467                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2468                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2469                                                 tpacket_rcv : packet_rcv;
2470                 skb_queue_purge(rb_queue);
2471 #undef XC
2472                 if (atomic_read(&po->mapped))
2473                         pr_err("packet_mmap: vma is busy: %d\n",
2474                                atomic_read(&po->mapped));
2475         }
2476         mutex_unlock(&po->pg_vec_lock);
2477
2478         spin_lock(&po->bind_lock);
2479         if (was_running && !po->running) {
2480                 sock_hold(sk);
2481                 po->running = 1;
2482                 po->num = num;
2483                 dev_add_pack(&po->prot_hook);
2484         }
2485         spin_unlock(&po->bind_lock);
2486
2487         release_sock(sk);
2488
2489         if (pg_vec)
2490                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2491 out:
2492         return err;
2493 }
2494
2495 static int packet_mmap(struct file *file, struct socket *sock,
2496                 struct vm_area_struct *vma)
2497 {
2498         struct sock *sk = sock->sk;
2499         struct packet_sock *po = pkt_sk(sk);
2500         unsigned long size, expected_size;
2501         struct packet_ring_buffer *rb;
2502         unsigned long start;
2503         int err = -EINVAL;
2504         int i;
2505
2506         if (vma->vm_pgoff)
2507                 return -EINVAL;
2508
2509         mutex_lock(&po->pg_vec_lock);
2510
2511         expected_size = 0;
2512         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2513                 if (rb->pg_vec) {
2514                         expected_size += rb->pg_vec_len
2515                                                 * rb->pg_vec_pages
2516                                                 * PAGE_SIZE;
2517                 }
2518         }
2519
2520         if (expected_size == 0)
2521                 goto out;
2522
2523         size = vma->vm_end - vma->vm_start;
2524         if (size != expected_size)
2525                 goto out;
2526
2527         start = vma->vm_start;
2528         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2529                 if (rb->pg_vec == NULL)
2530                         continue;
2531
2532                 for (i = 0; i < rb->pg_vec_len; i++) {
2533                         struct page *page = virt_to_page(rb->pg_vec[i]);
2534                         int pg_num;
2535
2536                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2537                                         pg_num++, page++) {
2538                                 err = vm_insert_page(vma, start, page);
2539                                 if (unlikely(err))
2540                                         goto out;
2541                                 start += PAGE_SIZE;
2542                         }
2543                 }
2544         }
2545
2546         atomic_inc(&po->mapped);
2547         vma->vm_ops = &packet_mmap_ops;
2548         err = 0;
2549
2550 out:
2551         mutex_unlock(&po->pg_vec_lock);
2552         return err;
2553 }
2554
2555 static const struct proto_ops packet_ops_spkt = {
2556         .family =       PF_PACKET,
2557         .owner =        THIS_MODULE,
2558         .release =      packet_release,
2559         .bind =         packet_bind_spkt,
2560         .connect =      sock_no_connect,
2561         .socketpair =   sock_no_socketpair,
2562         .accept =       sock_no_accept,
2563         .getname =      packet_getname_spkt,
2564         .poll =         datagram_poll,
2565         .ioctl =        packet_ioctl,
2566         .listen =       sock_no_listen,
2567         .shutdown =     sock_no_shutdown,
2568         .setsockopt =   sock_no_setsockopt,
2569         .getsockopt =   sock_no_getsockopt,
2570         .sendmsg =      packet_sendmsg_spkt,
2571         .recvmsg =      packet_recvmsg,
2572         .mmap =         sock_no_mmap,
2573         .sendpage =     sock_no_sendpage,
2574 };
2575
2576 static const struct proto_ops packet_ops = {
2577         .family =       PF_PACKET,
2578         .owner =        THIS_MODULE,
2579         .release =      packet_release,
2580         .bind =         packet_bind,
2581         .connect =      sock_no_connect,
2582         .socketpair =   sock_no_socketpair,
2583         .accept =       sock_no_accept,
2584         .getname =      packet_getname,
2585         .poll =         packet_poll,
2586         .ioctl =        packet_ioctl,
2587         .listen =       sock_no_listen,
2588         .shutdown =     sock_no_shutdown,
2589         .setsockopt =   packet_setsockopt,
2590         .getsockopt =   packet_getsockopt,
2591         .sendmsg =      packet_sendmsg,
2592         .recvmsg =      packet_recvmsg,
2593         .mmap =         packet_mmap,
2594         .sendpage =     sock_no_sendpage,
2595 };
2596
2597 static const struct net_proto_family packet_family_ops = {
2598         .family =       PF_PACKET,
2599         .create =       packet_create,
2600         .owner  =       THIS_MODULE,
2601 };
2602
2603 static struct notifier_block packet_netdev_notifier = {
2604         .notifier_call =        packet_notifier,
2605 };
2606
2607 #ifdef CONFIG_PROC_FS
2608
2609 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2610         __acquires(RCU)
2611 {
2612         struct net *net = seq_file_net(seq);
2613
2614         rcu_read_lock();
2615         return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2616 }
2617
2618 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2619 {
2620         struct net *net = seq_file_net(seq);
2621         return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2622 }
2623
2624 static void packet_seq_stop(struct seq_file *seq, void *v)
2625         __releases(RCU)
2626 {
2627         rcu_read_unlock();
2628 }
2629
2630 static int packet_seq_show(struct seq_file *seq, void *v)
2631 {
2632         if (v == SEQ_START_TOKEN)
2633                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2634         else {
2635                 struct sock *s = sk_entry(v);
2636                 const struct packet_sock *po = pkt_sk(s);
2637
2638                 seq_printf(seq,
2639                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2640                            s,
2641                            atomic_read(&s->sk_refcnt),
2642                            s->sk_type,
2643                            ntohs(po->num),
2644                            po->ifindex,
2645                            po->running,
2646                            atomic_read(&s->sk_rmem_alloc),
2647                            sock_i_uid(s),
2648                            sock_i_ino(s));
2649         }
2650
2651         return 0;
2652 }
2653
/*
 * seq_file iterator for the per-namespace packet socket list; passed to
 * seq_open_net() by packet_seq_open().
 */
static const struct seq_operations packet_seq_ops = {
        .start  = packet_seq_start,
        .next   = packet_seq_next,
        .stop   = packet_seq_stop,
        .show   = packet_seq_show,
};
2660
2661 static int packet_seq_open(struct inode *inode, struct file *file)
2662 {
2663         return seq_open_net(inode, file, &packet_seq_ops,
2664                             sizeof(struct seq_net_private));
2665 }
2666
/*
 * File operations for the "packet" proc entry created in packet_net_init():
 * open goes through packet_seq_open(), the rest are the generic seq_file
 * helpers, with seq_release_net() as the release hook.
 */
static const struct file_operations packet_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = packet_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};
2674
2675 #endif
2676
2677 static int __net_init packet_net_init(struct net *net)
2678 {
2679         spin_lock_init(&net->packet.sklist_lock);
2680         INIT_HLIST_HEAD(&net->packet.sklist);
2681
2682         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2683                 return -ENOMEM;
2684
2685         return 0;
2686 }
2687
/*
 * Per-namespace teardown: remove the "packet" proc entry that
 * packet_net_init() created for this namespace.
 */
static void __net_exit packet_net_exit(struct net *net)
{
        proc_net_remove(net, "packet");
}
2692
/*
 * Per-network-namespace hooks, registered with register_pernet_subsys()
 * in packet_init().
 */
static struct pernet_operations packet_net_ops = {
        .init = packet_net_init,
        .exit = packet_net_exit,
};
2697
2698
/*
 * Module unload: undo everything packet_init() registered, in the reverse
 * order of registration (netdevice notifier, per-net ops, socket family,
 * then the proto itself).
 */
static void __exit packet_exit(void)
{
        unregister_netdevice_notifier(&packet_netdev_notifier);
        unregister_pernet_subsys(&packet_net_ops);
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
}
2706
2707 static int __init packet_init(void)
2708 {
2709         int rc = proto_register(&packet_proto, 0);
2710
2711         if (rc != 0)
2712                 goto out;
2713
2714         sock_register(&packet_family_ops);
2715         register_pernet_subsys(&packet_net_ops);
2716         register_netdevice_notifier(&packet_netdev_notifier);
2717 out:
2718         return rc;
2719 }
2720
/*
 * Module boilerplate: packet_init()/packet_exit() are the load and unload
 * entry points; the module is GPL-licensed and carries a network-protocol
 * alias for PF_PACKET.
 */
module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);