Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[linux-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <linux/slab.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82 #include <linux/mutex.h>
83 #include <linux/if_vlan.h>
84 #include <linux/virtio_net.h>
85
86 #ifdef CONFIG_INET
87 #include <net/inet_common.h>
88 #endif
89
90 /*
91    Assumptions:
92    - if device has no dev->hard_header routine, it adds and removes ll header
93      inside itself. In this case ll header is invisible outside of device,
94      but higher levels still should reserve dev->hard_header_len.
95      Some devices are clever enough to reallocate the skb when the header
96      will not fit into the reserved space (tunnel), other ones are silly
97      (PPP).
98    - packet socket receives packets with pulled ll header,
99      so that SOCK_RAW should push it back.
100
101 On receive:
102 -----------
103
104 Incoming, dev->hard_header!=NULL
105    mac_header -> ll header
106    data       -> data
107
108 Outgoing, dev->hard_header!=NULL
109    mac_header -> ll header
110    data       -> ll header
111
112 Incoming, dev->hard_header==NULL
113    mac_header -> UNKNOWN position. It is very likely that it points to the ll
114                  header.  PPP does this, which is wrong, because it introduces
115                  asymmetry between the rx and tx paths.
116    data       -> data
117
118 Outgoing, dev->hard_header==NULL
119    mac_header -> data. ll header is still not built!
120    data       -> data
121
122 Resume
123   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
124
125
126 On transmit:
127 ------------
128
129 dev->hard_header != NULL
130    mac_header -> ll header
131    data       -> ll header
132
133 dev->hard_header == NULL (ll header is added by device, we cannot control it)
134    mac_header -> data
135    data       -> data
136
137    We should set nh.raw on output to the correct position,
138    the packet classifier depends on it.
139  */
140
141 /* Private packet socket structures. */
142
143 struct packet_mclist {
144         struct packet_mclist    *next;
145         int                     ifindex;
146         int                     count;
147         unsigned short          type;
148         unsigned short          alen;
149         unsigned char           addr[MAX_ADDR_LEN];
150 };
151 /* identical to struct packet_mreq except it has
152  * a longer address field.
153  */
154 struct packet_mreq_max {
155         int             mr_ifindex;
156         unsigned short  mr_type;
157         unsigned short  mr_alen;
158         unsigned char   mr_address[MAX_ADDR_LEN];
159 };
160
161 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
162                 int closing, int tx_ring);
163
164 struct packet_ring_buffer {
165         char                    **pg_vec;
166         unsigned int            head;
167         unsigned int            frames_per_block;
168         unsigned int            frame_size;
169         unsigned int            frame_max;
170
171         unsigned int            pg_vec_order;
172         unsigned int            pg_vec_pages;
173         unsigned int            pg_vec_len;
174
175         atomic_t                pending;
176 };
177
178 struct packet_sock;
179 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
180
181 static void packet_flush_mclist(struct sock *sk);
182
183 struct packet_sock {
184         /* struct sock has to be the first member of packet_sock */
185         struct sock             sk;
186         struct tpacket_stats    stats;
187         struct packet_ring_buffer       rx_ring;
188         struct packet_ring_buffer       tx_ring;
189         int                     copy_thresh;
190         spinlock_t              bind_lock;
191         struct mutex            pg_vec_lock;
192         unsigned int            running:1,      /* prot_hook is attached*/
193                                 auxdata:1,
194                                 origdev:1,
195                                 has_vnet_hdr:1;
196         int                     ifindex;        /* bound device         */
197         __be16                  num;
198         struct packet_mclist    *mclist;
199         atomic_t                mapped;
200         enum tpacket_versions   tp_version;
201         unsigned int            tp_hdrlen;
202         unsigned int            tp_reserve;
203         unsigned int            tp_loss:1;
204         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
205 };
206
207 struct packet_skb_cb {
208         unsigned int origlen;
209         union {
210                 struct sockaddr_pkt pkt;
211                 struct sockaddr_ll ll;
212         } sa;
213 };
214
215 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
216
217 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
218 {
219         union {
220                 struct tpacket_hdr *h1;
221                 struct tpacket2_hdr *h2;
222                 void *raw;
223         } h;
224
225         h.raw = frame;
226         switch (po->tp_version) {
227         case TPACKET_V1:
228                 h.h1->tp_status = status;
229                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
230                 break;
231         case TPACKET_V2:
232                 h.h2->tp_status = status;
233                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
234                 break;
235         default:
236                 pr_err("TPACKET version not supported\n");
237                 BUG();
238         }
239
240         smp_wmb();
241 }
242
243 static int __packet_get_status(struct packet_sock *po, void *frame)
244 {
245         union {
246                 struct tpacket_hdr *h1;
247                 struct tpacket2_hdr *h2;
248                 void *raw;
249         } h;
250
251         smp_rmb();
252
253         h.raw = frame;
254         switch (po->tp_version) {
255         case TPACKET_V1:
256                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
257                 return h.h1->tp_status;
258         case TPACKET_V2:
259                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
260                 return h.h2->tp_status;
261         default:
262                 pr_err("TPACKET version not supported\n");
263                 BUG();
264                 return 0;
265         }
266 }
267
268 static void *packet_lookup_frame(struct packet_sock *po,
269                 struct packet_ring_buffer *rb,
270                 unsigned int position,
271                 int status)
272 {
273         unsigned int pg_vec_pos, frame_offset;
274         union {
275                 struct tpacket_hdr *h1;
276                 struct tpacket2_hdr *h2;
277                 void *raw;
278         } h;
279
280         pg_vec_pos = position / rb->frames_per_block;
281         frame_offset = position % rb->frames_per_block;
282
283         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
284
285         if (status != __packet_get_status(po, h.raw))
286                 return NULL;
287
288         return h.raw;
289 }
290
291 static inline void *packet_current_frame(struct packet_sock *po,
292                 struct packet_ring_buffer *rb,
293                 int status)
294 {
295         return packet_lookup_frame(po, rb, rb->head, status);
296 }
297
298 static inline void *packet_previous_frame(struct packet_sock *po,
299                 struct packet_ring_buffer *rb,
300                 int status)
301 {
302         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
303         return packet_lookup_frame(po, rb, previous, status);
304 }
305
306 static inline void packet_increment_head(struct packet_ring_buffer *buff)
307 {
308         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
309 }
310
/*
 * Convert a struct sock pointer to its enclosing packet_sock.  This is a
 * plain cast, valid because sk is the first member of struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
315
316 static void packet_sock_destruct(struct sock *sk)
317 {
318         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
319         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
320
321         if (!sock_flag(sk, SOCK_DEAD)) {
322                 pr_err("Attempt to release alive packet socket: %p\n", sk);
323                 return;
324         }
325
326         sk_refcnt_debug_dec(sk);
327 }
328
329
330 static const struct proto_ops packet_ops;
331
332 static const struct proto_ops packet_ops_spkt;
333
334 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
335                            struct packet_type *pt, struct net_device *orig_dev)
336 {
337         struct sock *sk;
338         struct sockaddr_pkt *spkt;
339
340         /*
341          *      When we registered the protocol we saved the socket in the data
342          *      field for just this event.
343          */
344
345         sk = pt->af_packet_priv;
346
347         /*
348          *      Yank back the headers [hope the device set this
349          *      right or kerboom...]
350          *
351          *      Incoming packets have ll header pulled,
352          *      push it back.
353          *
354          *      For outgoing ones skb->data == skb_mac_header(skb)
355          *      so that this procedure is noop.
356          */
357
358         if (skb->pkt_type == PACKET_LOOPBACK)
359                 goto out;
360
361         if (!net_eq(dev_net(dev), sock_net(sk)))
362                 goto out;
363
364         skb = skb_share_check(skb, GFP_ATOMIC);
365         if (skb == NULL)
366                 goto oom;
367
368         /* drop any routing info */
369         skb_dst_drop(skb);
370
371         /* drop conntrack reference */
372         nf_reset(skb);
373
374         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
375
376         skb_push(skb, skb->data - skb_mac_header(skb));
377
378         /*
379          *      The SOCK_PACKET socket receives _all_ frames.
380          */
381
382         spkt->spkt_family = dev->type;
383         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
384         spkt->spkt_protocol = skb->protocol;
385
386         /*
387          *      Charge the memory to the socket. This is done specifically
388          *      to prevent sockets using all the memory up.
389          */
390
391         if (sock_queue_rcv_skb(sk, skb) == 0)
392                 return 0;
393
394 out:
395         kfree_skb(skb);
396 oom:
397         return 0;
398 }
399
400
401 /*
402  *      Output a raw packet to a device layer. This bypasses all the other
403  *      protocol layers and you must therefore supply it with a complete frame
404  */
405
406 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
407                                struct msghdr *msg, size_t len)
408 {
409         struct sock *sk = sock->sk;
410         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
411         struct sk_buff *skb = NULL;
412         struct net_device *dev;
413         __be16 proto = 0;
414         int err;
415
416         /*
417          *      Get and verify the address.
418          */
419
420         if (saddr) {
421                 if (msg->msg_namelen < sizeof(struct sockaddr))
422                         return -EINVAL;
423                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
424                         proto = saddr->spkt_protocol;
425         } else
426                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
427
428         /*
429          *      Find the device first to size check it
430          */
431
432         saddr->spkt_device[13] = 0;
433 retry:
434         rcu_read_lock();
435         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
436         err = -ENODEV;
437         if (dev == NULL)
438                 goto out_unlock;
439
440         err = -ENETDOWN;
441         if (!(dev->flags & IFF_UP))
442                 goto out_unlock;
443
444         /*
445          * You may not queue a frame bigger than the mtu. This is the lowest level
446          * raw protocol and you must do your own fragmentation at this level.
447          */
448
449         err = -EMSGSIZE;
450         if (len > dev->mtu + dev->hard_header_len)
451                 goto out_unlock;
452
453         if (!skb) {
454                 size_t reserved = LL_RESERVED_SPACE(dev);
455                 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
456
457                 rcu_read_unlock();
458                 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
459                 if (skb == NULL)
460                         return -ENOBUFS;
461                 /* FIXME: Save some space for broken drivers that write a hard
462                  * header at transmission time by themselves. PPP is the notable
463                  * one here. This should really be fixed at the driver level.
464                  */
465                 skb_reserve(skb, reserved);
466                 skb_reset_network_header(skb);
467
468                 /* Try to align data part correctly */
469                 if (hhlen) {
470                         skb->data -= hhlen;
471                         skb->tail -= hhlen;
472                         if (len < hhlen)
473                                 skb_reset_network_header(skb);
474                 }
475                 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
476                 if (err)
477                         goto out_free;
478                 goto retry;
479         }
480
481
482         skb->protocol = proto;
483         skb->dev = dev;
484         skb->priority = sk->sk_priority;
485         skb->mark = sk->sk_mark;
486
487         dev_queue_xmit(skb);
488         rcu_read_unlock();
489         return len;
490
491 out_unlock:
492         rcu_read_unlock();
493 out_free:
494         kfree_skb(skb);
495         return err;
496 }
497
498 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
499                                       unsigned int res)
500 {
501         struct sk_filter *filter;
502
503         rcu_read_lock_bh();
504         filter = rcu_dereference_bh(sk->sk_filter);
505         if (filter != NULL)
506                 res = sk_run_filter(skb, filter->insns, filter->len);
507         rcu_read_unlock_bh();
508
509         return res;
510 }
511
512 /*
513    This function makes lazy skb cloning in hope that most of packets
514    are discarded by BPF.
515
516    Note tricky part: we DO mangle shared skb! skb->data, skb->len
517    and skb->cb are mangled. It works because (and until) packets
518    falling here are owned by current CPU. Output packets are cloned
519    by dev_queue_xmit_nit(), input packets are processed by net_bh
520    sequencially, so that if we return skb to original state on exit,
521    we will not harm anyone.
522  */
523
524 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
525                       struct packet_type *pt, struct net_device *orig_dev)
526 {
527         struct sock *sk;
528         struct sockaddr_ll *sll;
529         struct packet_sock *po;
530         u8 *skb_head = skb->data;
531         int skb_len = skb->len;
532         unsigned int snaplen, res;
533
534         if (skb->pkt_type == PACKET_LOOPBACK)
535                 goto drop;
536
537         sk = pt->af_packet_priv;
538         po = pkt_sk(sk);
539
540         if (!net_eq(dev_net(dev), sock_net(sk)))
541                 goto drop;
542
543         skb->dev = dev;
544
545         if (dev->header_ops) {
546                 /* The device has an explicit notion of ll header,
547                    exported to higher levels.
548
549                    Otherwise, the device hides datails of it frame
550                    structure, so that corresponding packet head
551                    never delivered to user.
552                  */
553                 if (sk->sk_type != SOCK_DGRAM)
554                         skb_push(skb, skb->data - skb_mac_header(skb));
555                 else if (skb->pkt_type == PACKET_OUTGOING) {
556                         /* Special case: outgoing packets have ll header at head */
557                         skb_pull(skb, skb_network_offset(skb));
558                 }
559         }
560
561         snaplen = skb->len;
562
563         res = run_filter(skb, sk, snaplen);
564         if (!res)
565                 goto drop_n_restore;
566         if (snaplen > res)
567                 snaplen = res;
568
569         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
570             (unsigned)sk->sk_rcvbuf)
571                 goto drop_n_acct;
572
573         if (skb_shared(skb)) {
574                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
575                 if (nskb == NULL)
576                         goto drop_n_acct;
577
578                 if (skb_head != skb->data) {
579                         skb->data = skb_head;
580                         skb->len = skb_len;
581                 }
582                 kfree_skb(skb);
583                 skb = nskb;
584         }
585
586         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
587                      sizeof(skb->cb));
588
589         sll = &PACKET_SKB_CB(skb)->sa.ll;
590         sll->sll_family = AF_PACKET;
591         sll->sll_hatype = dev->type;
592         sll->sll_protocol = skb->protocol;
593         sll->sll_pkttype = skb->pkt_type;
594         if (unlikely(po->origdev))
595                 sll->sll_ifindex = orig_dev->ifindex;
596         else
597                 sll->sll_ifindex = dev->ifindex;
598
599         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
600
601         PACKET_SKB_CB(skb)->origlen = skb->len;
602
603         if (pskb_trim(skb, snaplen))
604                 goto drop_n_acct;
605
606         skb_set_owner_r(skb, sk);
607         skb->dev = NULL;
608         skb_dst_drop(skb);
609
610         /* drop conntrack reference */
611         nf_reset(skb);
612
613         spin_lock(&sk->sk_receive_queue.lock);
614         po->stats.tp_packets++;
615         skb->dropcount = atomic_read(&sk->sk_drops);
616         __skb_queue_tail(&sk->sk_receive_queue, skb);
617         spin_unlock(&sk->sk_receive_queue.lock);
618         sk->sk_data_ready(sk, skb->len);
619         return 0;
620
621 drop_n_acct:
622         po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
623
624 drop_n_restore:
625         if (skb_head != skb->data && skb_shared(skb)) {
626                 skb->data = skb_head;
627                 skb->len = skb_len;
628         }
629 drop:
630         consume_skb(skb);
631         return 0;
632 }
633
634 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
635                        struct packet_type *pt, struct net_device *orig_dev)
636 {
637         struct sock *sk;
638         struct packet_sock *po;
639         struct sockaddr_ll *sll;
640         union {
641                 struct tpacket_hdr *h1;
642                 struct tpacket2_hdr *h2;
643                 void *raw;
644         } h;
645         u8 *skb_head = skb->data;
646         int skb_len = skb->len;
647         unsigned int snaplen, res;
648         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
649         unsigned short macoff, netoff, hdrlen;
650         struct sk_buff *copy_skb = NULL;
651         struct timeval tv;
652         struct timespec ts;
653
654         if (skb->pkt_type == PACKET_LOOPBACK)
655                 goto drop;
656
657         sk = pt->af_packet_priv;
658         po = pkt_sk(sk);
659
660         if (!net_eq(dev_net(dev), sock_net(sk)))
661                 goto drop;
662
663         if (dev->header_ops) {
664                 if (sk->sk_type != SOCK_DGRAM)
665                         skb_push(skb, skb->data - skb_mac_header(skb));
666                 else if (skb->pkt_type == PACKET_OUTGOING) {
667                         /* Special case: outgoing packets have ll header at head */
668                         skb_pull(skb, skb_network_offset(skb));
669                 }
670         }
671
672         if (skb->ip_summed == CHECKSUM_PARTIAL)
673                 status |= TP_STATUS_CSUMNOTREADY;
674
675         snaplen = skb->len;
676
677         res = run_filter(skb, sk, snaplen);
678         if (!res)
679                 goto drop_n_restore;
680         if (snaplen > res)
681                 snaplen = res;
682
683         if (sk->sk_type == SOCK_DGRAM) {
684                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
685                                   po->tp_reserve;
686         } else {
687                 unsigned maclen = skb_network_offset(skb);
688                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
689                                        (maclen < 16 ? 16 : maclen)) +
690                         po->tp_reserve;
691                 macoff = netoff - maclen;
692         }
693
694         if (macoff + snaplen > po->rx_ring.frame_size) {
695                 if (po->copy_thresh &&
696                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
697                     (unsigned)sk->sk_rcvbuf) {
698                         if (skb_shared(skb)) {
699                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
700                         } else {
701                                 copy_skb = skb_get(skb);
702                                 skb_head = skb->data;
703                         }
704                         if (copy_skb)
705                                 skb_set_owner_r(copy_skb, sk);
706                 }
707                 snaplen = po->rx_ring.frame_size - macoff;
708                 if ((int)snaplen < 0)
709                         snaplen = 0;
710         }
711
712         spin_lock(&sk->sk_receive_queue.lock);
713         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
714         if (!h.raw)
715                 goto ring_is_full;
716         packet_increment_head(&po->rx_ring);
717         po->stats.tp_packets++;
718         if (copy_skb) {
719                 status |= TP_STATUS_COPY;
720                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
721         }
722         if (!po->stats.tp_drops)
723                 status &= ~TP_STATUS_LOSING;
724         spin_unlock(&sk->sk_receive_queue.lock);
725
726         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
727
728         switch (po->tp_version) {
729         case TPACKET_V1:
730                 h.h1->tp_len = skb->len;
731                 h.h1->tp_snaplen = snaplen;
732                 h.h1->tp_mac = macoff;
733                 h.h1->tp_net = netoff;
734                 if (skb->tstamp.tv64)
735                         tv = ktime_to_timeval(skb->tstamp);
736                 else
737                         do_gettimeofday(&tv);
738                 h.h1->tp_sec = tv.tv_sec;
739                 h.h1->tp_usec = tv.tv_usec;
740                 hdrlen = sizeof(*h.h1);
741                 break;
742         case TPACKET_V2:
743                 h.h2->tp_len = skb->len;
744                 h.h2->tp_snaplen = snaplen;
745                 h.h2->tp_mac = macoff;
746                 h.h2->tp_net = netoff;
747                 if (skb->tstamp.tv64)
748                         ts = ktime_to_timespec(skb->tstamp);
749                 else
750                         getnstimeofday(&ts);
751                 h.h2->tp_sec = ts.tv_sec;
752                 h.h2->tp_nsec = ts.tv_nsec;
753                 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
754                 hdrlen = sizeof(*h.h2);
755                 break;
756         default:
757                 BUG();
758         }
759
760         sll = h.raw + TPACKET_ALIGN(hdrlen);
761         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
762         sll->sll_family = AF_PACKET;
763         sll->sll_hatype = dev->type;
764         sll->sll_protocol = skb->protocol;
765         sll->sll_pkttype = skb->pkt_type;
766         if (unlikely(po->origdev))
767                 sll->sll_ifindex = orig_dev->ifindex;
768         else
769                 sll->sll_ifindex = dev->ifindex;
770
771         __packet_set_status(po, h.raw, status);
772         smp_mb();
773         {
774                 struct page *p_start, *p_end;
775                 u8 *h_end = h.raw + macoff + snaplen - 1;
776
777                 p_start = virt_to_page(h.raw);
778                 p_end = virt_to_page(h_end);
779                 while (p_start <= p_end) {
780                         flush_dcache_page(p_start);
781                         p_start++;
782                 }
783         }
784
785         sk->sk_data_ready(sk, 0);
786
787 drop_n_restore:
788         if (skb_head != skb->data && skb_shared(skb)) {
789                 skb->data = skb_head;
790                 skb->len = skb_len;
791         }
792 drop:
793         kfree_skb(skb);
794         return 0;
795
796 ring_is_full:
797         po->stats.tp_drops++;
798         spin_unlock(&sk->sk_receive_queue.lock);
799
800         sk->sk_data_ready(sk, 0);
801         kfree_skb(copy_skb);
802         goto drop_n_restore;
803 }
804
805 static void tpacket_destruct_skb(struct sk_buff *skb)
806 {
807         struct packet_sock *po = pkt_sk(skb->sk);
808         void *ph;
809
810         BUG_ON(skb == NULL);
811
812         if (likely(po->tx_ring.pg_vec)) {
813                 ph = skb_shinfo(skb)->destructor_arg;
814                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
815                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
816                 atomic_dec(&po->tx_ring.pending);
817                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
818         }
819
820         sock_wfree(skb);
821 }
822
823 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
824                 void *frame, struct net_device *dev, int size_max,
825                 __be16 proto, unsigned char *addr)
826 {
827         union {
828                 struct tpacket_hdr *h1;
829                 struct tpacket2_hdr *h2;
830                 void *raw;
831         } ph;
832         int to_write, offset, len, tp_len, nr_frags, len_max;
833         struct socket *sock = po->sk.sk_socket;
834         struct page *page;
835         void *data;
836         int err;
837
838         ph.raw = frame;
839
840         skb->protocol = proto;
841         skb->dev = dev;
842         skb->priority = po->sk.sk_priority;
843         skb->mark = po->sk.sk_mark;
844         skb_shinfo(skb)->destructor_arg = ph.raw;
845
846         switch (po->tp_version) {
847         case TPACKET_V2:
848                 tp_len = ph.h2->tp_len;
849                 break;
850         default:
851                 tp_len = ph.h1->tp_len;
852                 break;
853         }
854         if (unlikely(tp_len > size_max)) {
855                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
856                 return -EMSGSIZE;
857         }
858
859         skb_reserve(skb, LL_RESERVED_SPACE(dev));
860         skb_reset_network_header(skb);
861
862         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
863         to_write = tp_len;
864
865         if (sock->type == SOCK_DGRAM) {
866                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
867                                 NULL, tp_len);
868                 if (unlikely(err < 0))
869                         return -EINVAL;
870         } else if (dev->hard_header_len) {
871                 /* net device doesn't like empty head */
872                 if (unlikely(tp_len <= dev->hard_header_len)) {
873                         pr_err("packet size is too short (%d < %d)\n",
874                                tp_len, dev->hard_header_len);
875                         return -EINVAL;
876                 }
877
878                 skb_push(skb, dev->hard_header_len);
879                 err = skb_store_bits(skb, 0, data,
880                                 dev->hard_header_len);
881                 if (unlikely(err))
882                         return err;
883
884                 data += dev->hard_header_len;
885                 to_write -= dev->hard_header_len;
886         }
887
888         err = -EFAULT;
889         page = virt_to_page(data);
890         offset = offset_in_page(data);
891         len_max = PAGE_SIZE - offset;
892         len = ((to_write > len_max) ? len_max : to_write);
893
894         skb->data_len = to_write;
895         skb->len += to_write;
896         skb->truesize += to_write;
897         atomic_add(to_write, &po->sk.sk_wmem_alloc);
898
899         while (likely(to_write)) {
900                 nr_frags = skb_shinfo(skb)->nr_frags;
901
902                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
903                         pr_err("Packet exceed the number of skb frags(%lu)\n",
904                                MAX_SKB_FRAGS);
905                         return -EFAULT;
906                 }
907
908                 flush_dcache_page(page);
909                 get_page(page);
910                 skb_fill_page_desc(skb,
911                                 nr_frags,
912                                 page++, offset, len);
913                 to_write -= len;
914                 offset = 0;
915                 len_max = PAGE_SIZE;
916                 len = ((to_write > len_max) ? len_max : to_write);
917         }
918
919         return tp_len;
920 }
921
/*
 * Transmit the frames user space has queued in the memory-mapped TX ring.
 *
 * @po:  packet socket owning the TX ring
 * @msg: message header; msg_name may carry a sockaddr_ll selecting the
 *       output device and protocol, msg_flags is checked for MSG_DONTWAIT
 *
 * po->pg_vec_lock is held for the whole call.  Returns the sum of tp_len
 * over all frames handed to dev_queue_xmit(), or a negative errno.
 */
922 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
923 {
924         struct socket *sock;
925         struct sk_buff *skb;
926         struct net_device *dev;
927         __be16 proto;
928         int ifindex, err, reserve = 0;
929         void *ph;
930         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
931         int tp_len, size_max;
932         unsigned char *addr;
933         int len_sum = 0;
934         int status = 0;
935
936         sock = po->sk.sk_socket;
937
938         mutex_lock(&po->pg_vec_lock);
939
940         err = -EBUSY;
            /*
             * No explicit destination: fall back to the interface index and
             * protocol stored on the socket.  Otherwise validate that the
             * supplied sockaddr_ll is long enough for its own sll_halen.
             */
941         if (saddr == NULL) {
942                 ifindex = po->ifindex;
943                 proto   = po->num;
944                 addr    = NULL;
945         } else {
946                 err = -EINVAL;
947                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
948                         goto out;
949                 if (msg->msg_namelen < (saddr->sll_halen
950                                         + offsetof(struct sockaddr_ll,
951                                                 sll_addr)))
952                         goto out;
953                 ifindex = saddr->sll_ifindex;
954                 proto   = saddr->sll_protocol;
955                 addr    = saddr->sll_addr;
956         }
957
            /* Takes a device reference that is dropped at out_put. */
958         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
959         err = -ENXIO;
960         if (unlikely(dev == NULL))
961                 goto out;
962
963         reserve = dev->hard_header_len;
964
965         err = -ENETDOWN;
966         if (unlikely(!(dev->flags & IFF_UP)))
967                 goto out_put;
968
            /*
             * Largest payload a frame may carry: what fits in a ring frame
             * after the tpacket header (less its sockaddr_ll part), capped
             * at the device MTU plus link-layer header length.
             */
969         size_max = po->tx_ring.frame_size
970                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
971
972         if (size_max > dev->mtu + reserve)
973                 size_max = dev->mtu + reserve;
974
975         do {
                    /* Next frame the user has marked TP_STATUS_SEND_REQUEST. */
976                 ph = packet_current_frame(po, &po->tx_ring,
977                                 TP_STATUS_SEND_REQUEST);
978
                    /*
                     * Nothing queued right now.  "continue" in a do/while
                     * jumps to the loop condition below, so with ph == NULL
                     * the loop only goes on for a blocking send that still
                     * has frames pending.
                     */
979                 if (unlikely(ph == NULL)) {
980                         schedule();
981                         continue;
982                 }
983
                    /*
                     * If the allocation fails, out_status writes this value
                     * back, i.e. the frame stays queued for a later send.
                     */
984                 status = TP_STATUS_SEND_REQUEST;
985                 skb = sock_alloc_send_skb(&po->sk,
986                                 LL_ALLOCATED_SPACE(dev)
987                                 + sizeof(struct sockaddr_ll),
988                                 0, &err);
989
990                 if (unlikely(skb == NULL))
991                         goto out_status;
992
993                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
994                                 addr);
995
                    /*
                     * Malformed frame.  With tp_loss set the frame is released
                     * back to user space (TP_STATUS_AVAILABLE) and skipped;
                     * otherwise it is flagged TP_STATUS_WRONG_FORMAT and the
                     * error is returned to the caller.
                     */
996                 if (unlikely(tp_len < 0)) {
997                         if (po->tp_loss) {
998                                 __packet_set_status(po, ph,
999                                                 TP_STATUS_AVAILABLE);
1000                                 packet_increment_head(&po->tx_ring);
1001                                 kfree_skb(skb);
1002                                 continue;
1003                         } else {
1004                                 status = TP_STATUS_WRONG_FORMAT;
1005                                 err = tp_len;
1006                                 goto out_status;
1007                         }
1008                 }
1009
                     /*
                      * From here on the skb carries tpacket_destruct_skb as its
                      * destructor and the frame is counted in tx_ring.pending.
                      */
1010                 skb->destructor = tpacket_destruct_skb;
1011                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1012                 atomic_inc(&po->tx_ring.pending);
1013
1014                 status = TP_STATUS_SEND_REQUEST;
1015                 err = dev_queue_xmit(skb);
                     /* Only positive return codes are examined; negative ones fall through. */
1016                 if (unlikely(err > 0)) {
1017                         err = net_xmit_errno(err);
1018                         if (err && __packet_get_status(po, ph) ==
1019                                    TP_STATUS_AVAILABLE) {
1020                                 /* skb was destructed already */
1021                                 skb = NULL;
1022                                 goto out_status;
1023                         }
1024                         /*
1025                          * skb was dropped but not destructed yet;
1026                          * let's treat it like congestion or err < 0
1027                          */
1028                         err = 0;
1029                 }
1030                 packet_increment_head(&po->tx_ring);
1031                 len_sum += tp_len;
             /*
              * Keep going while a frame was found in this pass, or, for a
              * blocking send, while previously queued frames are still pending.
              */
1032         } while (likely((ph != NULL) ||
1033                         ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1034                          (atomic_read(&po->tx_ring.pending))))
1035                 );
1036
1037         err = len_sum;
1038         goto out_put;
1039
             /* Error exits: restore/flag the current frame, then unwind. */
1040 out_status:
1041         __packet_set_status(po, ph, status);
1042         kfree_skb(skb);
1043 out_put:
1044         dev_put(dev);
1045 out:
1046         mutex_unlock(&po->pg_vec_lock);
1047         return err;
1048 }
1049
1050 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1051                                                size_t reserve, size_t len,
1052                                                size_t linear, int noblock,
1053                                                int *err)
1054 {
1055         struct sk_buff *skb;
1056
1057         /* Under a page?  Don't bother with paged skb. */
1058         if (prepad + len < PAGE_SIZE || !linear)
1059                 linear = len;
1060
1061         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1062                                    err);
1063         if (!skb)
1064                 return NULL;
1065
1066         skb_reserve(skb, reserve);
1067         skb_put(skb, linear);
1068         skb->data_len = len - linear;
1069         skb->len += len - linear;
1070
1071         return skb;
1072 }
1073
1074 static int packet_snd(struct socket *sock,
1075                           struct msghdr *msg, size_t len)
1076 {
1077         struct sock *sk = sock->sk;
1078         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1079         struct sk_buff *skb;
1080         struct net_device *dev;
1081         __be16 proto;
1082         unsigned char *addr;
1083         int ifindex, err, reserve = 0;
1084         struct virtio_net_hdr vnet_hdr = { 0 };
1085         int offset = 0;
1086         int vnet_hdr_len;
1087         struct packet_sock *po = pkt_sk(sk);
1088         unsigned short gso_type = 0;
1089
1090         /*
1091          *      Get and verify the address.
1092          */
1093
1094         if (saddr == NULL) {
1095                 ifindex = po->ifindex;
1096                 proto   = po->num;
1097                 addr    = NULL;
1098         } else {
1099                 err = -EINVAL;
1100                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1101                         goto out;
1102                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1103                         goto out;
1104                 ifindex = saddr->sll_ifindex;
1105                 proto   = saddr->sll_protocol;
1106                 addr    = saddr->sll_addr;
1107         }
1108
1109
1110         dev = dev_get_by_index(sock_net(sk), ifindex);
1111         err = -ENXIO;
1112         if (dev == NULL)
1113                 goto out_unlock;
1114         if (sock->type == SOCK_RAW)
1115                 reserve = dev->hard_header_len;
1116
1117         err = -ENETDOWN;
1118         if (!(dev->flags & IFF_UP))
1119                 goto out_unlock;
1120
1121         if (po->has_vnet_hdr) {
1122                 vnet_hdr_len = sizeof(vnet_hdr);
1123
1124                 err = -EINVAL;
1125                 if (len < vnet_hdr_len)
1126                         goto out_unlock;
1127
1128                 len -= vnet_hdr_len;
1129
1130                 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1131                                        vnet_hdr_len);
1132                 if (err < 0)
1133                         goto out_unlock;
1134
1135                 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1136                     (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1137                       vnet_hdr.hdr_len))
1138                         vnet_hdr.hdr_len = vnet_hdr.csum_start +
1139                                                  vnet_hdr.csum_offset + 2;
1140
1141                 err = -EINVAL;
1142                 if (vnet_hdr.hdr_len > len)
1143                         goto out_unlock;
1144
1145                 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1146                         switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1147                         case VIRTIO_NET_HDR_GSO_TCPV4:
1148                                 gso_type = SKB_GSO_TCPV4;
1149                                 break;
1150                         case VIRTIO_NET_HDR_GSO_TCPV6:
1151                                 gso_type = SKB_GSO_TCPV6;
1152                                 break;
1153                         case VIRTIO_NET_HDR_GSO_UDP:
1154                                 gso_type = SKB_GSO_UDP;
1155                                 break;
1156                         default:
1157                                 goto out_unlock;
1158                         }
1159
1160                         if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1161                                 gso_type |= SKB_GSO_TCP_ECN;
1162
1163                         if (vnet_hdr.gso_size == 0)
1164                                 goto out_unlock;
1165
1166                 }
1167         }
1168
1169         err = -EMSGSIZE;
1170         if (!gso_type && (len > dev->mtu+reserve))
1171                 goto out_unlock;
1172
1173         err = -ENOBUFS;
1174         skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1175                                LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1176                                msg->msg_flags & MSG_DONTWAIT, &err);
1177         if (skb == NULL)
1178                 goto out_unlock;
1179
1180         skb_set_network_header(skb, reserve);
1181
1182         err = -EINVAL;
1183         if (sock->type == SOCK_DGRAM &&
1184             (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1185                 goto out_free;
1186
1187         /* Returns -EFAULT on error */
1188         err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1189         if (err)
1190                 goto out_free;
1191
1192         skb->protocol = proto;
1193         skb->dev = dev;
1194         skb->priority = sk->sk_priority;
1195         skb->mark = sk->sk_mark;
1196
1197         if (po->has_vnet_hdr) {
1198                 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1199                         if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1200                                                   vnet_hdr.csum_offset)) {
1201                                 err = -EINVAL;
1202                                 goto out_free;
1203                         }
1204                 }
1205
1206                 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1207                 skb_shinfo(skb)->gso_type = gso_type;
1208
1209                 /* Header must be checked, and gso_segs computed. */
1210                 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1211                 skb_shinfo(skb)->gso_segs = 0;
1212
1213                 len += vnet_hdr_len;
1214         }
1215
1216         /*
1217          *      Now send it
1218          */
1219
1220         err = dev_queue_xmit(skb);
1221         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1222                 goto out_unlock;
1223
1224         dev_put(dev);
1225
1226         return len;
1227
1228 out_free:
1229         kfree_skb(skb);
1230 out_unlock:
1231         if (dev)
1232                 dev_put(dev);
1233 out:
1234         return err;
1235 }
1236
1237 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1238                 struct msghdr *msg, size_t len)
1239 {
1240         struct sock *sk = sock->sk;
1241         struct packet_sock *po = pkt_sk(sk);
1242         if (po->tx_ring.pg_vec)
1243                 return tpacket_snd(po, msg);
1244         else
1245                 return packet_snd(sock, msg, len);
1246 }
1247
1248 /*
1249  *      Close a PACKET socket. This is fairly simple. We immediately go
1250  *      to 'closed' state and remove our protocol entry in the device list.
1251  */
1252
1253 static int packet_release(struct socket *sock)
1254 {
1255         struct sock *sk = sock->sk;
1256         struct packet_sock *po;
1257         struct net *net;
1258         struct tpacket_req req;
1259
1260         if (!sk)
1261                 return 0;
1262
1263         net = sock_net(sk);
1264         po = pkt_sk(sk);
1265
1266         spin_lock_bh(&net->packet.sklist_lock);
1267         sk_del_node_init_rcu(sk);
1268         sock_prot_inuse_add(net, sk->sk_prot, -1);
1269         spin_unlock_bh(&net->packet.sklist_lock);
1270
1271         spin_lock(&po->bind_lock);
1272         if (po->running) {
1273                 /*
1274                  * Remove from protocol table
1275                  */
1276                 po->running = 0;
1277                 po->num = 0;
1278                 __dev_remove_pack(&po->prot_hook);
1279                 __sock_put(sk);
1280         }
1281         spin_unlock(&po->bind_lock);
1282
1283         packet_flush_mclist(sk);
1284
1285         memset(&req, 0, sizeof(req));
1286
1287         if (po->rx_ring.pg_vec)
1288                 packet_set_ring(sk, &req, 1, 0);
1289
1290         if (po->tx_ring.pg_vec)
1291                 packet_set_ring(sk, &req, 1, 1);
1292
1293         synchronize_net();
1294         /*
1295          *      Now the socket is dead. No more input will appear.
1296          */
1297         sock_orphan(sk);
1298         sock->sk = NULL;
1299
1300         /* Purge queues */
1301
1302         skb_queue_purge(&sk->sk_receive_queue);
1303         sk_refcnt_debug_release(sk);
1304
1305         sock_put(sk);
1306         return 0;
1307 }
1308
1309 /*
1310  *      Attach a packet hook.
1311  */
1312
1313 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1314 {
1315         struct packet_sock *po = pkt_sk(sk);
1316         /*
1317          *      Detach an existing hook if present.
1318          */
1319
1320         lock_sock(sk);
1321
1322         spin_lock(&po->bind_lock);
1323         if (po->running) {
1324                 __sock_put(sk);
1325                 po->running = 0;
1326                 po->num = 0;
1327                 spin_unlock(&po->bind_lock);
1328                 dev_remove_pack(&po->prot_hook);
1329                 spin_lock(&po->bind_lock);
1330         }
1331
1332         po->num = protocol;
1333         po->prot_hook.type = protocol;
1334         po->prot_hook.dev = dev;
1335
1336         po->ifindex = dev ? dev->ifindex : 0;
1337
1338         if (protocol == 0)
1339                 goto out_unlock;
1340
1341         if (!dev || (dev->flags & IFF_UP)) {
1342                 dev_add_pack(&po->prot_hook);
1343                 sock_hold(sk);
1344                 po->running = 1;
1345         } else {
1346                 sk->sk_err = ENETDOWN;
1347                 if (!sock_flag(sk, SOCK_DEAD))
1348                         sk->sk_error_report(sk);
1349         }
1350
1351 out_unlock:
1352         spin_unlock(&po->bind_lock);
1353         release_sock(sk);
1354         return 0;
1355 }
1356
1357 /*
1358  *      Bind a packet socket to a device
1359  */
1360
1361 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1362                             int addr_len)
1363 {
1364         struct sock *sk = sock->sk;
1365         char name[15];
1366         struct net_device *dev;
1367         int err = -ENODEV;
1368
1369         /*
1370          *      Check legality
1371          */
1372
1373         if (addr_len != sizeof(struct sockaddr))
1374                 return -EINVAL;
1375         strlcpy(name, uaddr->sa_data, sizeof(name));
1376
1377         dev = dev_get_by_name(sock_net(sk), name);
1378         if (dev) {
1379                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1380                 dev_put(dev);
1381         }
1382         return err;
1383 }
1384
1385 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1386 {
1387         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1388         struct sock *sk = sock->sk;
1389         struct net_device *dev = NULL;
1390         int err;
1391
1392
1393         /*
1394          *      Check legality
1395          */
1396
1397         if (addr_len < sizeof(struct sockaddr_ll))
1398                 return -EINVAL;
1399         if (sll->sll_family != AF_PACKET)
1400                 return -EINVAL;
1401
1402         if (sll->sll_ifindex) {
1403                 err = -ENODEV;
1404                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1405                 if (dev == NULL)
1406                         goto out;
1407         }
1408         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1409         if (dev)
1410                 dev_put(dev);
1411
1412 out:
1413         return err;
1414 }
1415
/*
 * Protocol descriptor for packet sockets.  It is handed to sk_alloc() in
 * packet_create(), so each socket is allocated with room for a
 * struct packet_sock, and it is the key used for the in-use accounting
 * done with sock_prot_inuse_add().
 */
1416 static struct proto packet_proto = {
1417         .name     = "PACKET",
1418         .owner    = THIS_MODULE,
1419         .obj_size = sizeof(struct packet_sock),
1420 };
1421
1422 /*
1423  *      Create a packet of type SOCK_PACKET.
1424  */
1425
1426 static int packet_create(struct net *net, struct socket *sock, int protocol,
1427                          int kern)
1428 {
1429         struct sock *sk;
1430         struct packet_sock *po;
1431         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1432         int err;
1433
1434         if (!capable(CAP_NET_RAW))
1435                 return -EPERM;
1436         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1437             sock->type != SOCK_PACKET)
1438                 return -ESOCKTNOSUPPORT;
1439
1440         sock->state = SS_UNCONNECTED;
1441
1442         err = -ENOBUFS;
1443         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1444         if (sk == NULL)
1445                 goto out;
1446
1447         sock->ops = &packet_ops;
1448         if (sock->type == SOCK_PACKET)
1449                 sock->ops = &packet_ops_spkt;
1450
1451         sock_init_data(sock, sk);
1452
1453         po = pkt_sk(sk);
1454         sk->sk_family = PF_PACKET;
1455         po->num = proto;
1456
1457         sk->sk_destruct = packet_sock_destruct;
1458         sk_refcnt_debug_inc(sk);
1459
1460         /*
1461          *      Attach a protocol block
1462          */
1463
1464         spin_lock_init(&po->bind_lock);
1465         mutex_init(&po->pg_vec_lock);
1466         po->prot_hook.func = packet_rcv;
1467
1468         if (sock->type == SOCK_PACKET)
1469                 po->prot_hook.func = packet_rcv_spkt;
1470
1471         po->prot_hook.af_packet_priv = sk;
1472
1473         if (proto) {
1474                 po->prot_hook.type = proto;
1475                 dev_add_pack(&po->prot_hook);
1476                 sock_hold(sk);
1477                 po->running = 1;
1478         }
1479
1480         spin_lock_bh(&net->packet.sklist_lock);
1481         sk_add_node_rcu(sk, &net->packet.sklist);
1482         sock_prot_inuse_add(net, &packet_proto, 1);
1483         spin_unlock_bh(&net->packet.sklist_lock);
1484
1485         return 0;
1486 out:
1487         return err;
1488 }
1489
1490 /*
1491  *      Pull a packet from our receive queue and hand it to the user.
1492  *      If necessary we block.
1493  */
1494
1495 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1496                           struct msghdr *msg, size_t len, int flags)
1497 {
1498         struct sock *sk = sock->sk;
1499         struct sk_buff *skb;
1500         int copied, err;
1501         struct sockaddr_ll *sll;
1502         int vnet_hdr_len = 0;
1503
1504         err = -EINVAL;
1505         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1506                 goto out;
1507
1508 #if 0
1509         /* What error should we return now? EUNATTACH? */
1510         if (pkt_sk(sk)->ifindex < 0)
1511                 return -ENODEV;
1512 #endif
1513
1514         /*
1515          *      Call the generic datagram receiver. This handles all sorts
1516          *      of horrible races and re-entrancy so we can forget about it
1517          *      in the protocol layers.
1518          *
1519          *      Now it will return ENETDOWN, if device have just gone down,
1520          *      but then it will block.
1521          */
1522
1523         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1524
1525         /*
1526          *      An error occurred so return it. Because skb_recv_datagram()
1527          *      handles the blocking we don't see and worry about blocking
1528          *      retries.
1529          */
1530
1531         if (skb == NULL)
1532                 goto out;
1533
1534         if (pkt_sk(sk)->has_vnet_hdr) {
1535                 struct virtio_net_hdr vnet_hdr = { 0 };
1536
1537                 err = -EINVAL;
1538                 vnet_hdr_len = sizeof(vnet_hdr);
1539                 if ((len -= vnet_hdr_len) < 0)
1540                         goto out_free;
1541
1542                 if (skb_is_gso(skb)) {
1543                         struct skb_shared_info *sinfo = skb_shinfo(skb);
1544
1545                         /* This is a hint as to how much should be linear. */
1546                         vnet_hdr.hdr_len = skb_headlen(skb);
1547                         vnet_hdr.gso_size = sinfo->gso_size;
1548                         if (sinfo->gso_type & SKB_GSO_TCPV4)
1549                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1550                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
1551                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1552                         else if (sinfo->gso_type & SKB_GSO_UDP)
1553                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1554                         else if (sinfo->gso_type & SKB_GSO_FCOE)
1555                                 goto out_free;
1556                         else
1557                                 BUG();
1558                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1559                                 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1560                 } else
1561                         vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1562
1563                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1564                         vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1565                         vnet_hdr.csum_start = skb->csum_start -
1566                                                         skb_headroom(skb);
1567                         vnet_hdr.csum_offset = skb->csum_offset;
1568                 } /* else everything is zero */
1569
1570                 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1571                                      vnet_hdr_len);
1572                 if (err < 0)
1573                         goto out_free;
1574         }
1575
1576         /*
1577          *      If the address length field is there to be filled in, we fill
1578          *      it in now.
1579          */
1580
1581         sll = &PACKET_SKB_CB(skb)->sa.ll;
1582         if (sock->type == SOCK_PACKET)
1583                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1584         else
1585                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1586
1587         /*
1588          *      You lose any data beyond the buffer you gave. If it worries a
1589          *      user program they can ask the device for its MTU anyway.
1590          */
1591
1592         copied = skb->len;
1593         if (copied > len) {
1594                 copied = len;
1595                 msg->msg_flags |= MSG_TRUNC;
1596         }
1597
1598         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1599         if (err)
1600                 goto out_free;
1601
1602         sock_recv_ts_and_drops(msg, sk, skb);
1603
1604         if (msg->msg_name)
1605                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1606                        msg->msg_namelen);
1607
1608         if (pkt_sk(sk)->auxdata) {
1609                 struct tpacket_auxdata aux;
1610
1611                 aux.tp_status = TP_STATUS_USER;
1612                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1613                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1614                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1615                 aux.tp_snaplen = skb->len;
1616                 aux.tp_mac = 0;
1617                 aux.tp_net = skb_network_offset(skb);
1618                 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1619
1620                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1621         }
1622
1623         /*
1624          *      Free or return the buffer as appropriate. Again this
1625          *      hides all the races and re-entrancy issues from us.
1626          */
1627         err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1628
1629 out_free:
1630         skb_free_datagram(sk, skb);
1631 out:
1632         return err;
1633 }
1634
1635 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1636                                int *uaddr_len, int peer)
1637 {
1638         struct net_device *dev;
1639         struct sock *sk = sock->sk;
1640
1641         if (peer)
1642                 return -EOPNOTSUPP;
1643
1644         uaddr->sa_family = AF_PACKET;
1645         rcu_read_lock();
1646         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1647         if (dev)
1648                 strlcpy(uaddr->sa_data, dev->name, 15);
1649         else
1650                 memset(uaddr->sa_data, 0, 14);
1651         rcu_read_unlock();
1652         *uaddr_len = sizeof(*uaddr);
1653
1654         return 0;
1655 }
1656
1657 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1658                           int *uaddr_len, int peer)
1659 {
1660         struct net_device *dev;
1661         struct sock *sk = sock->sk;
1662         struct packet_sock *po = pkt_sk(sk);
1663         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1664
1665         if (peer)
1666                 return -EOPNOTSUPP;
1667
1668         sll->sll_family = AF_PACKET;
1669         sll->sll_ifindex = po->ifindex;
1670         sll->sll_protocol = po->num;
1671         rcu_read_lock();
1672         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1673         if (dev) {
1674                 sll->sll_hatype = dev->type;
1675                 sll->sll_halen = dev->addr_len;
1676                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1677         } else {
1678                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1679                 sll->sll_halen = 0;
1680         }
1681         rcu_read_unlock();
1682         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1683
1684         return 0;
1685 }
1686
1687 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1688                          int what)
1689 {
1690         switch (i->type) {
1691         case PACKET_MR_MULTICAST:
1692                 if (i->alen != dev->addr_len)
1693                         return -EINVAL;
1694                 if (what > 0)
1695                         return dev_mc_add(dev, i->addr);
1696                 else
1697                         return dev_mc_del(dev, i->addr);
1698                 break;
1699         case PACKET_MR_PROMISC:
1700                 return dev_set_promiscuity(dev, what);
1701                 break;
1702         case PACKET_MR_ALLMULTI:
1703                 return dev_set_allmulti(dev, what);
1704                 break;
1705         case PACKET_MR_UNICAST:
1706                 if (i->alen != dev->addr_len)
1707                         return -EINVAL;
1708                 if (what > 0)
1709                         return dev_uc_add(dev, i->addr);
1710                 else
1711                         return dev_uc_del(dev, i->addr);
1712                 break;
1713         default:
1714                 break;
1715         }
1716         return 0;
1717 }
1718
1719 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1720 {
1721         for ( ; i; i = i->next) {
1722                 if (i->ifindex == dev->ifindex)
1723                         packet_dev_mc(dev, i, what);
1724         }
1725 }
1726
1727 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1728 {
1729         struct packet_sock *po = pkt_sk(sk);
1730         struct packet_mclist *ml, *i;
1731         struct net_device *dev;
1732         int err;
1733
1734         rtnl_lock();
1735
1736         err = -ENODEV;
1737         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1738         if (!dev)
1739                 goto done;
1740
1741         err = -EINVAL;
1742         if (mreq->mr_alen > dev->addr_len)
1743                 goto done;
1744
1745         err = -ENOBUFS;
1746         i = kmalloc(sizeof(*i), GFP_KERNEL);
1747         if (i == NULL)
1748                 goto done;
1749
1750         err = 0;
1751         for (ml = po->mclist; ml; ml = ml->next) {
1752                 if (ml->ifindex == mreq->mr_ifindex &&
1753                     ml->type == mreq->mr_type &&
1754                     ml->alen == mreq->mr_alen &&
1755                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1756                         ml->count++;
1757                         /* Free the new element ... */
1758                         kfree(i);
1759                         goto done;
1760                 }
1761         }
1762
1763         i->type = mreq->mr_type;
1764         i->ifindex = mreq->mr_ifindex;
1765         i->alen = mreq->mr_alen;
1766         memcpy(i->addr, mreq->mr_address, i->alen);
1767         i->count = 1;
1768         i->next = po->mclist;
1769         po->mclist = i;
1770         err = packet_dev_mc(dev, i, 1);
1771         if (err) {
1772                 po->mclist = i->next;
1773                 kfree(i);
1774         }
1775
1776 done:
1777         rtnl_unlock();
1778         return err;
1779 }
1780
1781 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1782 {
1783         struct packet_mclist *ml, **mlp;
1784
1785         rtnl_lock();
1786
1787         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1788                 if (ml->ifindex == mreq->mr_ifindex &&
1789                     ml->type == mreq->mr_type &&
1790                     ml->alen == mreq->mr_alen &&
1791                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1792                         if (--ml->count == 0) {
1793                                 struct net_device *dev;
1794                                 *mlp = ml->next;
1795                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1796                                 if (dev)
1797                                         packet_dev_mc(dev, ml, -1);
1798                                 kfree(ml);
1799                         }
1800                         rtnl_unlock();
1801                         return 0;
1802                 }
1803         }
1804         rtnl_unlock();
1805         return -EADDRNOTAVAIL;
1806 }
1807
1808 static void packet_flush_mclist(struct sock *sk)
1809 {
1810         struct packet_sock *po = pkt_sk(sk);
1811         struct packet_mclist *ml;
1812
1813         if (!po->mclist)
1814                 return;
1815
1816         rtnl_lock();
1817         while ((ml = po->mclist) != NULL) {
1818                 struct net_device *dev;
1819
1820                 po->mclist = ml->next;
1821                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1822                 if (dev != NULL)
1823                         packet_dev_mc(dev, ml, -1);
1824                 kfree(ml);
1825         }
1826         rtnl_unlock();
1827 }
1828
1829 static int
1830 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1831 {
1832         struct sock *sk = sock->sk;
1833         struct packet_sock *po = pkt_sk(sk);
1834         int ret;
1835
1836         if (level != SOL_PACKET)
1837                 return -ENOPROTOOPT;
1838
1839         switch (optname) {
1840         case PACKET_ADD_MEMBERSHIP:
1841         case PACKET_DROP_MEMBERSHIP:
1842         {
1843                 struct packet_mreq_max mreq;
1844                 int len = optlen;
1845                 memset(&mreq, 0, sizeof(mreq));
1846                 if (len < sizeof(struct packet_mreq))
1847                         return -EINVAL;
1848                 if (len > sizeof(mreq))
1849                         len = sizeof(mreq);
1850                 if (copy_from_user(&mreq, optval, len))
1851                         return -EFAULT;
1852                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1853                         return -EINVAL;
1854                 if (optname == PACKET_ADD_MEMBERSHIP)
1855                         ret = packet_mc_add(sk, &mreq);
1856                 else
1857                         ret = packet_mc_drop(sk, &mreq);
1858                 return ret;
1859         }
1860
1861         case PACKET_RX_RING:
1862         case PACKET_TX_RING:
1863         {
1864                 struct tpacket_req req;
1865
1866                 if (optlen < sizeof(req))
1867                         return -EINVAL;
1868                 if (pkt_sk(sk)->has_vnet_hdr)
1869                         return -EINVAL;
1870                 if (copy_from_user(&req, optval, sizeof(req)))
1871                         return -EFAULT;
1872                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1873         }
1874         case PACKET_COPY_THRESH:
1875         {
1876                 int val;
1877
1878                 if (optlen != sizeof(val))
1879                         return -EINVAL;
1880                 if (copy_from_user(&val, optval, sizeof(val)))
1881                         return -EFAULT;
1882
1883                 pkt_sk(sk)->copy_thresh = val;
1884                 return 0;
1885         }
1886         case PACKET_VERSION:
1887         {
1888                 int val;
1889
1890                 if (optlen != sizeof(val))
1891                         return -EINVAL;
1892                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1893                         return -EBUSY;
1894                 if (copy_from_user(&val, optval, sizeof(val)))
1895                         return -EFAULT;
1896                 switch (val) {
1897                 case TPACKET_V1:
1898                 case TPACKET_V2:
1899                         po->tp_version = val;
1900                         return 0;
1901                 default:
1902                         return -EINVAL;
1903                 }
1904         }
1905         case PACKET_RESERVE:
1906         {
1907                 unsigned int val;
1908
1909                 if (optlen != sizeof(val))
1910                         return -EINVAL;
1911                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1912                         return -EBUSY;
1913                 if (copy_from_user(&val, optval, sizeof(val)))
1914                         return -EFAULT;
1915                 po->tp_reserve = val;
1916                 return 0;
1917         }
1918         case PACKET_LOSS:
1919         {
1920                 unsigned int val;
1921
1922                 if (optlen != sizeof(val))
1923                         return -EINVAL;
1924                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1925                         return -EBUSY;
1926                 if (copy_from_user(&val, optval, sizeof(val)))
1927                         return -EFAULT;
1928                 po->tp_loss = !!val;
1929                 return 0;
1930         }
1931         case PACKET_AUXDATA:
1932         {
1933                 int val;
1934
1935                 if (optlen < sizeof(val))
1936                         return -EINVAL;
1937                 if (copy_from_user(&val, optval, sizeof(val)))
1938                         return -EFAULT;
1939
1940                 po->auxdata = !!val;
1941                 return 0;
1942         }
1943         case PACKET_ORIGDEV:
1944         {
1945                 int val;
1946
1947                 if (optlen < sizeof(val))
1948                         return -EINVAL;
1949                 if (copy_from_user(&val, optval, sizeof(val)))
1950                         return -EFAULT;
1951
1952                 po->origdev = !!val;
1953                 return 0;
1954         }
1955         case PACKET_VNET_HDR:
1956         {
1957                 int val;
1958
1959                 if (sock->type != SOCK_RAW)
1960                         return -EINVAL;
1961                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1962                         return -EBUSY;
1963                 if (optlen < sizeof(val))
1964                         return -EINVAL;
1965                 if (copy_from_user(&val, optval, sizeof(val)))
1966                         return -EFAULT;
1967
1968                 po->has_vnet_hdr = !!val;
1969                 return 0;
1970         }
1971         default:
1972                 return -ENOPROTOOPT;
1973         }
1974 }
1975
1976 static int packet_getsockopt(struct socket *sock, int level, int optname,
1977                              char __user *optval, int __user *optlen)
1978 {
1979         int len;
1980         int val;
1981         struct sock *sk = sock->sk;
1982         struct packet_sock *po = pkt_sk(sk);
1983         void *data;
1984         struct tpacket_stats st;
1985
1986         if (level != SOL_PACKET)
1987                 return -ENOPROTOOPT;
1988
1989         if (get_user(len, optlen))
1990                 return -EFAULT;
1991
1992         if (len < 0)
1993                 return -EINVAL;
1994
1995         switch (optname) {
1996         case PACKET_STATISTICS:
1997                 if (len > sizeof(struct tpacket_stats))
1998                         len = sizeof(struct tpacket_stats);
1999                 spin_lock_bh(&sk->sk_receive_queue.lock);
2000                 st = po->stats;
2001                 memset(&po->stats, 0, sizeof(st));
2002                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2003                 st.tp_packets += st.tp_drops;
2004
2005                 data = &st;
2006                 break;
2007         case PACKET_AUXDATA:
2008                 if (len > sizeof(int))
2009                         len = sizeof(int);
2010                 val = po->auxdata;
2011
2012                 data = &val;
2013                 break;
2014         case PACKET_ORIGDEV:
2015                 if (len > sizeof(int))
2016                         len = sizeof(int);
2017                 val = po->origdev;
2018
2019                 data = &val;
2020                 break;
2021         case PACKET_VNET_HDR:
2022                 if (len > sizeof(int))
2023                         len = sizeof(int);
2024                 val = po->has_vnet_hdr;
2025
2026                 data = &val;
2027                 break;
2028         case PACKET_VERSION:
2029                 if (len > sizeof(int))
2030                         len = sizeof(int);
2031                 val = po->tp_version;
2032                 data = &val;
2033                 break;
2034         case PACKET_HDRLEN:
2035                 if (len > sizeof(int))
2036                         len = sizeof(int);
2037                 if (copy_from_user(&val, optval, len))
2038                         return -EFAULT;
2039                 switch (val) {
2040                 case TPACKET_V1:
2041                         val = sizeof(struct tpacket_hdr);
2042                         break;
2043                 case TPACKET_V2:
2044                         val = sizeof(struct tpacket2_hdr);
2045                         break;
2046                 default:
2047                         return -EINVAL;
2048                 }
2049                 data = &val;
2050                 break;
2051         case PACKET_RESERVE:
2052                 if (len > sizeof(unsigned int))
2053                         len = sizeof(unsigned int);
2054                 val = po->tp_reserve;
2055                 data = &val;
2056                 break;
2057         case PACKET_LOSS:
2058                 if (len > sizeof(unsigned int))
2059                         len = sizeof(unsigned int);
2060                 val = po->tp_loss;
2061                 data = &val;
2062                 break;
2063         default:
2064                 return -ENOPROTOOPT;
2065         }
2066
2067         if (put_user(len, optlen))
2068                 return -EFAULT;
2069         if (copy_to_user(optval, data, len))
2070                 return -EFAULT;
2071         return 0;
2072 }
2073
2074
2075 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2076 {
2077         struct sock *sk;
2078         struct hlist_node *node;
2079         struct net_device *dev = data;
2080         struct net *net = dev_net(dev);
2081
2082         rcu_read_lock();
2083         sk_for_each_rcu(sk, node, &net->packet.sklist) {
2084                 struct packet_sock *po = pkt_sk(sk);
2085
2086                 switch (msg) {
2087                 case NETDEV_UNREGISTER:
2088                         if (po->mclist)
2089                                 packet_dev_mclist(dev, po->mclist, -1);
2090                         /* fallthrough */
2091
2092                 case NETDEV_DOWN:
2093                         if (dev->ifindex == po->ifindex) {
2094                                 spin_lock(&po->bind_lock);
2095                                 if (po->running) {
2096                                         __dev_remove_pack(&po->prot_hook);
2097                                         __sock_put(sk);
2098                                         po->running = 0;
2099                                         sk->sk_err = ENETDOWN;
2100                                         if (!sock_flag(sk, SOCK_DEAD))
2101                                                 sk->sk_error_report(sk);
2102                                 }
2103                                 if (msg == NETDEV_UNREGISTER) {
2104                                         po->ifindex = -1;
2105                                         po->prot_hook.dev = NULL;
2106                                 }
2107                                 spin_unlock(&po->bind_lock);
2108                         }
2109                         break;
2110                 case NETDEV_UP:
2111                         if (dev->ifindex == po->ifindex) {
2112                                 spin_lock(&po->bind_lock);
2113                                 if (po->num && !po->running) {
2114                                         dev_add_pack(&po->prot_hook);
2115                                         sock_hold(sk);
2116                                         po->running = 1;
2117                                 }
2118                                 spin_unlock(&po->bind_lock);
2119                         }
2120                         break;
2121                 }
2122         }
2123         rcu_read_unlock();
2124         return NOTIFY_DONE;
2125 }
2126
2127
2128 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2129                         unsigned long arg)
2130 {
2131         struct sock *sk = sock->sk;
2132
2133         switch (cmd) {
2134         case SIOCOUTQ:
2135         {
2136                 int amount = sk_wmem_alloc_get(sk);
2137
2138                 return put_user(amount, (int __user *)arg);
2139         }
2140         case SIOCINQ:
2141         {
2142                 struct sk_buff *skb;
2143                 int amount = 0;
2144
2145                 spin_lock_bh(&sk->sk_receive_queue.lock);
2146                 skb = skb_peek(&sk->sk_receive_queue);
2147                 if (skb)
2148                         amount = skb->len;
2149                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2150                 return put_user(amount, (int __user *)arg);
2151         }
2152         case SIOCGSTAMP:
2153                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2154         case SIOCGSTAMPNS:
2155                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2156
2157 #ifdef CONFIG_INET
2158         case SIOCADDRT:
2159         case SIOCDELRT:
2160         case SIOCDARP:
2161         case SIOCGARP:
2162         case SIOCSARP:
2163         case SIOCGIFADDR:
2164         case SIOCSIFADDR:
2165         case SIOCGIFBRDADDR:
2166         case SIOCSIFBRDADDR:
2167         case SIOCGIFNETMASK:
2168         case SIOCSIFNETMASK:
2169         case SIOCGIFDSTADDR:
2170         case SIOCSIFDSTADDR:
2171         case SIOCSIFFLAGS:
2172                 if (!net_eq(sock_net(sk), &init_net))
2173                         return -ENOIOCTLCMD;
2174                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2175 #endif
2176
2177         default:
2178                 return -ENOIOCTLCMD;
2179         }
2180         return 0;
2181 }
2182
2183 static unsigned int packet_poll(struct file *file, struct socket *sock,
2184                                 poll_table *wait)
2185 {
2186         struct sock *sk = sock->sk;
2187         struct packet_sock *po = pkt_sk(sk);
2188         unsigned int mask = datagram_poll(file, sock, wait);
2189
2190         spin_lock_bh(&sk->sk_receive_queue.lock);
2191         if (po->rx_ring.pg_vec) {
2192                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2193                         mask |= POLLIN | POLLRDNORM;
2194         }
2195         spin_unlock_bh(&sk->sk_receive_queue.lock);
2196         spin_lock_bh(&sk->sk_write_queue.lock);
2197         if (po->tx_ring.pg_vec) {
2198                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2199                         mask |= POLLOUT | POLLWRNORM;
2200         }
2201         spin_unlock_bh(&sk->sk_write_queue.lock);
2202         return mask;
2203 }
2204
2205
/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */
2209
2210 static void packet_mm_open(struct vm_area_struct *vma)
2211 {
2212         struct file *file = vma->vm_file;
2213         struct socket *sock = file->private_data;
2214         struct sock *sk = sock->sk;
2215
2216         if (sk)
2217                 atomic_inc(&pkt_sk(sk)->mapped);
2218 }
2219
2220 static void packet_mm_close(struct vm_area_struct *vma)
2221 {
2222         struct file *file = vma->vm_file;
2223         struct socket *sock = file->private_data;
2224         struct sock *sk = sock->sk;
2225
2226         if (sk)
2227                 atomic_dec(&pkt_sk(sk)->mapped);
2228 }
2229
2230 static const struct vm_operations_struct packet_mmap_ops = {
2231         .open   =       packet_mm_open,
2232         .close  =       packet_mm_close,
2233 };
2234
/*
 * Free a block vector: every non-NULL block of 2^@order pages among the
 * first @len slots, then the vector itself.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		char *block = pg_vec[i];

		/* Slots may be empty after a partially failed allocation. */
		if (unlikely(!block))
			continue;
		free_pages((unsigned long) block, order);
	}

	kfree(pg_vec);
}
2245
2246 static inline char *alloc_one_pg_vec_page(unsigned long order)
2247 {
2248         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2249
2250         return (char *) __get_free_pages(gfp_flags, order);
2251 }
2252
2253 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2254 {
2255         unsigned int block_nr = req->tp_block_nr;
2256         char **pg_vec;
2257         int i;
2258
2259         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2260         if (unlikely(!pg_vec))
2261                 goto out;
2262
2263         for (i = 0; i < block_nr; i++) {
2264                 pg_vec[i] = alloc_one_pg_vec_page(order);
2265                 if (unlikely(!pg_vec[i]))
2266                         goto out_free_pgvec;
2267         }
2268
2269 out:
2270         return pg_vec;
2271
2272 out_free_pgvec:
2273         free_pg_vec(pg_vec, order, block_nr);
2274         pg_vec = NULL;
2275         goto out;
2276 }
2277
2278 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2279                 int closing, int tx_ring)
2280 {
2281         char **pg_vec = NULL;
2282         struct packet_sock *po = pkt_sk(sk);
2283         int was_running, order = 0;
2284         struct packet_ring_buffer *rb;
2285         struct sk_buff_head *rb_queue;
2286         __be16 num;
2287         int err;
2288
2289         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2290         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2291
2292         err = -EBUSY;
2293         if (!closing) {
2294                 if (atomic_read(&po->mapped))
2295                         goto out;
2296                 if (atomic_read(&rb->pending))
2297                         goto out;
2298         }
2299
2300         if (req->tp_block_nr) {
2301                 /* Sanity tests and some calculations */
2302                 err = -EBUSY;
2303                 if (unlikely(rb->pg_vec))
2304                         goto out;
2305
2306                 switch (po->tp_version) {
2307                 case TPACKET_V1:
2308                         po->tp_hdrlen = TPACKET_HDRLEN;
2309                         break;
2310                 case TPACKET_V2:
2311                         po->tp_hdrlen = TPACKET2_HDRLEN;
2312                         break;
2313                 }
2314
2315                 err = -EINVAL;
2316                 if (unlikely((int)req->tp_block_size <= 0))
2317                         goto out;
2318                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2319                         goto out;
2320                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2321                                         po->tp_reserve))
2322                         goto out;
2323                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2324                         goto out;
2325
2326                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2327                 if (unlikely(rb->frames_per_block <= 0))
2328                         goto out;
2329                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2330                                         req->tp_frame_nr))
2331                         goto out;
2332
2333                 err = -ENOMEM;
2334                 order = get_order(req->tp_block_size);
2335                 pg_vec = alloc_pg_vec(req, order);
2336                 if (unlikely(!pg_vec))
2337                         goto out;
2338         }
2339         /* Done */
2340         else {
2341                 err = -EINVAL;
2342                 if (unlikely(req->tp_frame_nr))
2343                         goto out;
2344         }
2345
2346         lock_sock(sk);
2347
2348         /* Detach socket from network */
2349         spin_lock(&po->bind_lock);
2350         was_running = po->running;
2351         num = po->num;
2352         if (was_running) {
2353                 __dev_remove_pack(&po->prot_hook);
2354                 po->num = 0;
2355                 po->running = 0;
2356                 __sock_put(sk);
2357         }
2358         spin_unlock(&po->bind_lock);
2359
2360         synchronize_net();
2361
2362         err = -EBUSY;
2363         mutex_lock(&po->pg_vec_lock);
2364         if (closing || atomic_read(&po->mapped) == 0) {
2365                 err = 0;
2366 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2367                 spin_lock_bh(&rb_queue->lock);
2368                 pg_vec = XC(rb->pg_vec, pg_vec);
2369                 rb->frame_max = (req->tp_frame_nr - 1);
2370                 rb->head = 0;
2371                 rb->frame_size = req->tp_frame_size;
2372                 spin_unlock_bh(&rb_queue->lock);
2373
2374                 order = XC(rb->pg_vec_order, order);
2375                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2376
2377                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2378                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2379                                                 tpacket_rcv : packet_rcv;
2380                 skb_queue_purge(rb_queue);
2381 #undef XC
2382                 if (atomic_read(&po->mapped))
2383                         pr_err("packet_mmap: vma is busy: %d\n",
2384                                atomic_read(&po->mapped));
2385         }
2386         mutex_unlock(&po->pg_vec_lock);
2387
2388         spin_lock(&po->bind_lock);
2389         if (was_running && !po->running) {
2390                 sock_hold(sk);
2391                 po->running = 1;
2392                 po->num = num;
2393                 dev_add_pack(&po->prot_hook);
2394         }
2395         spin_unlock(&po->bind_lock);
2396
2397         release_sock(sk);
2398
2399         if (pg_vec)
2400                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2401 out:
2402         return err;
2403 }
2404
2405 static int packet_mmap(struct file *file, struct socket *sock,
2406                 struct vm_area_struct *vma)
2407 {
2408         struct sock *sk = sock->sk;
2409         struct packet_sock *po = pkt_sk(sk);
2410         unsigned long size, expected_size;
2411         struct packet_ring_buffer *rb;
2412         unsigned long start;
2413         int err = -EINVAL;
2414         int i;
2415
2416         if (vma->vm_pgoff)
2417                 return -EINVAL;
2418
2419         mutex_lock(&po->pg_vec_lock);
2420
2421         expected_size = 0;
2422         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2423                 if (rb->pg_vec) {
2424                         expected_size += rb->pg_vec_len
2425                                                 * rb->pg_vec_pages
2426                                                 * PAGE_SIZE;
2427                 }
2428         }
2429
2430         if (expected_size == 0)
2431                 goto out;
2432
2433         size = vma->vm_end - vma->vm_start;
2434         if (size != expected_size)
2435                 goto out;
2436
2437         start = vma->vm_start;
2438         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2439                 if (rb->pg_vec == NULL)
2440                         continue;
2441
2442                 for (i = 0; i < rb->pg_vec_len; i++) {
2443                         struct page *page = virt_to_page(rb->pg_vec[i]);
2444                         int pg_num;
2445
2446                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2447                                         pg_num++, page++) {
2448                                 err = vm_insert_page(vma, start, page);
2449                                 if (unlikely(err))
2450                                         goto out;
2451                                 start += PAGE_SIZE;
2452                         }
2453                 }
2454         }
2455
2456         atomic_inc(&po->mapped);
2457         vma->vm_ops = &packet_mmap_ops;
2458         err = 0;
2459
2460 out:
2461         mutex_unlock(&po->pg_vec_lock);
2462         return err;
2463 }
2464
2465 static const struct proto_ops packet_ops_spkt = {
2466         .family =       PF_PACKET,
2467         .owner =        THIS_MODULE,
2468         .release =      packet_release,
2469         .bind =         packet_bind_spkt,
2470         .connect =      sock_no_connect,
2471         .socketpair =   sock_no_socketpair,
2472         .accept =       sock_no_accept,
2473         .getname =      packet_getname_spkt,
2474         .poll =         datagram_poll,
2475         .ioctl =        packet_ioctl,
2476         .listen =       sock_no_listen,
2477         .shutdown =     sock_no_shutdown,
2478         .setsockopt =   sock_no_setsockopt,
2479         .getsockopt =   sock_no_getsockopt,
2480         .sendmsg =      packet_sendmsg_spkt,
2481         .recvmsg =      packet_recvmsg,
2482         .mmap =         sock_no_mmap,
2483         .sendpage =     sock_no_sendpage,
2484 };
2485
2486 static const struct proto_ops packet_ops = {
2487         .family =       PF_PACKET,
2488         .owner =        THIS_MODULE,
2489         .release =      packet_release,
2490         .bind =         packet_bind,
2491         .connect =      sock_no_connect,
2492         .socketpair =   sock_no_socketpair,
2493         .accept =       sock_no_accept,
2494         .getname =      packet_getname,
2495         .poll =         packet_poll,
2496         .ioctl =        packet_ioctl,
2497         .listen =       sock_no_listen,
2498         .shutdown =     sock_no_shutdown,
2499         .setsockopt =   packet_setsockopt,
2500         .getsockopt =   packet_getsockopt,
2501         .sendmsg =      packet_sendmsg,
2502         .recvmsg =      packet_recvmsg,
2503         .mmap =         packet_mmap,
2504         .sendpage =     sock_no_sendpage,
2505 };
2506
2507 static const struct net_proto_family packet_family_ops = {
2508         .family =       PF_PACKET,
2509         .create =       packet_create,
2510         .owner  =       THIS_MODULE,
2511 };
2512
2513 static struct notifier_block packet_netdev_notifier = {
2514         .notifier_call =        packet_notifier,
2515 };
2516
2517 #ifdef CONFIG_PROC_FS
2518
2519 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2520         __acquires(RCU)
2521 {
2522         struct net *net = seq_file_net(seq);
2523
2524         rcu_read_lock();
2525         return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2526 }
2527
2528 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2529 {
2530         struct net *net = seq_file_net(seq);
2531         return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2532 }
2533
2534 static void packet_seq_stop(struct seq_file *seq, void *v)
2535         __releases(RCU)
2536 {
2537         rcu_read_unlock();
2538 }
2539
2540 static int packet_seq_show(struct seq_file *seq, void *v)
2541 {
2542         if (v == SEQ_START_TOKEN)
2543                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2544         else {
2545                 struct sock *s = sk_entry(v);
2546                 const struct packet_sock *po = pkt_sk(s);
2547
2548                 seq_printf(seq,
2549                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2550                            s,
2551                            atomic_read(&s->sk_refcnt),
2552                            s->sk_type,
2553                            ntohs(po->num),
2554                            po->ifindex,
2555                            po->running,
2556                            atomic_read(&s->sk_rmem_alloc),
2557                            sock_i_uid(s),
2558                            sock_i_ino(s));
2559         }
2560
2561         return 0;
2562 }
2563
2564 static const struct seq_operations packet_seq_ops = {
2565         .start  = packet_seq_start,
2566         .next   = packet_seq_next,
2567         .stop   = packet_seq_stop,
2568         .show   = packet_seq_show,
2569 };
2570
2571 static int packet_seq_open(struct inode *inode, struct file *file)
2572 {
2573         return seq_open_net(inode, file, &packet_seq_ops,
2574                             sizeof(struct seq_net_private));
2575 }
2576
2577 static const struct file_operations packet_seq_fops = {
2578         .owner          = THIS_MODULE,
2579         .open           = packet_seq_open,
2580         .read           = seq_read,
2581         .llseek         = seq_lseek,
2582         .release        = seq_release_net,
2583 };
2584
2585 #endif
2586
2587 static int __net_init packet_net_init(struct net *net)
2588 {
2589         spin_lock_init(&net->packet.sklist_lock);
2590         INIT_HLIST_HEAD(&net->packet.sklist);
2591
2592         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2593                 return -ENOMEM;
2594
2595         return 0;
2596 }
2597
2598 static void __net_exit packet_net_exit(struct net *net)
2599 {
2600         proc_net_remove(net, "packet");
2601 }
2602
2603 static struct pernet_operations packet_net_ops = {
2604         .init = packet_net_init,
2605         .exit = packet_net_exit,
2606 };
2607
2608
2609 static void __exit packet_exit(void)
2610 {
2611         unregister_netdevice_notifier(&packet_netdev_notifier);
2612         unregister_pernet_subsys(&packet_net_ops);
2613         sock_unregister(PF_PACKET);
2614         proto_unregister(&packet_proto);
2615 }
2616
2617 static int __init packet_init(void)
2618 {
2619         int rc = proto_register(&packet_proto, 0);
2620
2621         if (rc != 0)
2622                 goto out;
2623
2624         sock_register(&packet_family_ops);
2625         register_pernet_subsys(&packet_net_ops);
2626         register_netdevice_notifier(&packet_netdev_notifier);
2627 out:
2628         return rc;
2629 }
2630
2631 module_init(packet_init);
2632 module_exit(packet_exit);
2633 MODULE_LICENSE("GPL");
2634 MODULE_ALIAS_NETPROTO(PF_PACKET);