packet: Enhance AF_PACKET implementation to not require high order contiguous memory...
[linux-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <net/net_namespace.h>
66 #include <net/ip.h>
67 #include <net/protocol.h>
68 #include <linux/skbuff.h>
69 #include <net/sock.h>
70 #include <linux/errno.h>
71 #include <linux/timer.h>
72 #include <asm/system.h>
73 #include <asm/uaccess.h>
74 #include <asm/ioctls.h>
75 #include <asm/page.h>
76 #include <asm/cacheflush.h>
77 #include <asm/io.h>
78 #include <linux/proc_fs.h>
79 #include <linux/seq_file.h>
80 #include <linux/poll.h>
81 #include <linux/module.h>
82 #include <linux/init.h>
83 #include <linux/mutex.h>
84 #include <linux/if_vlan.h>
85 #include <linux/virtio_net.h>
86 #include <linux/errqueue.h>
87 #include <linux/net_tstamp.h>
88
89 #ifdef CONFIG_INET
90 #include <net/inet_common.h>
91 #endif
92
93 /*
94    Assumptions:
95    - if device has no dev->hard_header routine, it adds and removes ll header
96      inside itself. In this case ll header is invisible outside of device,
97      but higher levels still should reserve dev->hard_header_len.
98      Some devices are clever enough to reallocate the skb when the header
99      does not fit into the reserved space (tunnel); others are silly
100      (PPP).
101    - packet socket receives packets with pulled ll header,
102      so that SOCK_RAW should push it back.
103
104 On receive:
105 -----------
106
107 Incoming, dev->hard_header!=NULL
108    mac_header -> ll header
109    data       -> data
110
111 Outgoing, dev->hard_header!=NULL
112    mac_header -> ll header
113    data       -> ll header
114
115 Incoming, dev->hard_header==NULL
116    mac_header -> UNKNOWN position. It is very likely that it points to the ll
117                  header.  PPP does this, which is wrong because it introduces
118                  asymmetry between the rx and tx paths.
119    data       -> data
120
121 Outgoing, dev->hard_header==NULL
122    mac_header -> data. ll header is still not built!
123    data       -> data
124
125 Resume
126   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
127
128
129 On transmit:
130 ------------
131
132 dev->hard_header != NULL
133    mac_header -> ll header
134    data       -> ll header
135
136 dev->hard_header == NULL (ll header is added by device, we cannot control it)
137    mac_header -> data
138    data       -> data
139
140    We should set nh.raw on output to the correct position,
141    because the packet classifier depends on it.
142  */
143
144 /* Private packet socket structures. */
145
146 struct packet_mclist {
147         struct packet_mclist    *next;
148         int                     ifindex;
149         int                     count;
150         unsigned short          type;
151         unsigned short          alen;
152         unsigned char           addr[MAX_ADDR_LEN];
153 };
154 /* identical to struct packet_mreq except it has
155  * a longer address field.
156  */
157 struct packet_mreq_max {
158         int             mr_ifindex;
159         unsigned short  mr_type;
160         unsigned short  mr_alen;
161         unsigned char   mr_address[MAX_ADDR_LEN];
162 };
163
/* Forward declaration; the function is defined elsewhere in this file. */
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing,
			   int tx_ring);
166
/*
 * Bit for pgv.flags.
 * NOTE(review): presumably marks a block that was obtained from vmalloc();
 * confirm at the allocation site.
 */
#define PGV_FROM_VMALLOC 1

/* One block of a ring buffer: its base address plus per-block flags. */
struct pgv {
	char		*buffer;
	unsigned char	flags;
};
172
173 struct packet_ring_buffer {
174         struct pgv              *pg_vec;
175         unsigned int            head;
176         unsigned int            frames_per_block;
177         unsigned int            frame_size;
178         unsigned int            frame_max;
179
180         unsigned int            pg_vec_order;
181         unsigned int            pg_vec_pages;
182         unsigned int            pg_vec_len;
183
184         atomic_t                pending;
185 };
186
/* Forward declarations needed before struct packet_sock is defined. */
struct packet_sock;

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
static void packet_flush_mclist(struct sock *sk);
191
192 struct packet_sock {
193         /* struct sock has to be the first member of packet_sock */
194         struct sock             sk;
195         struct tpacket_stats    stats;
196         struct packet_ring_buffer       rx_ring;
197         struct packet_ring_buffer       tx_ring;
198         int                     copy_thresh;
199         spinlock_t              bind_lock;
200         struct mutex            pg_vec_lock;
201         unsigned int            running:1,      /* prot_hook is attached*/
202                                 auxdata:1,
203                                 origdev:1,
204                                 has_vnet_hdr:1;
205         int                     ifindex;        /* bound device         */
206         __be16                  num;
207         struct packet_mclist    *mclist;
208         atomic_t                mapped;
209         enum tpacket_versions   tp_version;
210         unsigned int            tp_hdrlen;
211         unsigned int            tp_reserve;
212         unsigned int            tp_loss:1;
213         unsigned int            tp_tstamp;
214         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
215 };
216
217 struct packet_skb_cb {
218         unsigned int origlen;
219         union {
220                 struct sockaddr_pkt pkt;
221                 struct sockaddr_ll ll;
222         } sa;
223 };
224
225 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
226
227 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
228 {
229         union {
230                 struct tpacket_hdr *h1;
231                 struct tpacket2_hdr *h2;
232                 void *raw;
233         } h;
234
235         h.raw = frame;
236         switch (po->tp_version) {
237         case TPACKET_V1:
238                 h.h1->tp_status = status;
239                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
240                 break;
241         case TPACKET_V2:
242                 h.h2->tp_status = status;
243                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
244                 break;
245         default:
246                 pr_err("TPACKET version not supported\n");
247                 BUG();
248         }
249
250         smp_wmb();
251 }
252
253 static int __packet_get_status(struct packet_sock *po, void *frame)
254 {
255         union {
256                 struct tpacket_hdr *h1;
257                 struct tpacket2_hdr *h2;
258                 void *raw;
259         } h;
260
261         smp_rmb();
262
263         h.raw = frame;
264         switch (po->tp_version) {
265         case TPACKET_V1:
266                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
267                 return h.h1->tp_status;
268         case TPACKET_V2:
269                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
270                 return h.h2->tp_status;
271         default:
272                 pr_err("TPACKET version not supported\n");
273                 BUG();
274                 return 0;
275         }
276 }
277
278 static void *packet_lookup_frame(struct packet_sock *po,
279                 struct packet_ring_buffer *rb,
280                 unsigned int position,
281                 int status)
282 {
283         unsigned int pg_vec_pos, frame_offset;
284         union {
285                 struct tpacket_hdr *h1;
286                 struct tpacket2_hdr *h2;
287                 void *raw;
288         } h;
289
290         pg_vec_pos = position / rb->frames_per_block;
291         frame_offset = position % rb->frames_per_block;
292
293         h.raw = rb->pg_vec[pg_vec_pos].buffer +
294                 (frame_offset * rb->frame_size);
295
296         if (status != __packet_get_status(po, h.raw))
297                 return NULL;
298
299         return h.raw;
300 }
301
302 static inline void *packet_current_frame(struct packet_sock *po,
303                 struct packet_ring_buffer *rb,
304                 int status)
305 {
306         return packet_lookup_frame(po, rb, rb->head, status);
307 }
308
309 static inline void *packet_previous_frame(struct packet_sock *po,
310                 struct packet_ring_buffer *rb,
311                 int status)
312 {
313         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
314         return packet_lookup_frame(po, rb, previous, status);
315 }
316
317 static inline void packet_increment_head(struct packet_ring_buffer *buff)
318 {
319         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
320 }
321
/*
 * Convert a struct sock pointer into a struct packet_sock pointer by a
 * plain cast; the address is returned unchanged.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
326
327 static void packet_sock_destruct(struct sock *sk)
328 {
329         skb_queue_purge(&sk->sk_error_queue);
330
331         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
332         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
333
334         if (!sock_flag(sk, SOCK_DEAD)) {
335                 pr_err("Attempt to release alive packet socket: %p\n", sk);
336                 return;
337         }
338
339         sk_refcnt_debug_dec(sk);
340 }
341
342
343 static const struct proto_ops packet_ops;
344
345 static const struct proto_ops packet_ops_spkt;
346
347 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
348                            struct packet_type *pt, struct net_device *orig_dev)
349 {
350         struct sock *sk;
351         struct sockaddr_pkt *spkt;
352
353         /*
354          *      When we registered the protocol we saved the socket in the data
355          *      field for just this event.
356          */
357
358         sk = pt->af_packet_priv;
359
360         /*
361          *      Yank back the headers [hope the device set this
362          *      right or kerboom...]
363          *
364          *      Incoming packets have ll header pulled,
365          *      push it back.
366          *
367          *      For outgoing ones skb->data == skb_mac_header(skb)
368          *      so that this procedure is noop.
369          */
370
371         if (skb->pkt_type == PACKET_LOOPBACK)
372                 goto out;
373
374         if (!net_eq(dev_net(dev), sock_net(sk)))
375                 goto out;
376
377         skb = skb_share_check(skb, GFP_ATOMIC);
378         if (skb == NULL)
379                 goto oom;
380
381         /* drop any routing info */
382         skb_dst_drop(skb);
383
384         /* drop conntrack reference */
385         nf_reset(skb);
386
387         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
388
389         skb_push(skb, skb->data - skb_mac_header(skb));
390
391         /*
392          *      The SOCK_PACKET socket receives _all_ frames.
393          */
394
395         spkt->spkt_family = dev->type;
396         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
397         spkt->spkt_protocol = skb->protocol;
398
399         /*
400          *      Charge the memory to the socket. This is done specifically
401          *      to prevent sockets using all the memory up.
402          */
403
404         if (sock_queue_rcv_skb(sk, skb) == 0)
405                 return 0;
406
407 out:
408         kfree_skb(skb);
409 oom:
410         return 0;
411 }
412
413
414 /*
415  *      Output a raw packet to a device layer. This bypasses all the other
416  *      protocol layers and you must therefore supply it with a complete frame
417  */
418
419 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
420                                struct msghdr *msg, size_t len)
421 {
422         struct sock *sk = sock->sk;
423         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
424         struct sk_buff *skb = NULL;
425         struct net_device *dev;
426         __be16 proto = 0;
427         int err;
428
429         /*
430          *      Get and verify the address.
431          */
432
433         if (saddr) {
434                 if (msg->msg_namelen < sizeof(struct sockaddr))
435                         return -EINVAL;
436                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
437                         proto = saddr->spkt_protocol;
438         } else
439                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
440
441         /*
442          *      Find the device first to size check it
443          */
444
445         saddr->spkt_device[13] = 0;
446 retry:
447         rcu_read_lock();
448         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
449         err = -ENODEV;
450         if (dev == NULL)
451                 goto out_unlock;
452
453         err = -ENETDOWN;
454         if (!(dev->flags & IFF_UP))
455                 goto out_unlock;
456
457         /*
458          * You may not queue a frame bigger than the mtu. This is the lowest level
459          * raw protocol and you must do your own fragmentation at this level.
460          */
461
462         err = -EMSGSIZE;
463         if (len > dev->mtu + dev->hard_header_len)
464                 goto out_unlock;
465
466         if (!skb) {
467                 size_t reserved = LL_RESERVED_SPACE(dev);
468                 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
469
470                 rcu_read_unlock();
471                 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
472                 if (skb == NULL)
473                         return -ENOBUFS;
474                 /* FIXME: Save some space for broken drivers that write a hard
475                  * header at transmission time by themselves. PPP is the notable
476                  * one here. This should really be fixed at the driver level.
477                  */
478                 skb_reserve(skb, reserved);
479                 skb_reset_network_header(skb);
480
481                 /* Try to align data part correctly */
482                 if (hhlen) {
483                         skb->data -= hhlen;
484                         skb->tail -= hhlen;
485                         if (len < hhlen)
486                                 skb_reset_network_header(skb);
487                 }
488                 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
489                 if (err)
490                         goto out_free;
491                 goto retry;
492         }
493
494
495         skb->protocol = proto;
496         skb->dev = dev;
497         skb->priority = sk->sk_priority;
498         skb->mark = sk->sk_mark;
499         err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
500         if (err < 0)
501                 goto out_unlock;
502
503         dev_queue_xmit(skb);
504         rcu_read_unlock();
505         return len;
506
507 out_unlock:
508         rcu_read_unlock();
509 out_free:
510         kfree_skb(skb);
511         return err;
512 }
513
514 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
515                                       unsigned int res)
516 {
517         struct sk_filter *filter;
518
519         rcu_read_lock_bh();
520         filter = rcu_dereference_bh(sk->sk_filter);
521         if (filter != NULL)
522                 res = sk_run_filter(skb, filter->insns, filter->len);
523         rcu_read_unlock_bh();
524
525         return res;
526 }
527
528 /*
529    This function makes lazy skb cloning in hope that most of packets
530    are discarded by BPF.
531
532    Note tricky part: we DO mangle shared skb! skb->data, skb->len
533    and skb->cb are mangled. It works because (and until) packets
534    falling here are owned by current CPU. Output packets are cloned
535    by dev_queue_xmit_nit(), input packets are processed by net_bh
536    sequencially, so that if we return skb to original state on exit,
537    we will not harm anyone.
538  */
539
540 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
541                       struct packet_type *pt, struct net_device *orig_dev)
542 {
543         struct sock *sk;
544         struct sockaddr_ll *sll;
545         struct packet_sock *po;
546         u8 *skb_head = skb->data;
547         int skb_len = skb->len;
548         unsigned int snaplen, res;
549
550         if (skb->pkt_type == PACKET_LOOPBACK)
551                 goto drop;
552
553         sk = pt->af_packet_priv;
554         po = pkt_sk(sk);
555
556         if (!net_eq(dev_net(dev), sock_net(sk)))
557                 goto drop;
558
559         skb->dev = dev;
560
561         if (dev->header_ops) {
562                 /* The device has an explicit notion of ll header,
563                    exported to higher levels.
564
565                    Otherwise, the device hides datails of it frame
566                    structure, so that corresponding packet head
567                    never delivered to user.
568                  */
569                 if (sk->sk_type != SOCK_DGRAM)
570                         skb_push(skb, skb->data - skb_mac_header(skb));
571                 else if (skb->pkt_type == PACKET_OUTGOING) {
572                         /* Special case: outgoing packets have ll header at head */
573                         skb_pull(skb, skb_network_offset(skb));
574                 }
575         }
576
577         snaplen = skb->len;
578
579         res = run_filter(skb, sk, snaplen);
580         if (!res)
581                 goto drop_n_restore;
582         if (snaplen > res)
583                 snaplen = res;
584
585         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
586             (unsigned)sk->sk_rcvbuf)
587                 goto drop_n_acct;
588
589         if (skb_shared(skb)) {
590                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
591                 if (nskb == NULL)
592                         goto drop_n_acct;
593
594                 if (skb_head != skb->data) {
595                         skb->data = skb_head;
596                         skb->len = skb_len;
597                 }
598                 kfree_skb(skb);
599                 skb = nskb;
600         }
601
602         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
603                      sizeof(skb->cb));
604
605         sll = &PACKET_SKB_CB(skb)->sa.ll;
606         sll->sll_family = AF_PACKET;
607         sll->sll_hatype = dev->type;
608         sll->sll_protocol = skb->protocol;
609         sll->sll_pkttype = skb->pkt_type;
610         if (unlikely(po->origdev))
611                 sll->sll_ifindex = orig_dev->ifindex;
612         else
613                 sll->sll_ifindex = dev->ifindex;
614
615         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
616
617         PACKET_SKB_CB(skb)->origlen = skb->len;
618
619         if (pskb_trim(skb, snaplen))
620                 goto drop_n_acct;
621
622         skb_set_owner_r(skb, sk);
623         skb->dev = NULL;
624         skb_dst_drop(skb);
625
626         /* drop conntrack reference */
627         nf_reset(skb);
628
629         spin_lock(&sk->sk_receive_queue.lock);
630         po->stats.tp_packets++;
631         skb->dropcount = atomic_read(&sk->sk_drops);
632         __skb_queue_tail(&sk->sk_receive_queue, skb);
633         spin_unlock(&sk->sk_receive_queue.lock);
634         sk->sk_data_ready(sk, skb->len);
635         return 0;
636
637 drop_n_acct:
638         po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
639
640 drop_n_restore:
641         if (skb_head != skb->data && skb_shared(skb)) {
642                 skb->data = skb_head;
643                 skb->len = skb_len;
644         }
645 drop:
646         consume_skb(skb);
647         return 0;
648 }
649
650 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
651                        struct packet_type *pt, struct net_device *orig_dev)
652 {
653         struct sock *sk;
654         struct packet_sock *po;
655         struct sockaddr_ll *sll;
656         union {
657                 struct tpacket_hdr *h1;
658                 struct tpacket2_hdr *h2;
659                 void *raw;
660         } h;
661         u8 *skb_head = skb->data;
662         int skb_len = skb->len;
663         unsigned int snaplen, res;
664         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
665         unsigned short macoff, netoff, hdrlen;
666         struct sk_buff *copy_skb = NULL;
667         struct timeval tv;
668         struct timespec ts;
669         struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
670
671         if (skb->pkt_type == PACKET_LOOPBACK)
672                 goto drop;
673
674         sk = pt->af_packet_priv;
675         po = pkt_sk(sk);
676
677         if (!net_eq(dev_net(dev), sock_net(sk)))
678                 goto drop;
679
680         if (dev->header_ops) {
681                 if (sk->sk_type != SOCK_DGRAM)
682                         skb_push(skb, skb->data - skb_mac_header(skb));
683                 else if (skb->pkt_type == PACKET_OUTGOING) {
684                         /* Special case: outgoing packets have ll header at head */
685                         skb_pull(skb, skb_network_offset(skb));
686                 }
687         }
688
689         if (skb->ip_summed == CHECKSUM_PARTIAL)
690                 status |= TP_STATUS_CSUMNOTREADY;
691
692         snaplen = skb->len;
693
694         res = run_filter(skb, sk, snaplen);
695         if (!res)
696                 goto drop_n_restore;
697         if (snaplen > res)
698                 snaplen = res;
699
700         if (sk->sk_type == SOCK_DGRAM) {
701                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
702                                   po->tp_reserve;
703         } else {
704                 unsigned maclen = skb_network_offset(skb);
705                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
706                                        (maclen < 16 ? 16 : maclen)) +
707                         po->tp_reserve;
708                 macoff = netoff - maclen;
709         }
710
711         if (macoff + snaplen > po->rx_ring.frame_size) {
712                 if (po->copy_thresh &&
713                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
714                     (unsigned)sk->sk_rcvbuf) {
715                         if (skb_shared(skb)) {
716                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
717                         } else {
718                                 copy_skb = skb_get(skb);
719                                 skb_head = skb->data;
720                         }
721                         if (copy_skb)
722                                 skb_set_owner_r(copy_skb, sk);
723                 }
724                 snaplen = po->rx_ring.frame_size - macoff;
725                 if ((int)snaplen < 0)
726                         snaplen = 0;
727         }
728
729         spin_lock(&sk->sk_receive_queue.lock);
730         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
731         if (!h.raw)
732                 goto ring_is_full;
733         packet_increment_head(&po->rx_ring);
734         po->stats.tp_packets++;
735         if (copy_skb) {
736                 status |= TP_STATUS_COPY;
737                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
738         }
739         if (!po->stats.tp_drops)
740                 status &= ~TP_STATUS_LOSING;
741         spin_unlock(&sk->sk_receive_queue.lock);
742
743         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
744
745         switch (po->tp_version) {
746         case TPACKET_V1:
747                 h.h1->tp_len = skb->len;
748                 h.h1->tp_snaplen = snaplen;
749                 h.h1->tp_mac = macoff;
750                 h.h1->tp_net = netoff;
751                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
752                                 && shhwtstamps->syststamp.tv64)
753                         tv = ktime_to_timeval(shhwtstamps->syststamp);
754                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
755                                 && shhwtstamps->hwtstamp.tv64)
756                         tv = ktime_to_timeval(shhwtstamps->hwtstamp);
757                 else if (skb->tstamp.tv64)
758                         tv = ktime_to_timeval(skb->tstamp);
759                 else
760                         do_gettimeofday(&tv);
761                 h.h1->tp_sec = tv.tv_sec;
762                 h.h1->tp_usec = tv.tv_usec;
763                 hdrlen = sizeof(*h.h1);
764                 break;
765         case TPACKET_V2:
766                 h.h2->tp_len = skb->len;
767                 h.h2->tp_snaplen = snaplen;
768                 h.h2->tp_mac = macoff;
769                 h.h2->tp_net = netoff;
770                 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
771                                 && shhwtstamps->syststamp.tv64)
772                         ts = ktime_to_timespec(shhwtstamps->syststamp);
773                 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
774                                 && shhwtstamps->hwtstamp.tv64)
775                         ts = ktime_to_timespec(shhwtstamps->hwtstamp);
776                 else if (skb->tstamp.tv64)
777                         ts = ktime_to_timespec(skb->tstamp);
778                 else
779                         getnstimeofday(&ts);
780                 h.h2->tp_sec = ts.tv_sec;
781                 h.h2->tp_nsec = ts.tv_nsec;
782                 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
783                 hdrlen = sizeof(*h.h2);
784                 break;
785         default:
786                 BUG();
787         }
788
789         sll = h.raw + TPACKET_ALIGN(hdrlen);
790         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
791         sll->sll_family = AF_PACKET;
792         sll->sll_hatype = dev->type;
793         sll->sll_protocol = skb->protocol;
794         sll->sll_pkttype = skb->pkt_type;
795         if (unlikely(po->origdev))
796                 sll->sll_ifindex = orig_dev->ifindex;
797         else
798                 sll->sll_ifindex = dev->ifindex;
799
800         __packet_set_status(po, h.raw, status);
801         smp_mb();
802         {
803                 struct page *p_start, *p_end;
804                 u8 *h_end = h.raw + macoff + snaplen - 1;
805
806                 p_start = virt_to_page(h.raw);
807                 p_end = virt_to_page(h_end);
808                 while (p_start <= p_end) {
809                         flush_dcache_page(p_start);
810                         p_start++;
811                 }
812         }
813
814         sk->sk_data_ready(sk, 0);
815
816 drop_n_restore:
817         if (skb_head != skb->data && skb_shared(skb)) {
818                 skb->data = skb_head;
819                 skb->len = skb_len;
820         }
821 drop:
822         kfree_skb(skb);
823         return 0;
824
825 ring_is_full:
826         po->stats.tp_drops++;
827         spin_unlock(&sk->sk_receive_queue.lock);
828
829         sk->sk_data_ready(sk, 0);
830         kfree_skb(copy_skb);
831         goto drop_n_restore;
832 }
833
834 static void tpacket_destruct_skb(struct sk_buff *skb)
835 {
836         struct packet_sock *po = pkt_sk(skb->sk);
837         void *ph;
838
839         BUG_ON(skb == NULL);
840
841         if (likely(po->tx_ring.pg_vec)) {
842                 ph = skb_shinfo(skb)->destructor_arg;
843                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
844                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
845                 atomic_dec(&po->tx_ring.pending);
846                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
847         }
848
849         sock_wfree(skb);
850 }
851
852 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
853                 void *frame, struct net_device *dev, int size_max,
854                 __be16 proto, unsigned char *addr)
855 {
856         union {
857                 struct tpacket_hdr *h1;
858                 struct tpacket2_hdr *h2;
859                 void *raw;
860         } ph;
861         int to_write, offset, len, tp_len, nr_frags, len_max;
862         struct socket *sock = po->sk.sk_socket;
863         struct page *page;
864         void *data;
865         int err;
866
867         ph.raw = frame;
868
869         skb->protocol = proto;
870         skb->dev = dev;
871         skb->priority = po->sk.sk_priority;
872         skb->mark = po->sk.sk_mark;
873         skb_shinfo(skb)->destructor_arg = ph.raw;
874
875         switch (po->tp_version) {
876         case TPACKET_V2:
877                 tp_len = ph.h2->tp_len;
878                 break;
879         default:
880                 tp_len = ph.h1->tp_len;
881                 break;
882         }
883         if (unlikely(tp_len > size_max)) {
884                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
885                 return -EMSGSIZE;
886         }
887
888         skb_reserve(skb, LL_RESERVED_SPACE(dev));
889         skb_reset_network_header(skb);
890
891         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
892         to_write = tp_len;
893
894         if (sock->type == SOCK_DGRAM) {
895                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
896                                 NULL, tp_len);
897                 if (unlikely(err < 0))
898                         return -EINVAL;
899         } else if (dev->hard_header_len) {
900                 /* net device doesn't like empty head */
901                 if (unlikely(tp_len <= dev->hard_header_len)) {
902                         pr_err("packet size is too short (%d < %d)\n",
903                                tp_len, dev->hard_header_len);
904                         return -EINVAL;
905                 }
906
907                 skb_push(skb, dev->hard_header_len);
908                 err = skb_store_bits(skb, 0, data,
909                                 dev->hard_header_len);
910                 if (unlikely(err))
911                         return err;
912
913                 data += dev->hard_header_len;
914                 to_write -= dev->hard_header_len;
915         }
916
917         err = -EFAULT;
918         page = virt_to_page(data);
919         offset = offset_in_page(data);
920         len_max = PAGE_SIZE - offset;
921         len = ((to_write > len_max) ? len_max : to_write);
922
923         skb->data_len = to_write;
924         skb->len += to_write;
925         skb->truesize += to_write;
926         atomic_add(to_write, &po->sk.sk_wmem_alloc);
927
928         while (likely(to_write)) {
929                 nr_frags = skb_shinfo(skb)->nr_frags;
930
931                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
932                         pr_err("Packet exceed the number of skb frags(%lu)\n",
933                                MAX_SKB_FRAGS);
934                         return -EFAULT;
935                 }
936
937                 flush_dcache_page(page);
938                 get_page(page);
939                 skb_fill_page_desc(skb,
940                                 nr_frags,
941                                 page++, offset, len);
942                 to_write -= len;
943                 offset = 0;
944                 len_max = PAGE_SIZE;
945                 len = ((to_write > len_max) ? len_max : to_write);
946         }
947
948         return tp_len;
949 }
950
951 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
952 {
953         struct socket *sock;
954         struct sk_buff *skb;
955         struct net_device *dev;
956         __be16 proto;
957         int ifindex, err, reserve = 0;
958         void *ph;
959         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
960         int tp_len, size_max;
961         unsigned char *addr;
962         int len_sum = 0;
963         int status = 0;
964
965         sock = po->sk.sk_socket;
966
967         mutex_lock(&po->pg_vec_lock);
968
969         err = -EBUSY;
970         if (saddr == NULL) {
971                 ifindex = po->ifindex;
972                 proto   = po->num;
973                 addr    = NULL;
974         } else {
975                 err = -EINVAL;
976                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
977                         goto out;
978                 if (msg->msg_namelen < (saddr->sll_halen
979                                         + offsetof(struct sockaddr_ll,
980                                                 sll_addr)))
981                         goto out;
982                 ifindex = saddr->sll_ifindex;
983                 proto   = saddr->sll_protocol;
984                 addr    = saddr->sll_addr;
985         }
986
987         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
988         err = -ENXIO;
989         if (unlikely(dev == NULL))
990                 goto out;
991
992         reserve = dev->hard_header_len;
993
994         err = -ENETDOWN;
995         if (unlikely(!(dev->flags & IFF_UP)))
996                 goto out_put;
997
998         size_max = po->tx_ring.frame_size
999                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
1000
1001         if (size_max > dev->mtu + reserve)
1002                 size_max = dev->mtu + reserve;
1003
1004         do {
1005                 ph = packet_current_frame(po, &po->tx_ring,
1006                                 TP_STATUS_SEND_REQUEST);
1007
1008                 if (unlikely(ph == NULL)) {
1009                         schedule();
1010                         continue;
1011                 }
1012
1013                 status = TP_STATUS_SEND_REQUEST;
1014                 skb = sock_alloc_send_skb(&po->sk,
1015                                 LL_ALLOCATED_SPACE(dev)
1016                                 + sizeof(struct sockaddr_ll),
1017                                 0, &err);
1018
1019                 if (unlikely(skb == NULL))
1020                         goto out_status;
1021
1022                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1023                                 addr);
1024
1025                 if (unlikely(tp_len < 0)) {
1026                         if (po->tp_loss) {
1027                                 __packet_set_status(po, ph,
1028                                                 TP_STATUS_AVAILABLE);
1029                                 packet_increment_head(&po->tx_ring);
1030                                 kfree_skb(skb);
1031                                 continue;
1032                         } else {
1033                                 status = TP_STATUS_WRONG_FORMAT;
1034                                 err = tp_len;
1035                                 goto out_status;
1036                         }
1037                 }
1038
1039                 skb->destructor = tpacket_destruct_skb;
1040                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1041                 atomic_inc(&po->tx_ring.pending);
1042
1043                 status = TP_STATUS_SEND_REQUEST;
1044                 err = dev_queue_xmit(skb);
1045                 if (unlikely(err > 0)) {
1046                         err = net_xmit_errno(err);
1047                         if (err && __packet_get_status(po, ph) ==
1048                                    TP_STATUS_AVAILABLE) {
1049                                 /* skb was destructed already */
1050                                 skb = NULL;
1051                                 goto out_status;
1052                         }
1053                         /*
1054                          * skb was dropped but not destructed yet;
1055                          * let's treat it like congestion or err < 0
1056                          */
1057                         err = 0;
1058                 }
1059                 packet_increment_head(&po->tx_ring);
1060                 len_sum += tp_len;
1061         } while (likely((ph != NULL) ||
1062                         ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1063                          (atomic_read(&po->tx_ring.pending))))
1064                 );
1065
1066         err = len_sum;
1067         goto out_put;
1068
1069 out_status:
1070         __packet_set_status(po, ph, status);
1071         kfree_skb(skb);
1072 out_put:
1073         dev_put(dev);
1074 out:
1075         mutex_unlock(&po->pg_vec_lock);
1076         return err;
1077 }
1078
1079 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1080                                                size_t reserve, size_t len,
1081                                                size_t linear, int noblock,
1082                                                int *err)
1083 {
1084         struct sk_buff *skb;
1085
1086         /* Under a page?  Don't bother with paged skb. */
1087         if (prepad + len < PAGE_SIZE || !linear)
1088                 linear = len;
1089
1090         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1091                                    err);
1092         if (!skb)
1093                 return NULL;
1094
1095         skb_reserve(skb, reserve);
1096         skb_put(skb, linear);
1097         skb->data_len = len - linear;
1098         skb->len += len - linear;
1099
1100         return skb;
1101 }
1102
1103 static int packet_snd(struct socket *sock,
1104                           struct msghdr *msg, size_t len)
1105 {
1106         struct sock *sk = sock->sk;
1107         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1108         struct sk_buff *skb;
1109         struct net_device *dev;
1110         __be16 proto;
1111         unsigned char *addr;
1112         int ifindex, err, reserve = 0;
1113         struct virtio_net_hdr vnet_hdr = { 0 };
1114         int offset = 0;
1115         int vnet_hdr_len;
1116         struct packet_sock *po = pkt_sk(sk);
1117         unsigned short gso_type = 0;
1118
1119         /*
1120          *      Get and verify the address.
1121          */
1122
1123         if (saddr == NULL) {
1124                 ifindex = po->ifindex;
1125                 proto   = po->num;
1126                 addr    = NULL;
1127         } else {
1128                 err = -EINVAL;
1129                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1130                         goto out;
1131                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1132                         goto out;
1133                 ifindex = saddr->sll_ifindex;
1134                 proto   = saddr->sll_protocol;
1135                 addr    = saddr->sll_addr;
1136         }
1137
1138
1139         dev = dev_get_by_index(sock_net(sk), ifindex);
1140         err = -ENXIO;
1141         if (dev == NULL)
1142                 goto out_unlock;
1143         if (sock->type == SOCK_RAW)
1144                 reserve = dev->hard_header_len;
1145
1146         err = -ENETDOWN;
1147         if (!(dev->flags & IFF_UP))
1148                 goto out_unlock;
1149
1150         if (po->has_vnet_hdr) {
1151                 vnet_hdr_len = sizeof(vnet_hdr);
1152
1153                 err = -EINVAL;
1154                 if (len < vnet_hdr_len)
1155                         goto out_unlock;
1156
1157                 len -= vnet_hdr_len;
1158
1159                 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1160                                        vnet_hdr_len);
1161                 if (err < 0)
1162                         goto out_unlock;
1163
1164                 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1165                     (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1166                       vnet_hdr.hdr_len))
1167                         vnet_hdr.hdr_len = vnet_hdr.csum_start +
1168                                                  vnet_hdr.csum_offset + 2;
1169
1170                 err = -EINVAL;
1171                 if (vnet_hdr.hdr_len > len)
1172                         goto out_unlock;
1173
1174                 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1175                         switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1176                         case VIRTIO_NET_HDR_GSO_TCPV4:
1177                                 gso_type = SKB_GSO_TCPV4;
1178                                 break;
1179                         case VIRTIO_NET_HDR_GSO_TCPV6:
1180                                 gso_type = SKB_GSO_TCPV6;
1181                                 break;
1182                         case VIRTIO_NET_HDR_GSO_UDP:
1183                                 gso_type = SKB_GSO_UDP;
1184                                 break;
1185                         default:
1186                                 goto out_unlock;
1187                         }
1188
1189                         if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1190                                 gso_type |= SKB_GSO_TCP_ECN;
1191
1192                         if (vnet_hdr.gso_size == 0)
1193                                 goto out_unlock;
1194
1195                 }
1196         }
1197
1198         err = -EMSGSIZE;
1199         if (!gso_type && (len > dev->mtu+reserve))
1200                 goto out_unlock;
1201
1202         err = -ENOBUFS;
1203         skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1204                                LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1205                                msg->msg_flags & MSG_DONTWAIT, &err);
1206         if (skb == NULL)
1207                 goto out_unlock;
1208
1209         skb_set_network_header(skb, reserve);
1210
1211         err = -EINVAL;
1212         if (sock->type == SOCK_DGRAM &&
1213             (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1214                 goto out_free;
1215
1216         /* Returns -EFAULT on error */
1217         err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1218         if (err)
1219                 goto out_free;
1220         err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
1221         if (err < 0)
1222                 goto out_free;
1223
1224         skb->protocol = proto;
1225         skb->dev = dev;
1226         skb->priority = sk->sk_priority;
1227         skb->mark = sk->sk_mark;
1228
1229         if (po->has_vnet_hdr) {
1230                 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1231                         if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1232                                                   vnet_hdr.csum_offset)) {
1233                                 err = -EINVAL;
1234                                 goto out_free;
1235                         }
1236                 }
1237
1238                 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1239                 skb_shinfo(skb)->gso_type = gso_type;
1240
1241                 /* Header must be checked, and gso_segs computed. */
1242                 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1243                 skb_shinfo(skb)->gso_segs = 0;
1244
1245                 len += vnet_hdr_len;
1246         }
1247
1248         /*
1249          *      Now send it
1250          */
1251
1252         err = dev_queue_xmit(skb);
1253         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1254                 goto out_unlock;
1255
1256         dev_put(dev);
1257
1258         return len;
1259
1260 out_free:
1261         kfree_skb(skb);
1262 out_unlock:
1263         if (dev)
1264                 dev_put(dev);
1265 out:
1266         return err;
1267 }
1268
1269 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1270                 struct msghdr *msg, size_t len)
1271 {
1272         struct sock *sk = sock->sk;
1273         struct packet_sock *po = pkt_sk(sk);
1274         if (po->tx_ring.pg_vec)
1275                 return tpacket_snd(po, msg);
1276         else
1277                 return packet_snd(sock, msg, len);
1278 }
1279
1280 /*
1281  *      Close a PACKET socket. This is fairly simple. We immediately go
1282  *      to 'closed' state and remove our protocol entry in the device list.
1283  */
1284
1285 static int packet_release(struct socket *sock)
1286 {
1287         struct sock *sk = sock->sk;
1288         struct packet_sock *po;
1289         struct net *net;
1290         struct tpacket_req req;
1291
1292         if (!sk)
1293                 return 0;
1294
1295         net = sock_net(sk);
1296         po = pkt_sk(sk);
1297
1298         spin_lock_bh(&net->packet.sklist_lock);
1299         sk_del_node_init_rcu(sk);
1300         sock_prot_inuse_add(net, sk->sk_prot, -1);
1301         spin_unlock_bh(&net->packet.sklist_lock);
1302
1303         spin_lock(&po->bind_lock);
1304         if (po->running) {
1305                 /*
1306                  * Remove from protocol table
1307                  */
1308                 po->running = 0;
1309                 po->num = 0;
1310                 __dev_remove_pack(&po->prot_hook);
1311                 __sock_put(sk);
1312         }
1313         spin_unlock(&po->bind_lock);
1314
1315         packet_flush_mclist(sk);
1316
1317         memset(&req, 0, sizeof(req));
1318
1319         if (po->rx_ring.pg_vec)
1320                 packet_set_ring(sk, &req, 1, 0);
1321
1322         if (po->tx_ring.pg_vec)
1323                 packet_set_ring(sk, &req, 1, 1);
1324
1325         synchronize_net();
1326         /*
1327          *      Now the socket is dead. No more input will appear.
1328          */
1329         sock_orphan(sk);
1330         sock->sk = NULL;
1331
1332         /* Purge queues */
1333
1334         skb_queue_purge(&sk->sk_receive_queue);
1335         sk_refcnt_debug_release(sk);
1336
1337         sock_put(sk);
1338         return 0;
1339 }
1340
1341 /*
1342  *      Attach a packet hook.
1343  */
1344
1345 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1346 {
1347         struct packet_sock *po = pkt_sk(sk);
1348         /*
1349          *      Detach an existing hook if present.
1350          */
1351
1352         lock_sock(sk);
1353
1354         spin_lock(&po->bind_lock);
1355         if (po->running) {
1356                 __sock_put(sk);
1357                 po->running = 0;
1358                 po->num = 0;
1359                 spin_unlock(&po->bind_lock);
1360                 dev_remove_pack(&po->prot_hook);
1361                 spin_lock(&po->bind_lock);
1362         }
1363
1364         po->num = protocol;
1365         po->prot_hook.type = protocol;
1366         po->prot_hook.dev = dev;
1367
1368         po->ifindex = dev ? dev->ifindex : 0;
1369
1370         if (protocol == 0)
1371                 goto out_unlock;
1372
1373         if (!dev || (dev->flags & IFF_UP)) {
1374                 dev_add_pack(&po->prot_hook);
1375                 sock_hold(sk);
1376                 po->running = 1;
1377         } else {
1378                 sk->sk_err = ENETDOWN;
1379                 if (!sock_flag(sk, SOCK_DEAD))
1380                         sk->sk_error_report(sk);
1381         }
1382
1383 out_unlock:
1384         spin_unlock(&po->bind_lock);
1385         release_sock(sk);
1386         return 0;
1387 }
1388
1389 /*
1390  *      Bind a packet socket to a device
1391  */
1392
1393 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1394                             int addr_len)
1395 {
1396         struct sock *sk = sock->sk;
1397         char name[15];
1398         struct net_device *dev;
1399         int err = -ENODEV;
1400
1401         /*
1402          *      Check legality
1403          */
1404
1405         if (addr_len != sizeof(struct sockaddr))
1406                 return -EINVAL;
1407         strlcpy(name, uaddr->sa_data, sizeof(name));
1408
1409         dev = dev_get_by_name(sock_net(sk), name);
1410         if (dev) {
1411                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1412                 dev_put(dev);
1413         }
1414         return err;
1415 }
1416
1417 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1418 {
1419         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1420         struct sock *sk = sock->sk;
1421         struct net_device *dev = NULL;
1422         int err;
1423
1424
1425         /*
1426          *      Check legality
1427          */
1428
1429         if (addr_len < sizeof(struct sockaddr_ll))
1430                 return -EINVAL;
1431         if (sll->sll_family != AF_PACKET)
1432                 return -EINVAL;
1433
1434         if (sll->sll_ifindex) {
1435                 err = -ENODEV;
1436                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1437                 if (dev == NULL)
1438                         goto out;
1439         }
1440         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1441         if (dev)
1442                 dev_put(dev);
1443
1444 out:
1445         return err;
1446 }
1447
1448 static struct proto packet_proto = {
1449         .name     = "PACKET",
1450         .owner    = THIS_MODULE,
1451         .obj_size = sizeof(struct packet_sock),
1452 };
1453
1454 /*
1455  *      Create a packet of type SOCK_PACKET.
1456  */
1457
1458 static int packet_create(struct net *net, struct socket *sock, int protocol,
1459                          int kern)
1460 {
1461         struct sock *sk;
1462         struct packet_sock *po;
1463         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1464         int err;
1465
1466         if (!capable(CAP_NET_RAW))
1467                 return -EPERM;
1468         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1469             sock->type != SOCK_PACKET)
1470                 return -ESOCKTNOSUPPORT;
1471
1472         sock->state = SS_UNCONNECTED;
1473
1474         err = -ENOBUFS;
1475         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1476         if (sk == NULL)
1477                 goto out;
1478
1479         sock->ops = &packet_ops;
1480         if (sock->type == SOCK_PACKET)
1481                 sock->ops = &packet_ops_spkt;
1482
1483         sock_init_data(sock, sk);
1484
1485         po = pkt_sk(sk);
1486         sk->sk_family = PF_PACKET;
1487         po->num = proto;
1488
1489         sk->sk_destruct = packet_sock_destruct;
1490         sk_refcnt_debug_inc(sk);
1491
1492         /*
1493          *      Attach a protocol block
1494          */
1495
1496         spin_lock_init(&po->bind_lock);
1497         mutex_init(&po->pg_vec_lock);
1498         po->prot_hook.func = packet_rcv;
1499
1500         if (sock->type == SOCK_PACKET)
1501                 po->prot_hook.func = packet_rcv_spkt;
1502
1503         po->prot_hook.af_packet_priv = sk;
1504
1505         if (proto) {
1506                 po->prot_hook.type = proto;
1507                 dev_add_pack(&po->prot_hook);
1508                 sock_hold(sk);
1509                 po->running = 1;
1510         }
1511
1512         spin_lock_bh(&net->packet.sklist_lock);
1513         sk_add_node_rcu(sk, &net->packet.sklist);
1514         sock_prot_inuse_add(net, &packet_proto, 1);
1515         spin_unlock_bh(&net->packet.sklist_lock);
1516
1517         return 0;
1518 out:
1519         return err;
1520 }
1521
1522 static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
1523 {
1524         struct sock_exterr_skb *serr;
1525         struct sk_buff *skb, *skb2;
1526         int copied, err;
1527
1528         err = -EAGAIN;
1529         skb = skb_dequeue(&sk->sk_error_queue);
1530         if (skb == NULL)
1531                 goto out;
1532
1533         copied = skb->len;
1534         if (copied > len) {
1535                 msg->msg_flags |= MSG_TRUNC;
1536                 copied = len;
1537         }
1538         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1539         if (err)
1540                 goto out_free_skb;
1541
1542         sock_recv_timestamp(msg, sk, skb);
1543
1544         serr = SKB_EXT_ERR(skb);
1545         put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
1546                  sizeof(serr->ee), &serr->ee);
1547
1548         msg->msg_flags |= MSG_ERRQUEUE;
1549         err = copied;
1550
1551         /* Reset and regenerate socket error */
1552         spin_lock_bh(&sk->sk_error_queue.lock);
1553         sk->sk_err = 0;
1554         if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
1555                 sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
1556                 spin_unlock_bh(&sk->sk_error_queue.lock);
1557                 sk->sk_error_report(sk);
1558         } else
1559                 spin_unlock_bh(&sk->sk_error_queue.lock);
1560
1561 out_free_skb:
1562         kfree_skb(skb);
1563 out:
1564         return err;
1565 }
1566
1567 /*
1568  *      Pull a packet from our receive queue and hand it to the user.
1569  *      If necessary we block.
1570  */
1571
1572 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1573                           struct msghdr *msg, size_t len, int flags)
1574 {
1575         struct sock *sk = sock->sk;
1576         struct sk_buff *skb;
1577         int copied, err;
1578         struct sockaddr_ll *sll;
1579         int vnet_hdr_len = 0;
1580
1581         err = -EINVAL;
1582         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1583                 goto out;
1584
1585 #if 0
1586         /* What error should we return now? EUNATTACH? */
1587         if (pkt_sk(sk)->ifindex < 0)
1588                 return -ENODEV;
1589 #endif
1590
1591         if (flags & MSG_ERRQUEUE) {
1592                 err = packet_recv_error(sk, msg, len);
1593                 goto out;
1594         }
1595
1596         /*
1597          *      Call the generic datagram receiver. This handles all sorts
1598          *      of horrible races and re-entrancy so we can forget about it
1599          *      in the protocol layers.
1600          *
1601          *      Now it will return ENETDOWN, if device have just gone down,
1602          *      but then it will block.
1603          */
1604
1605         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1606
1607         /*
1608          *      An error occurred so return it. Because skb_recv_datagram()
1609          *      handles the blocking we don't see and worry about blocking
1610          *      retries.
1611          */
1612
1613         if (skb == NULL)
1614                 goto out;
1615
1616         if (pkt_sk(sk)->has_vnet_hdr) {
1617                 struct virtio_net_hdr vnet_hdr = { 0 };
1618
1619                 err = -EINVAL;
1620                 vnet_hdr_len = sizeof(vnet_hdr);
1621                 if (len < vnet_hdr_len)
1622                         goto out_free;
1623
1624                 len -= vnet_hdr_len;
1625
1626                 if (skb_is_gso(skb)) {
1627                         struct skb_shared_info *sinfo = skb_shinfo(skb);
1628
1629                         /* This is a hint as to how much should be linear. */
1630                         vnet_hdr.hdr_len = skb_headlen(skb);
1631                         vnet_hdr.gso_size = sinfo->gso_size;
1632                         if (sinfo->gso_type & SKB_GSO_TCPV4)
1633                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1634                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
1635                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1636                         else if (sinfo->gso_type & SKB_GSO_UDP)
1637                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1638                         else if (sinfo->gso_type & SKB_GSO_FCOE)
1639                                 goto out_free;
1640                         else
1641                                 BUG();
1642                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1643                                 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1644                 } else
1645                         vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1646
1647                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1648                         vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1649                         vnet_hdr.csum_start = skb->csum_start -
1650                                                         skb_headroom(skb);
1651                         vnet_hdr.csum_offset = skb->csum_offset;
1652                 } /* else everything is zero */
1653
1654                 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1655                                      vnet_hdr_len);
1656                 if (err < 0)
1657                         goto out_free;
1658         }
1659
1660         /*
1661          *      If the address length field is there to be filled in, we fill
1662          *      it in now.
1663          */
1664
1665         sll = &PACKET_SKB_CB(skb)->sa.ll;
1666         if (sock->type == SOCK_PACKET)
1667                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1668         else
1669                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1670
1671         /*
1672          *      You lose any data beyond the buffer you gave. If it worries a
1673          *      user program they can ask the device for its MTU anyway.
1674          */
1675
1676         copied = skb->len;
1677         if (copied > len) {
1678                 copied = len;
1679                 msg->msg_flags |= MSG_TRUNC;
1680         }
1681
1682         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1683         if (err)
1684                 goto out_free;
1685
1686         sock_recv_ts_and_drops(msg, sk, skb);
1687
1688         if (msg->msg_name)
1689                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1690                        msg->msg_namelen);
1691
1692         if (pkt_sk(sk)->auxdata) {
1693                 struct tpacket_auxdata aux;
1694
1695                 aux.tp_status = TP_STATUS_USER;
1696                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1697                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1698                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1699                 aux.tp_snaplen = skb->len;
1700                 aux.tp_mac = 0;
1701                 aux.tp_net = skb_network_offset(skb);
1702                 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1703
1704                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1705         }
1706
1707         /*
1708          *      Free or return the buffer as appropriate. Again this
1709          *      hides all the races and re-entrancy issues from us.
1710          */
1711         err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1712
1713 out_free:
1714         skb_free_datagram(sk, skb);
1715 out:
1716         return err;
1717 }
1718
1719 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1720                                int *uaddr_len, int peer)
1721 {
1722         struct net_device *dev;
1723         struct sock *sk = sock->sk;
1724
1725         if (peer)
1726                 return -EOPNOTSUPP;
1727
1728         uaddr->sa_family = AF_PACKET;
1729         rcu_read_lock();
1730         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1731         if (dev)
1732                 strncpy(uaddr->sa_data, dev->name, 14);
1733         else
1734                 memset(uaddr->sa_data, 0, 14);
1735         rcu_read_unlock();
1736         *uaddr_len = sizeof(*uaddr);
1737
1738         return 0;
1739 }
1740
1741 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1742                           int *uaddr_len, int peer)
1743 {
1744         struct net_device *dev;
1745         struct sock *sk = sock->sk;
1746         struct packet_sock *po = pkt_sk(sk);
1747         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1748
1749         if (peer)
1750                 return -EOPNOTSUPP;
1751
1752         sll->sll_family = AF_PACKET;
1753         sll->sll_ifindex = po->ifindex;
1754         sll->sll_protocol = po->num;
1755         sll->sll_pkttype = 0;
1756         rcu_read_lock();
1757         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1758         if (dev) {
1759                 sll->sll_hatype = dev->type;
1760                 sll->sll_halen = dev->addr_len;
1761                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1762         } else {
1763                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1764                 sll->sll_halen = 0;
1765         }
1766         rcu_read_unlock();
1767         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1768
1769         return 0;
1770 }
1771
1772 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1773                          int what)
1774 {
1775         switch (i->type) {
1776         case PACKET_MR_MULTICAST:
1777                 if (i->alen != dev->addr_len)
1778                         return -EINVAL;
1779                 if (what > 0)
1780                         return dev_mc_add(dev, i->addr);
1781                 else
1782                         return dev_mc_del(dev, i->addr);
1783                 break;
1784         case PACKET_MR_PROMISC:
1785                 return dev_set_promiscuity(dev, what);
1786                 break;
1787         case PACKET_MR_ALLMULTI:
1788                 return dev_set_allmulti(dev, what);
1789                 break;
1790         case PACKET_MR_UNICAST:
1791                 if (i->alen != dev->addr_len)
1792                         return -EINVAL;
1793                 if (what > 0)
1794                         return dev_uc_add(dev, i->addr);
1795                 else
1796                         return dev_uc_del(dev, i->addr);
1797                 break;
1798         default:
1799                 break;
1800         }
1801         return 0;
1802 }
1803
1804 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1805 {
1806         for ( ; i; i = i->next) {
1807                 if (i->ifindex == dev->ifindex)
1808                         packet_dev_mc(dev, i, what);
1809         }
1810 }
1811
1812 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1813 {
1814         struct packet_sock *po = pkt_sk(sk);
1815         struct packet_mclist *ml, *i;
1816         struct net_device *dev;
1817         int err;
1818
1819         rtnl_lock();
1820
1821         err = -ENODEV;
1822         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1823         if (!dev)
1824                 goto done;
1825
1826         err = -EINVAL;
1827         if (mreq->mr_alen > dev->addr_len)
1828                 goto done;
1829
1830         err = -ENOBUFS;
1831         i = kmalloc(sizeof(*i), GFP_KERNEL);
1832         if (i == NULL)
1833                 goto done;
1834
1835         err = 0;
1836         for (ml = po->mclist; ml; ml = ml->next) {
1837                 if (ml->ifindex == mreq->mr_ifindex &&
1838                     ml->type == mreq->mr_type &&
1839                     ml->alen == mreq->mr_alen &&
1840                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1841                         ml->count++;
1842                         /* Free the new element ... */
1843                         kfree(i);
1844                         goto done;
1845                 }
1846         }
1847
1848         i->type = mreq->mr_type;
1849         i->ifindex = mreq->mr_ifindex;
1850         i->alen = mreq->mr_alen;
1851         memcpy(i->addr, mreq->mr_address, i->alen);
1852         i->count = 1;
1853         i->next = po->mclist;
1854         po->mclist = i;
1855         err = packet_dev_mc(dev, i, 1);
1856         if (err) {
1857                 po->mclist = i->next;
1858                 kfree(i);
1859         }
1860
1861 done:
1862         rtnl_unlock();
1863         return err;
1864 }
1865
1866 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1867 {
1868         struct packet_mclist *ml, **mlp;
1869
1870         rtnl_lock();
1871
1872         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1873                 if (ml->ifindex == mreq->mr_ifindex &&
1874                     ml->type == mreq->mr_type &&
1875                     ml->alen == mreq->mr_alen &&
1876                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1877                         if (--ml->count == 0) {
1878                                 struct net_device *dev;
1879                                 *mlp = ml->next;
1880                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1881                                 if (dev)
1882                                         packet_dev_mc(dev, ml, -1);
1883                                 kfree(ml);
1884                         }
1885                         rtnl_unlock();
1886                         return 0;
1887                 }
1888         }
1889         rtnl_unlock();
1890         return -EADDRNOTAVAIL;
1891 }
1892
1893 static void packet_flush_mclist(struct sock *sk)
1894 {
1895         struct packet_sock *po = pkt_sk(sk);
1896         struct packet_mclist *ml;
1897
1898         if (!po->mclist)
1899                 return;
1900
1901         rtnl_lock();
1902         while ((ml = po->mclist) != NULL) {
1903                 struct net_device *dev;
1904
1905                 po->mclist = ml->next;
1906                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1907                 if (dev != NULL)
1908                         packet_dev_mc(dev, ml, -1);
1909                 kfree(ml);
1910         }
1911         rtnl_unlock();
1912 }
1913
1914 static int
1915 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1916 {
1917         struct sock *sk = sock->sk;
1918         struct packet_sock *po = pkt_sk(sk);
1919         int ret;
1920
1921         if (level != SOL_PACKET)
1922                 return -ENOPROTOOPT;
1923
1924         switch (optname) {
1925         case PACKET_ADD_MEMBERSHIP:
1926         case PACKET_DROP_MEMBERSHIP:
1927         {
1928                 struct packet_mreq_max mreq;
1929                 int len = optlen;
1930                 memset(&mreq, 0, sizeof(mreq));
1931                 if (len < sizeof(struct packet_mreq))
1932                         return -EINVAL;
1933                 if (len > sizeof(mreq))
1934                         len = sizeof(mreq);
1935                 if (copy_from_user(&mreq, optval, len))
1936                         return -EFAULT;
1937                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1938                         return -EINVAL;
1939                 if (optname == PACKET_ADD_MEMBERSHIP)
1940                         ret = packet_mc_add(sk, &mreq);
1941                 else
1942                         ret = packet_mc_drop(sk, &mreq);
1943                 return ret;
1944         }
1945
1946         case PACKET_RX_RING:
1947         case PACKET_TX_RING:
1948         {
1949                 struct tpacket_req req;
1950
1951                 if (optlen < sizeof(req))
1952                         return -EINVAL;
1953                 if (pkt_sk(sk)->has_vnet_hdr)
1954                         return -EINVAL;
1955                 if (copy_from_user(&req, optval, sizeof(req)))
1956                         return -EFAULT;
1957                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1958         }
1959         case PACKET_COPY_THRESH:
1960         {
1961                 int val;
1962
1963                 if (optlen != sizeof(val))
1964                         return -EINVAL;
1965                 if (copy_from_user(&val, optval, sizeof(val)))
1966                         return -EFAULT;
1967
1968                 pkt_sk(sk)->copy_thresh = val;
1969                 return 0;
1970         }
1971         case PACKET_VERSION:
1972         {
1973                 int val;
1974
1975                 if (optlen != sizeof(val))
1976                         return -EINVAL;
1977                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1978                         return -EBUSY;
1979                 if (copy_from_user(&val, optval, sizeof(val)))
1980                         return -EFAULT;
1981                 switch (val) {
1982                 case TPACKET_V1:
1983                 case TPACKET_V2:
1984                         po->tp_version = val;
1985                         return 0;
1986                 default:
1987                         return -EINVAL;
1988                 }
1989         }
1990         case PACKET_RESERVE:
1991         {
1992                 unsigned int val;
1993
1994                 if (optlen != sizeof(val))
1995                         return -EINVAL;
1996                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1997                         return -EBUSY;
1998                 if (copy_from_user(&val, optval, sizeof(val)))
1999                         return -EFAULT;
2000                 po->tp_reserve = val;
2001                 return 0;
2002         }
2003         case PACKET_LOSS:
2004         {
2005                 unsigned int val;
2006
2007                 if (optlen != sizeof(val))
2008                         return -EINVAL;
2009                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2010                         return -EBUSY;
2011                 if (copy_from_user(&val, optval, sizeof(val)))
2012                         return -EFAULT;
2013                 po->tp_loss = !!val;
2014                 return 0;
2015         }
2016         case PACKET_AUXDATA:
2017         {
2018                 int val;
2019
2020                 if (optlen < sizeof(val))
2021                         return -EINVAL;
2022                 if (copy_from_user(&val, optval, sizeof(val)))
2023                         return -EFAULT;
2024
2025                 po->auxdata = !!val;
2026                 return 0;
2027         }
2028         case PACKET_ORIGDEV:
2029         {
2030                 int val;
2031
2032                 if (optlen < sizeof(val))
2033                         return -EINVAL;
2034                 if (copy_from_user(&val, optval, sizeof(val)))
2035                         return -EFAULT;
2036
2037                 po->origdev = !!val;
2038                 return 0;
2039         }
2040         case PACKET_VNET_HDR:
2041         {
2042                 int val;
2043
2044                 if (sock->type != SOCK_RAW)
2045                         return -EINVAL;
2046                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2047                         return -EBUSY;
2048                 if (optlen < sizeof(val))
2049                         return -EINVAL;
2050                 if (copy_from_user(&val, optval, sizeof(val)))
2051                         return -EFAULT;
2052
2053                 po->has_vnet_hdr = !!val;
2054                 return 0;
2055         }
2056         case PACKET_TIMESTAMP:
2057         {
2058                 int val;
2059
2060                 if (optlen != sizeof(val))
2061                         return -EINVAL;
2062                 if (copy_from_user(&val, optval, sizeof(val)))
2063                         return -EFAULT;
2064
2065                 po->tp_tstamp = val;
2066                 return 0;
2067         }
2068         default:
2069                 return -ENOPROTOOPT;
2070         }
2071 }
2072
2073 static int packet_getsockopt(struct socket *sock, int level, int optname,
2074                              char __user *optval, int __user *optlen)
2075 {
2076         int len;
2077         int val;
2078         struct sock *sk = sock->sk;
2079         struct packet_sock *po = pkt_sk(sk);
2080         void *data;
2081         struct tpacket_stats st;
2082
2083         if (level != SOL_PACKET)
2084                 return -ENOPROTOOPT;
2085
2086         if (get_user(len, optlen))
2087                 return -EFAULT;
2088
2089         if (len < 0)
2090                 return -EINVAL;
2091
2092         switch (optname) {
2093         case PACKET_STATISTICS:
2094                 if (len > sizeof(struct tpacket_stats))
2095                         len = sizeof(struct tpacket_stats);
2096                 spin_lock_bh(&sk->sk_receive_queue.lock);
2097                 st = po->stats;
2098                 memset(&po->stats, 0, sizeof(st));
2099                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2100                 st.tp_packets += st.tp_drops;
2101
2102                 data = &st;
2103                 break;
2104         case PACKET_AUXDATA:
2105                 if (len > sizeof(int))
2106                         len = sizeof(int);
2107                 val = po->auxdata;
2108
2109                 data = &val;
2110                 break;
2111         case PACKET_ORIGDEV:
2112                 if (len > sizeof(int))
2113                         len = sizeof(int);
2114                 val = po->origdev;
2115
2116                 data = &val;
2117                 break;
2118         case PACKET_VNET_HDR:
2119                 if (len > sizeof(int))
2120                         len = sizeof(int);
2121                 val = po->has_vnet_hdr;
2122
2123                 data = &val;
2124                 break;
2125         case PACKET_VERSION:
2126                 if (len > sizeof(int))
2127                         len = sizeof(int);
2128                 val = po->tp_version;
2129                 data = &val;
2130                 break;
2131         case PACKET_HDRLEN:
2132                 if (len > sizeof(int))
2133                         len = sizeof(int);
2134                 if (copy_from_user(&val, optval, len))
2135                         return -EFAULT;
2136                 switch (val) {
2137                 case TPACKET_V1:
2138                         val = sizeof(struct tpacket_hdr);
2139                         break;
2140                 case TPACKET_V2:
2141                         val = sizeof(struct tpacket2_hdr);
2142                         break;
2143                 default:
2144                         return -EINVAL;
2145                 }
2146                 data = &val;
2147                 break;
2148         case PACKET_RESERVE:
2149                 if (len > sizeof(unsigned int))
2150                         len = sizeof(unsigned int);
2151                 val = po->tp_reserve;
2152                 data = &val;
2153                 break;
2154         case PACKET_LOSS:
2155                 if (len > sizeof(unsigned int))
2156                         len = sizeof(unsigned int);
2157                 val = po->tp_loss;
2158                 data = &val;
2159                 break;
2160         case PACKET_TIMESTAMP:
2161                 if (len > sizeof(int))
2162                         len = sizeof(int);
2163                 val = po->tp_tstamp;
2164                 data = &val;
2165                 break;
2166         default:
2167                 return -ENOPROTOOPT;
2168         }
2169
2170         if (put_user(len, optlen))
2171                 return -EFAULT;
2172         if (copy_to_user(optval, data, len))
2173                 return -EFAULT;
2174         return 0;
2175 }
2176
2177
2178 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2179 {
2180         struct sock *sk;
2181         struct hlist_node *node;
2182         struct net_device *dev = data;
2183         struct net *net = dev_net(dev);
2184
2185         rcu_read_lock();
2186         sk_for_each_rcu(sk, node, &net->packet.sklist) {
2187                 struct packet_sock *po = pkt_sk(sk);
2188
2189                 switch (msg) {
2190                 case NETDEV_UNREGISTER:
2191                         if (po->mclist)
2192                                 packet_dev_mclist(dev, po->mclist, -1);
2193                         /* fallthrough */
2194
2195                 case NETDEV_DOWN:
2196                         if (dev->ifindex == po->ifindex) {
2197                                 spin_lock(&po->bind_lock);
2198                                 if (po->running) {
2199                                         __dev_remove_pack(&po->prot_hook);
2200                                         __sock_put(sk);
2201                                         po->running = 0;
2202                                         sk->sk_err = ENETDOWN;
2203                                         if (!sock_flag(sk, SOCK_DEAD))
2204                                                 sk->sk_error_report(sk);
2205                                 }
2206                                 if (msg == NETDEV_UNREGISTER) {
2207                                         po->ifindex = -1;
2208                                         po->prot_hook.dev = NULL;
2209                                 }
2210                                 spin_unlock(&po->bind_lock);
2211                         }
2212                         break;
2213                 case NETDEV_UP:
2214                         if (dev->ifindex == po->ifindex) {
2215                                 spin_lock(&po->bind_lock);
2216                                 if (po->num && !po->running) {
2217                                         dev_add_pack(&po->prot_hook);
2218                                         sock_hold(sk);
2219                                         po->running = 1;
2220                                 }
2221                                 spin_unlock(&po->bind_lock);
2222                         }
2223                         break;
2224                 }
2225         }
2226         rcu_read_unlock();
2227         return NOTIFY_DONE;
2228 }
2229
2230
2231 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2232                         unsigned long arg)
2233 {
2234         struct sock *sk = sock->sk;
2235
2236         switch (cmd) {
2237         case SIOCOUTQ:
2238         {
2239                 int amount = sk_wmem_alloc_get(sk);
2240
2241                 return put_user(amount, (int __user *)arg);
2242         }
2243         case SIOCINQ:
2244         {
2245                 struct sk_buff *skb;
2246                 int amount = 0;
2247
2248                 spin_lock_bh(&sk->sk_receive_queue.lock);
2249                 skb = skb_peek(&sk->sk_receive_queue);
2250                 if (skb)
2251                         amount = skb->len;
2252                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2253                 return put_user(amount, (int __user *)arg);
2254         }
2255         case SIOCGSTAMP:
2256                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2257         case SIOCGSTAMPNS:
2258                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2259
2260 #ifdef CONFIG_INET
2261         case SIOCADDRT:
2262         case SIOCDELRT:
2263         case SIOCDARP:
2264         case SIOCGARP:
2265         case SIOCSARP:
2266         case SIOCGIFADDR:
2267         case SIOCSIFADDR:
2268         case SIOCGIFBRDADDR:
2269         case SIOCSIFBRDADDR:
2270         case SIOCGIFNETMASK:
2271         case SIOCSIFNETMASK:
2272         case SIOCGIFDSTADDR:
2273         case SIOCSIFDSTADDR:
2274         case SIOCSIFFLAGS:
2275                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2276 #endif
2277
2278         default:
2279                 return -ENOIOCTLCMD;
2280         }
2281         return 0;
2282 }
2283
2284 static unsigned int packet_poll(struct file *file, struct socket *sock,
2285                                 poll_table *wait)
2286 {
2287         struct sock *sk = sock->sk;
2288         struct packet_sock *po = pkt_sk(sk);
2289         unsigned int mask = datagram_poll(file, sock, wait);
2290
2291         spin_lock_bh(&sk->sk_receive_queue.lock);
2292         if (po->rx_ring.pg_vec) {
2293                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2294                         mask |= POLLIN | POLLRDNORM;
2295         }
2296         spin_unlock_bh(&sk->sk_receive_queue.lock);
2297         spin_lock_bh(&sk->sk_write_queue.lock);
2298         if (po->tx_ring.pg_vec) {
2299                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2300                         mask |= POLLOUT | POLLWRNORM;
2301         }
2302         spin_unlock_bh(&sk->sk_write_queue.lock);
2303         return mask;
2304 }
2305
2306
/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */
2310
2311 static void packet_mm_open(struct vm_area_struct *vma)
2312 {
2313         struct file *file = vma->vm_file;
2314         struct socket *sock = file->private_data;
2315         struct sock *sk = sock->sk;
2316
2317         if (sk)
2318                 atomic_inc(&pkt_sk(sk)->mapped);
2319 }
2320
2321 static void packet_mm_close(struct vm_area_struct *vma)
2322 {
2323         struct file *file = vma->vm_file;
2324         struct socket *sock = file->private_data;
2325         struct sock *sk = sock->sk;
2326
2327         if (sk)
2328                 atomic_dec(&pkt_sk(sk)->mapped);
2329 }
2330
2331 static const struct vm_operations_struct packet_mmap_ops = {
2332         .open   =       packet_mm_open,
2333         .close  =       packet_mm_close,
2334 };
2335
2336 static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
2337                         unsigned int len)
2338 {
2339         int i;
2340
2341         for (i = 0; i < len; i++) {
2342                 if (likely(pg_vec[i].buffer)) {
2343                         if (pg_vec[i].flags & PGV_FROM_VMALLOC)
2344                                 vfree(pg_vec[i].buffer);
2345                         else
2346                                 free_pages((unsigned long)pg_vec[i].buffer,
2347                                            order);
2348                         pg_vec[i].buffer = NULL;
2349                 }
2350         }
2351         kfree(pg_vec);
2352 }
2353
2354 static inline char *alloc_one_pg_vec_page(unsigned long order,
2355                                           unsigned char *flags)
2356 {
2357         char *buffer = NULL;
2358         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
2359                           __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
2360
2361         buffer = (char *) __get_free_pages(gfp_flags, order);
2362
2363         if (buffer)
2364                 return buffer;
2365
2366         /*
2367          * __get_free_pages failed, fall back to vmalloc
2368          */
2369         *flags |= PGV_FROM_VMALLOC;
2370         buffer = vmalloc((1 << order) * PAGE_SIZE);
2371
2372         if (buffer)
2373                 return buffer;
2374
2375         /*
2376          * vmalloc failed, lets dig into swap here
2377          */
2378         *flags = 0;
2379         gfp_flags &= ~__GFP_NORETRY;
2380         buffer = (char *)__get_free_pages(gfp_flags, order);
2381         if (buffer)
2382                 return buffer;
2383
2384         /*
2385          * complete and utter failure
2386          */
2387         return NULL;
2388 }
2389
2390 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
2391 {
2392         unsigned int block_nr = req->tp_block_nr;
2393         struct pgv *pg_vec;
2394         int i;
2395
2396         pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
2397         if (unlikely(!pg_vec))
2398                 goto out;
2399
2400         for (i = 0; i < block_nr; i++) {
2401                 pg_vec[i].buffer = alloc_one_pg_vec_page(order,
2402                                                          &pg_vec[i].flags);
2403                 if (unlikely(!pg_vec[i].buffer))
2404                         goto out_free_pgvec;
2405         }
2406
2407 out:
2408         return pg_vec;
2409
2410 out_free_pgvec:
2411         free_pg_vec(pg_vec, order, block_nr);
2412         kfree(pg_vec);
2413         pg_vec = NULL;
2414         goto out;
2415 }
2416
2417 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2418                 int closing, int tx_ring)
2419 {
2420         struct pgv *pg_vec = NULL;
2421         struct packet_sock *po = pkt_sk(sk);
2422         int was_running, order = 0;
2423         struct packet_ring_buffer *rb;
2424         struct sk_buff_head *rb_queue;
2425         __be16 num;
2426         int err;
2427
2428         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2429         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2430
2431         err = -EBUSY;
2432         if (!closing) {
2433                 if (atomic_read(&po->mapped))
2434                         goto out;
2435                 if (atomic_read(&rb->pending))
2436                         goto out;
2437         }
2438
2439         if (req->tp_block_nr) {
2440                 /* Sanity tests and some calculations */
2441                 err = -EBUSY;
2442                 if (unlikely(rb->pg_vec))
2443                         goto out;
2444
2445                 switch (po->tp_version) {
2446                 case TPACKET_V1:
2447                         po->tp_hdrlen = TPACKET_HDRLEN;
2448                         break;
2449                 case TPACKET_V2:
2450                         po->tp_hdrlen = TPACKET2_HDRLEN;
2451                         break;
2452                 }
2453
2454                 err = -EINVAL;
2455                 if (unlikely((int)req->tp_block_size <= 0))
2456                         goto out;
2457                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2458                         goto out;
2459                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2460                                         po->tp_reserve))
2461                         goto out;
2462                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2463                         goto out;
2464
2465                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2466                 if (unlikely(rb->frames_per_block <= 0))
2467                         goto out;
2468                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2469                                         req->tp_frame_nr))
2470                         goto out;
2471
2472                 err = -ENOMEM;
2473                 order = get_order(req->tp_block_size);
2474                 pg_vec = alloc_pg_vec(req, order);
2475                 if (unlikely(!pg_vec))
2476                         goto out;
2477         }
2478         /* Done */
2479         else {
2480                 err = -EINVAL;
2481                 if (unlikely(req->tp_frame_nr))
2482                         goto out;
2483         }
2484
2485         lock_sock(sk);
2486
2487         /* Detach socket from network */
2488         spin_lock(&po->bind_lock);
2489         was_running = po->running;
2490         num = po->num;
2491         if (was_running) {
2492                 __dev_remove_pack(&po->prot_hook);
2493                 po->num = 0;
2494                 po->running = 0;
2495                 __sock_put(sk);
2496         }
2497         spin_unlock(&po->bind_lock);
2498
2499         synchronize_net();
2500
2501         err = -EBUSY;
2502         mutex_lock(&po->pg_vec_lock);
2503         if (closing || atomic_read(&po->mapped) == 0) {
2504                 err = 0;
2505 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2506                 spin_lock_bh(&rb_queue->lock);
2507                 pg_vec = XC(rb->pg_vec, pg_vec);
2508                 rb->frame_max = (req->tp_frame_nr - 1);
2509                 rb->head = 0;
2510                 rb->frame_size = req->tp_frame_size;
2511                 spin_unlock_bh(&rb_queue->lock);
2512
2513                 order = XC(rb->pg_vec_order, order);
2514                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2515
2516                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2517                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2518                                                 tpacket_rcv : packet_rcv;
2519                 skb_queue_purge(rb_queue);
2520 #undef XC
2521                 if (atomic_read(&po->mapped))
2522                         pr_err("packet_mmap: vma is busy: %d\n",
2523                                atomic_read(&po->mapped));
2524         }
2525         mutex_unlock(&po->pg_vec_lock);
2526
2527         spin_lock(&po->bind_lock);
2528         if (was_running && !po->running) {
2529                 sock_hold(sk);
2530                 po->running = 1;
2531                 po->num = num;
2532                 dev_add_pack(&po->prot_hook);
2533         }
2534         spin_unlock(&po->bind_lock);
2535
2536         release_sock(sk);
2537
2538         if (pg_vec)
2539                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2540 out:
2541         return err;
2542 }
2543
2544 static int packet_mmap(struct file *file, struct socket *sock,
2545                 struct vm_area_struct *vma)
2546 {
2547         struct sock *sk = sock->sk;
2548         struct packet_sock *po = pkt_sk(sk);
2549         unsigned long size, expected_size;
2550         struct packet_ring_buffer *rb;
2551         unsigned long start;
2552         int err = -EINVAL;
2553         int i;
2554
2555         if (vma->vm_pgoff)
2556                 return -EINVAL;
2557
2558         mutex_lock(&po->pg_vec_lock);
2559
2560         expected_size = 0;
2561         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2562                 if (rb->pg_vec) {
2563                         expected_size += rb->pg_vec_len
2564                                                 * rb->pg_vec_pages
2565                                                 * PAGE_SIZE;
2566                 }
2567         }
2568
2569         if (expected_size == 0)
2570                 goto out;
2571
2572         size = vma->vm_end - vma->vm_start;
2573         if (size != expected_size)
2574                 goto out;
2575
2576         start = vma->vm_start;
2577         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2578                 if (rb->pg_vec == NULL)
2579                         continue;
2580
2581                 for (i = 0; i < rb->pg_vec_len; i++) {
2582                         struct page *page;
2583                         void *kaddr = rb->pg_vec[i].buffer;
2584                         int pg_num;
2585
2586                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2587                                         pg_num++) {
2588                                 if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC)
2589                                         page = vmalloc_to_page(kaddr);
2590                                 else
2591                                         page = virt_to_page(kaddr);
2592
2593                                 err = vm_insert_page(vma, start, page);
2594                                 if (unlikely(err))
2595                                         goto out;
2596                                 start += PAGE_SIZE;
2597                                 kaddr += PAGE_SIZE;
2598                         }
2599                 }
2600         }
2601
2602         atomic_inc(&po->mapped);
2603         vma->vm_ops = &packet_mmap_ops;
2604         err = 0;
2605
2606 out:
2607         mutex_unlock(&po->pg_vec_lock);
2608         return err;
2609 }
2610
2611 static const struct proto_ops packet_ops_spkt = {
2612         .family =       PF_PACKET,
2613         .owner =        THIS_MODULE,
2614         .release =      packet_release,
2615         .bind =         packet_bind_spkt,
2616         .connect =      sock_no_connect,
2617         .socketpair =   sock_no_socketpair,
2618         .accept =       sock_no_accept,
2619         .getname =      packet_getname_spkt,
2620         .poll =         datagram_poll,
2621         .ioctl =        packet_ioctl,
2622         .listen =       sock_no_listen,
2623         .shutdown =     sock_no_shutdown,
2624         .setsockopt =   sock_no_setsockopt,
2625         .getsockopt =   sock_no_getsockopt,
2626         .sendmsg =      packet_sendmsg_spkt,
2627         .recvmsg =      packet_recvmsg,
2628         .mmap =         sock_no_mmap,
2629         .sendpage =     sock_no_sendpage,
2630 };
2631
2632 static const struct proto_ops packet_ops = {
2633         .family =       PF_PACKET,
2634         .owner =        THIS_MODULE,
2635         .release =      packet_release,
2636         .bind =         packet_bind,
2637         .connect =      sock_no_connect,
2638         .socketpair =   sock_no_socketpair,
2639         .accept =       sock_no_accept,
2640         .getname =      packet_getname,
2641         .poll =         packet_poll,
2642         .ioctl =        packet_ioctl,
2643         .listen =       sock_no_listen,
2644         .shutdown =     sock_no_shutdown,
2645         .setsockopt =   packet_setsockopt,
2646         .getsockopt =   packet_getsockopt,
2647         .sendmsg =      packet_sendmsg,
2648         .recvmsg =      packet_recvmsg,
2649         .mmap =         packet_mmap,
2650         .sendpage =     sock_no_sendpage,
2651 };
2652
2653 static const struct net_proto_family packet_family_ops = {
2654         .family =       PF_PACKET,
2655         .create =       packet_create,
2656         .owner  =       THIS_MODULE,
2657 };
2658
2659 static struct notifier_block packet_netdev_notifier = {
2660         .notifier_call =        packet_notifier,
2661 };
2662
2663 #ifdef CONFIG_PROC_FS
2664
2665 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2666         __acquires(RCU)
2667 {
2668         struct net *net = seq_file_net(seq);
2669
2670         rcu_read_lock();
2671         return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2672 }
2673
2674 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2675 {
2676         struct net *net = seq_file_net(seq);
2677         return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2678 }
2679
2680 static void packet_seq_stop(struct seq_file *seq, void *v)
2681         __releases(RCU)
2682 {
2683         rcu_read_unlock();
2684 }
2685
2686 static int packet_seq_show(struct seq_file *seq, void *v)
2687 {
2688         if (v == SEQ_START_TOKEN)
2689                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2690         else {
2691                 struct sock *s = sk_entry(v);
2692                 const struct packet_sock *po = pkt_sk(s);
2693
2694                 seq_printf(seq,
2695                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2696                            s,
2697                            atomic_read(&s->sk_refcnt),
2698                            s->sk_type,
2699                            ntohs(po->num),
2700                            po->ifindex,
2701                            po->running,
2702                            atomic_read(&s->sk_rmem_alloc),
2703                            sock_i_uid(s),
2704                            sock_i_ino(s));
2705         }
2706
2707         return 0;
2708 }
2709
2710 static const struct seq_operations packet_seq_ops = {
2711         .start  = packet_seq_start,
2712         .next   = packet_seq_next,
2713         .stop   = packet_seq_stop,
2714         .show   = packet_seq_show,
2715 };
2716
2717 static int packet_seq_open(struct inode *inode, struct file *file)
2718 {
2719         return seq_open_net(inode, file, &packet_seq_ops,
2720                             sizeof(struct seq_net_private));
2721 }
2722
2723 static const struct file_operations packet_seq_fops = {
2724         .owner          = THIS_MODULE,
2725         .open           = packet_seq_open,
2726         .read           = seq_read,
2727         .llseek         = seq_lseek,
2728         .release        = seq_release_net,
2729 };
2730
2731 #endif
2732
2733 static int __net_init packet_net_init(struct net *net)
2734 {
2735         spin_lock_init(&net->packet.sklist_lock);
2736         INIT_HLIST_HEAD(&net->packet.sklist);
2737
2738         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2739                 return -ENOMEM;
2740
2741         return 0;
2742 }
2743
2744 static void __net_exit packet_net_exit(struct net *net)
2745 {
2746         proc_net_remove(net, "packet");
2747 }
2748
2749 static struct pernet_operations packet_net_ops = {
2750         .init = packet_net_init,
2751         .exit = packet_net_exit,
2752 };
2753
2754
2755 static void __exit packet_exit(void)
2756 {
2757         unregister_netdevice_notifier(&packet_netdev_notifier);
2758         unregister_pernet_subsys(&packet_net_ops);
2759         sock_unregister(PF_PACKET);
2760         proto_unregister(&packet_proto);
2761 }
2762
2763 static int __init packet_init(void)
2764 {
2765         int rc = proto_register(&packet_proto, 0);
2766
2767         if (rc != 0)
2768                 goto out;
2769
2770         sock_register(&packet_family_ops);
2771         register_pernet_subsys(&packet_net_ops);
2772         register_netdevice_notifier(&packet_netdev_notifier);
2773 out:
2774         return rc;
2775 }
2776
2777 module_init(packet_init);
2778 module_exit(packet_exit);
2779 MODULE_LICENSE("GPL");
2780 MODULE_ALIAS_NETPROTO(PF_PACKET);