[NETFILTER]: Revert nf_reset change
[linux-2.6.git] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:       
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and 
38  *                                      packet_set_ring memory leak.
39  *
40  *              This program is free software; you can redistribute it and/or
41  *              modify it under the terms of the GNU General Public License
42  *              as published by the Free Software Foundation; either version
43  *              2 of the License, or (at your option) any later version.
44  *
45  */
46  
47 #include <linux/config.h>
48 #include <linux/types.h>
49 #include <linux/sched.h>
50 #include <linux/mm.h>
51 #include <linux/fcntl.h>
52 #include <linux/socket.h>
53 #include <linux/in.h>
54 #include <linux/inet.h>
55 #include <linux/netdevice.h>
56 #include <linux/if_packet.h>
57 #include <linux/wireless.h>
58 #include <linux/kmod.h>
59 #include <net/ip.h>
60 #include <net/protocol.h>
61 #include <linux/skbuff.h>
62 #include <net/sock.h>
63 #include <linux/errno.h>
64 #include <linux/timer.h>
65 #include <asm/system.h>
66 #include <asm/uaccess.h>
67 #include <asm/ioctls.h>
68 #include <asm/page.h>
69 #include <asm/io.h>
70 #include <linux/proc_fs.h>
71 #include <linux/seq_file.h>
72 #include <linux/poll.h>
73 #include <linux/module.h>
74 #include <linux/init.h>
75
76 #ifdef CONFIG_INET
77 #include <net/inet_common.h>
78 #endif
79
80 #define CONFIG_SOCK_PACKET      1
81
82 /*
83    Proposed replacement for SIOC{ADD,DEL}MULTI and
84    IFF_PROMISC, IFF_ALLMULTI flags.
85
86    It is more expensive, but I believe,
87    it is a really correct solution: reentrant, safe and fault tolerant.
88
89    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
90    reference count and global flag, so that real status is
91    (gflag|(count != 0)), so that we can use obsolete faulty interface
92    not harming clever users.
93  */
94 #define CONFIG_PACKET_MULTICAST 1
95
96 /*
97    Assumptions:
98    - if device has no dev->hard_header routine, it adds and removes ll header
99      inside itself. In this case ll header is invisible outside of device,
100      but higher levels still should reserve dev->hard_header_len.
101      Some devices are enough clever to reallocate skb, when header
102      will not fit to reserved space (tunnel), another ones are silly
103      (PPP).
104    - packet socket receives packets with pulled ll header,
105      so that SOCK_RAW should push it back.
106
107 On receive:
108 -----------
109
110 Incoming, dev->hard_header!=NULL
111    mac.raw -> ll header
112    data    -> data
113
114 Outgoing, dev->hard_header!=NULL
115    mac.raw -> ll header
116    data    -> ll header
117
118 Incoming, dev->hard_header==NULL
119    mac.raw -> UNKNOWN position. It is very likely, that it points to ll header.
120               PPP does this, which is wrong, because it introduces asymmetry
121               between rx and tx paths.
122    data    -> data
123
124 Outgoing, dev->hard_header==NULL
125    mac.raw -> data. ll header is still not built!
126    data    -> data
127
128 Resume
129   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
130
131
132 On transmit:
133 ------------
134
135 dev->hard_header != NULL
136    mac.raw -> ll header
137    data    -> ll header
138
139 dev->hard_header == NULL (ll header is added by device, we cannot control it)
140    mac.raw -> data
141    data -> data
142
143    We should set nh.raw on output to the correct position,
144    packet classifier depends on it.
145  */
146
147 /* List of all packet sockets. */
148 static HLIST_HEAD(packet_sklist);
149 static DEFINE_RWLOCK(packet_sklist_lock);
150
151 static atomic_t packet_socks_nr;
152
153
154 /* Private packet socket structures. */
155
156 #ifdef CONFIG_PACKET_MULTICAST
/* One multicast/promiscuity request recorded per socket so it can be
 * replayed or undone later (see packet_flush_mclist()). */
struct packet_mclist
{
	struct packet_mclist	*next;		/* singly linked list off packet_sock */
	int			ifindex;	/* device the request applies to */
	int			count;		/* reference count for duplicate adds */
	unsigned short		type;		/* request mode — presumably PACKET_MR_*; confirm against setsockopt handler */
	unsigned short		alen;		/* number of valid bytes in addr[] */
	unsigned char		addr[8];
};
166 #endif
167 #ifdef CONFIG_PACKET_MMAP
168 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
169 #endif
170
171 static void packet_flush_mclist(struct sock *sk);
172
/* Per-socket state for AF_PACKET sockets.  pkt_sk() casts a struct sock
 * to this, which is only valid because sk is the first member. */
struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;		/* tp_packets / tp_drops, under sk_receive_queue.lock */
#ifdef CONFIG_PACKET_MMAP
	char *			*pg_vec;	/* memory blocks backing the rx ring */
	unsigned int		head;		/* next ring frame to fill (tpacket_rcv) */
	unsigned int		frames_per_block;
	unsigned int		frame_size;	/* bytes per ring frame */
	unsigned int		frame_max;	/* highest valid frame index (inclusive) */
	int			copy_thresh;
#endif
	struct packet_type	prot_hook;	/* registered with dev_add_pack() while running */
	spinlock_t		bind_lock;	/* serializes attach/detach of prot_hook */
	char			running;	/* prot_hook is attached*/
	int			ifindex;	/* bound device		*/
	unsigned short		num;		/* protocol (ethertype) the socket is bound to */
#ifdef CONFIG_PACKET_MULTICAST
	struct packet_mclist	*mclist;	/* membership requests; flushed on release */
#endif
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;		/* NOTE(review): ring mmap refcount — managed in packet_set_ring/mmap, not shown here */
	unsigned int		pg_vec_order;	/* page allocation order of each pg_vec block — confirm in packet_set_ring */
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;
#endif
};
200
201 #ifdef CONFIG_PACKET_MMAP
202
203 static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
204 {
205         unsigned int pg_vec_pos, frame_offset;
206         char *frame;
207
208         pg_vec_pos = position / po->frames_per_block;
209         frame_offset = position % po->frames_per_block;
210
211         frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
212         
213         return frame;
214 }
215 #endif
216
/* Downcast a generic struct sock to its packet_sock container.  Valid
 * only because struct sock is the first member of struct packet_sock. */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}
221
/* sk->sk_destruct hook: final sanity checks when a packet socket is
 * destroyed.  Both receive and write memory must already be drained. */
static void packet_sock_destruct(struct sock *sk)
{
	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		/* Callers must orphan the socket before dropping the last ref. */
		printk("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	/* Balances the atomic_inc() in packet_create(). */
	atomic_dec(&packet_socks_nr);
#ifdef PACKET_REFCNT_DEBUG
	printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
#endif
}
237
238
239 static struct proto_ops packet_ops;
240
241 #ifdef CONFIG_SOCK_PACKET
242 static struct proto_ops packet_ops_spkt;
243
/*
 * Receive handler for SOCK_PACKET sockets.  Called from the packet_type
 * hook for every frame on the bound device; fills a sockaddr_pkt into
 * skb->cb and queues the skb on the socket's receive queue.  Always
 * returns 0 — the skb is either queued or freed here.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;
	
	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb->mac.raw
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	/* Get a private copy if shared — we are about to mangle data/cb. */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto oom;

	/* drop any routing info */
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = (struct sockaddr_pkt*)skb->cb;

	/* Re-expose the link-layer header in front of skb->data. */
	skb_push(skb, skb->data-skb->mac.raw);

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk,skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
305
306
307 /*
308  *      Output a raw packet to a device layer. This bypasses all the other
309  *      protocol layers and you must therefore supply it with a complete frame
310  */
311  
/*
 * sendmsg() for SOCK_PACKET sockets: build a raw frame from user iovecs
 * and hand it straight to the device, bypassing all protocol layers.
 * The destination device is named in msg_name (mandatory).  Returns the
 * number of bytes sent or a negative errno.
 */
static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	unsigned short proto=0;
	int err;
	
	/*
	 *	Get and verify the address. 
	 */

	if (saddr)
	{
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return(-EINVAL);
		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
			proto=saddr->spkt_protocol;
	}
	else
		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it 
	 */

	/* Force NUL termination of the fixed 14-byte device name. */
	saddr->spkt_device[13] = 0;
	dev = dev_get_by_name(saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;
	
	/*
	 *	You may not queue a frame bigger than the mtu. This is the lowest level
	 *	raw protocol and you must do your own fragmentation at this level.
	 */
	 
	err = -EMSGSIZE;
	if(len>dev->mtu+dev->hard_header_len)
		goto out_unlock;

	err = -ENOBUFS;
	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

	/*
	 *	If the write buffer is full, then tough. At this level the user gets to
	 *	deal with the problem - do your own algorithmic backoffs. That's far
	 *	more flexible.
	 */
	 
	if (skb == NULL) 
		goto out_unlock;

	/*
	 *	Fill it in 
	 */
	 
	/* FIXME: Save some space for broken drivers that write a
	 * hard header at transmission time by themselves. PPP is the
	 * notable one here. This should really be fixed at the driver level.
	 */
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	/* Try to align data part correctly */
	if (dev->hard_header) {
		/* User data already contains the ll header: back data up
		 * over part of the reserve so the payload lands aligned. */
		skb->data -= dev->hard_header_len;
		skb->tail -= dev->hard_header_len;
		if (len < dev->hard_header_len)
			skb->nh.raw = skb->data;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	if (err)
		goto out_free;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_free;

	/*
	 *	Now send it
	 */

	/* dev_queue_xmit() consumes the skb even on failure. */
	dev_queue_xmit(skb);
	dev_put(dev);
	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
	return err;
}
413 #endif
414
415 static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res)
416 {
417         struct sk_filter *filter;
418
419         bh_lock_sock(sk);
420         filter = sk->sk_filter;
421         /*
422          * Our caller already checked that filter != NULL but we need to
423          * verify that under bh_lock_sock() to be safe
424          */
425         if (likely(filter != NULL))
426                 res = sk_run_filter(skb, filter->insns, filter->len);
427         bh_unlock_sock(sk);
428
429         return res;
430 }
431
432 /*
433    This function makes lazy skb cloning in hope that most of packets
434    are discarded by BPF.
435
436    Note tricky part: we DO mangle shared skb! skb->data, skb->len
437    and skb->cb are mangled. It works because (and until) packets
438    falling here are owned by current CPU. Output packets are cloned
439    by dev_queue_xmit_nit(), input packets are processed by net_bh
440    sequencially, so that if we return skb to original state on exit,
441    we will not harm anyone.
442  */
443
/*
 * Receive handler for AF_PACKET SOCK_RAW/SOCK_DGRAM sockets.  May be
 * handed a *shared* skb: it temporarily mangles skb->data/len/cb and
 * restores them on the drop paths (see the "lazy skb cloning" comment
 * above).  Fills a sockaddr_ll into skb->cb and queues the skb on the
 * socket's receive queue.  Always returns 0.
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 * skb_head = skb->data;	/* saved so shared skbs can be restored */
	int skb_len = skb->len;
	unsigned snaplen;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	skb->dev = dev;

	if (dev->hard_header) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides details of its frame
		   structure, so that corresponding packet head is
		   never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
		}
	}

	snaplen = skb->len;

	/* Unlocked peek; run_filter() re-checks under bh_lock_sock(). */
	if (sk->sk_filter) {
		unsigned res = run_filter(skb, sk, snaplen);
		if (res == 0)
			goto drop_n_restore;
		if (snaplen > res)
			snaplen = res;
	}

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	/* We keep the skb: take a private clone and restore the original. */
	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	sll = (struct sockaddr_ll*)skb->cb;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;
	sll->sll_halen = 0;

	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	/* Undo our skb_push/skb_pull before releasing a shared skb. */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;
}
547
548 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive handler for mmap()ed packet sockets: copy the frame into the
 * next slot of the shared ring instead of queueing the skb.  Oversized
 * frames are truncated to the slot (optionally also queued whole as
 * copy_skb when copy_thresh is set).  The slot is published to user
 * space by writing tp_status last, after an mb().  Always returns 0.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	struct tpacket_hdr *h;
	u8 * skb_head = skb->data;	/* saved so shared skbs can be restored */
	int skb_len = skb->len;
	unsigned snaplen;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff;	/* offsets of mac/net headers inside the slot */
	struct sk_buff *copy_skb = NULL;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev->hard_header) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb->mac.raw);
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb->nh.raw - skb->data);
			if (skb->ip_summed == CHECKSUM_HW)
				status |= TP_STATUS_CSUMNOTREADY;
		}
	}

	snaplen = skb->len;

	if (sk->sk_filter) {
		unsigned res = run_filter(skb, sk, snaplen);
		if (res == 0)
			goto drop_n_restore;
		if (snaplen > res)
			snaplen = res;
	}

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
	} else {
		unsigned maclen = skb->nh.raw - skb->data;
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->frame_size) {
		/* Frame doesn't fit the slot; optionally keep a full copy
		 * on the receive queue, then truncate what goes in the ring. */
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}
	/* Never memcpy past the linear part of the skb. */
	if (snaplen > skb->len-skb->data_len)
		snaplen = skb->len-skb->data_len;

	spin_lock(&sk->sk_receive_queue.lock);
	h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
	
	/* Slot still owned by user space: the ring is full. */
	if (h->tp_status)
		goto ring_is_full;
	po->head = po->head != po->frame_max ? po->head+1 : 0;
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	memcpy((u8*)h + macoff, skb->data, snaplen);

	h->tp_len = skb->len;
	h->tp_snaplen = snaplen;
	h->tp_mac = macoff;
	h->tp_net = netoff;
	if (skb->stamp.tv_sec == 0) { 
		do_gettimeofday(&skb->stamp);
		sock_enable_timestamp(sk);
	}
	h->tp_sec = skb->stamp.tv_sec;
	h->tp_usec = skb->stamp.tv_usec;

	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
	sll->sll_halen = 0;
	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	sll->sll_ifindex = dev->ifindex;

	/* Publish the slot: tp_status must be written after all payload
	 * stores, hence the memory barrier. */
	h->tp_status = status;
	mb();

	{
		/* Flush every page the frame touched for non-coherent caches. */
		struct page *p_start, *p_end;
		u8 *h_end = (u8 *)h + macoff + snaplen - 1;

		p_start = virt_to_page(h);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	/* Undo our skb_push/skb_pull before releasing a shared skb. */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	/* Wake the reader anyway so it can drain the ring. */
	sk->sk_data_ready(sk, 0);
	if (copy_skb)
		kfree_skb(copy_skb);
	goto drop_n_restore;
}
690
691 #endif
692
693
/*
 * sendmsg() for AF_PACKET SOCK_RAW/SOCK_DGRAM sockets.  The target
 * device/protocol come from msg_name when given, otherwise from the
 * socket's bound state.  For SOCK_DGRAM the device builds the ll header
 * from saddr->sll_addr; for SOCK_RAW the user supplies the whole frame.
 * Returns bytes sent or a negative errno.
 */
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	unsigned short proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address. 
	 */
	 
	if (saddr == NULL) {
		struct packet_sock *po = pkt_sk(sk);

		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	/* SOCK_RAW frames carry the ll header in the user's len. */
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -EMSGSIZE;
	if (len > dev->mtu+reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
				msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb==NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb->nh.raw = skb->data;

	if (dev->hard_header) {
		int res;
		err = -EINVAL;
		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (sock->type != SOCK_DGRAM) {
			/* SOCK_RAW: drop the header the device just built —
			 * the user data already contains it — but keep the
			 * headroom alignment it produced. */
			skb->tail = skb->data;
			skb->len = 0;
		} else if (res < 0)
			goto out_free;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_free;

	/*
	 *	Now send it
	 */

	/* dev_queue_xmit() consumes the skb even on error, so the error
	 * path here must NOT free it — only drop the device reference. */
	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
788
789 /*
790  *      Close a PACKET socket. This is fairly simple. We immediately go
791  *      to 'closed' state and remove our protocol entry in the device list.
792  */
793
/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;

	if (!sk)
		return 0;

	po = pkt_sk(sk);

	/* Remove from the global socket list first so /proc etc. stop
	 * seeing this socket. */
	write_lock_bh(&packet_sklist_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&packet_sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		/* Drop the reference taken when the hook was attached. */
		__sock_put(sk);
	}

#ifdef CONFIG_PACKET_MULTICAST
	packet_flush_mclist(sk);
#endif

#ifdef CONFIG_PACKET_MMAP
	/* A zeroed tpacket_req tells packet_set_ring() to tear the ring down. */
	if (po->pg_vec) {
		struct tpacket_req req;
		memset(&req, 0, sizeof(req));
		packet_set_ring(sk, &req, 1);
	}
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);

	sock_put(sk);
	return 0;
}
848
849 /*
850  *      Attach a packet hook.
851  */
852
/*
 * Attach the socket's packet_type hook to (dev, protocol), detaching any
 * existing hook first.  dev == NULL binds to all devices; protocol == 0
 * leaves the socket unbound.  The bind_lock is dropped around
 * dev_remove_pack() because that call can sleep/synchronize.
 * Always returns 0.
 */
static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		/* Drop the ref held on behalf of the old hook, mark us
		 * stopped, then detach outside the spinlock. */
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (dev) {
		if (dev->flags&IFF_UP) {
			dev_add_pack(&po->prot_hook);
			/* Hold a ref for the duration of the attachment. */
			sock_hold(sk);
			po->running = 1;
		} else {
			/* Device is down: report ENETDOWN instead of binding. */
			sk->sk_err = ENETDOWN;
			if (!sock_flag(sk, SOCK_DEAD))
				sk->sk_error_report(sk);
		}
	} else {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}
902
903 /*
904  *      Bind a packet socket to a device
905  */
906
907 #ifdef CONFIG_SOCK_PACKET
908
909 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
910 {
911         struct sock *sk=sock->sk;
912         char name[15];
913         struct net_device *dev;
914         int err = -ENODEV;
915         
916         /*
917          *      Check legality
918          */
919          
920         if(addr_len!=sizeof(struct sockaddr))
921                 return -EINVAL;
922         strlcpy(name,uaddr->sa_data,sizeof(name));
923
924         dev = dev_get_by_name(name);
925         if (dev) {
926                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
927                 dev_put(dev);
928         }
929         return err;
930 }
931 #endif
932
933 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
934 {
935         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
936         struct sock *sk=sock->sk;
937         struct net_device *dev = NULL;
938         int err;
939
940
941         /*
942          *      Check legality
943          */
944          
945         if (addr_len < sizeof(struct sockaddr_ll))
946                 return -EINVAL;
947         if (sll->sll_family != AF_PACKET)
948                 return -EINVAL;
949
950         if (sll->sll_ifindex) {
951                 err = -ENODEV;
952                 dev = dev_get_by_index(sll->sll_ifindex);
953                 if (dev == NULL)
954                         goto out;
955         }
956         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
957         if (dev)
958                 dev_put(dev);
959
960 out:
961         return err;
962 }
963
/* Protocol descriptor handed to proto_register()/sk_alloc(); obj_size
 * makes the allocator reserve room for the enclosing struct packet_sock. */
static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};
969
970 /*
971  *      Create a packet of type SOCK_PACKET. 
972  */
973
/*
 *	Create a new packet socket (SOCK_RAW, SOCK_DGRAM or, if enabled,
 *	the legacy SOCK_PACKET).  Requires CAP_NET_RAW.  If a non-zero
 *	protocol is given the socket's receive hook is registered
 *	immediately; otherwise registration is deferred until bind.
 *	Returns 0 on success or a negative errno.
 */
static int packet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
#ifdef CONFIG_SOCK_PACKET
	    && sock->type != SOCK_PACKET
#endif
	    )
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
	if (sk == NULL)
		goto out;

	/* Legacy SOCK_PACKET sockets get their own ops table. */
	sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;
#endif
	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = protocol;

	sk->sk_destruct = packet_sock_destruct;
	atomic_inc(&packet_socks_nr);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;
#endif
	po->prot_hook.af_packet_priv = sk;

	if (protocol) {
		po->prot_hook.type = protocol;
		dev_add_pack(&po->prot_hook);
		/* The registered hook holds a socket reference while
		 * po->running is set; dropped again on unbind/close. */
		sock_hold(sk);
		po->running = 1;
	}

	/* Make the socket visible on the global list (for /proc and the
	 * netdevice notifier). */
	write_lock_bh(&packet_sklist_lock);
	sk_add_node(sk, &packet_sklist);
	write_unlock_bh(&packet_sklist_lock);
	return(0);
out:
	return err;
}
1036
1037 /*
1038  *      Pull a packet from our receive queue and hand it to the user.
1039  *      If necessary we block.
1040  */
1041
/*
 *	Receive one packet from the socket's queue into the user iovec.
 *	Returns the number of bytes copied (or the full packet length if
 *	MSG_TRUNC is set) or a negative errno.
 */
static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;

	/* Reject any flag this handler does not implement. */
	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sizeof(struct sockaddr_ll);

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram() 
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if(skb==NULL)
		goto out;

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len)
	{
		copied=len;
		msg->msg_flags|=MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	/* The receive path presumably stashed the source address in
	 * skb->cb (sockaddr_pkt or sockaddr_ll) -- copy it out verbatim.
	 * NOTE(review): confirm against packet_rcv()/packet_rcv_spkt(). */
	if (msg->msg_name)
		memcpy(msg->msg_name, skb->cb, msg->msg_namelen);

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
1121
1122 #ifdef CONFIG_SOCK_PACKET
1123 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1124                                int *uaddr_len, int peer)
1125 {
1126         struct net_device *dev;
1127         struct sock *sk = sock->sk;
1128
1129         if (peer)
1130                 return -EOPNOTSUPP;
1131
1132         uaddr->sa_family = AF_PACKET;
1133         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1134         if (dev) {
1135                 strlcpy(uaddr->sa_data, dev->name, 15);
1136                 dev_put(dev);
1137         } else
1138                 memset(uaddr->sa_data, 0, 14);
1139         *uaddr_len = sizeof(*uaddr);
1140
1141         return 0;
1142 }
1143 #endif
1144
1145 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1146                           int *uaddr_len, int peer)
1147 {
1148         struct net_device *dev;
1149         struct sock *sk = sock->sk;
1150         struct packet_sock *po = pkt_sk(sk);
1151         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1152
1153         if (peer)
1154                 return -EOPNOTSUPP;
1155
1156         sll->sll_family = AF_PACKET;
1157         sll->sll_ifindex = po->ifindex;
1158         sll->sll_protocol = po->num;
1159         dev = dev_get_by_index(po->ifindex);
1160         if (dev) {
1161                 sll->sll_hatype = dev->type;
1162                 sll->sll_halen = dev->addr_len;
1163                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1164                 dev_put(dev);
1165         } else {
1166                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1167                 sll->sll_halen = 0;
1168         }
1169         *uaddr_len = sizeof(*sll);
1170
1171         return 0;
1172 }
1173
1174 #ifdef CONFIG_PACKET_MULTICAST
1175 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1176 {
1177         switch (i->type) {
1178         case PACKET_MR_MULTICAST:
1179                 if (what > 0)
1180                         dev_mc_add(dev, i->addr, i->alen, 0);
1181                 else
1182                         dev_mc_delete(dev, i->addr, i->alen, 0);
1183                 break;
1184         case PACKET_MR_PROMISC:
1185                 dev_set_promiscuity(dev, what);
1186                 break;
1187         case PACKET_MR_ALLMULTI:
1188                 dev_set_allmulti(dev, what);
1189                 break;
1190         default:;
1191         }
1192 }
1193
1194 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1195 {
1196         for ( ; i; i=i->next) {
1197                 if (i->ifindex == dev->ifindex)
1198                         packet_dev_mc(dev, i, what);
1199         }
1200 }
1201
1202 static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq)
1203 {
1204         struct packet_sock *po = pkt_sk(sk);
1205         struct packet_mclist *ml, *i;
1206         struct net_device *dev;
1207         int err;
1208
1209         rtnl_lock();
1210
1211         err = -ENODEV;
1212         dev = __dev_get_by_index(mreq->mr_ifindex);
1213         if (!dev)
1214                 goto done;
1215
1216         err = -EINVAL;
1217         if (mreq->mr_alen > dev->addr_len)
1218                 goto done;
1219
1220         err = -ENOBUFS;
1221         i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL);
1222         if (i == NULL)
1223                 goto done;
1224
1225         err = 0;
1226         for (ml = po->mclist; ml; ml = ml->next) {
1227                 if (ml->ifindex == mreq->mr_ifindex &&
1228                     ml->type == mreq->mr_type &&
1229                     ml->alen == mreq->mr_alen &&
1230                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1231                         ml->count++;
1232                         /* Free the new element ... */
1233                         kfree(i);
1234                         goto done;
1235                 }
1236         }
1237
1238         i->type = mreq->mr_type;
1239         i->ifindex = mreq->mr_ifindex;
1240         i->alen = mreq->mr_alen;
1241         memcpy(i->addr, mreq->mr_address, i->alen);
1242         i->count = 1;
1243         i->next = po->mclist;
1244         po->mclist = i;
1245         packet_dev_mc(dev, i, +1);
1246
1247 done:
1248         rtnl_unlock();
1249         return err;
1250 }
1251
1252 static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq)
1253 {
1254         struct packet_mclist *ml, **mlp;
1255
1256         rtnl_lock();
1257
1258         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1259                 if (ml->ifindex == mreq->mr_ifindex &&
1260                     ml->type == mreq->mr_type &&
1261                     ml->alen == mreq->mr_alen &&
1262                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1263                         if (--ml->count == 0) {
1264                                 struct net_device *dev;
1265                                 *mlp = ml->next;
1266                                 dev = dev_get_by_index(ml->ifindex);
1267                                 if (dev) {
1268                                         packet_dev_mc(dev, ml, -1);
1269                                         dev_put(dev);
1270                                 }
1271                                 kfree(ml);
1272                         }
1273                         rtnl_unlock();
1274                         return 0;
1275                 }
1276         }
1277         rtnl_unlock();
1278         return -EADDRNOTAVAIL;
1279 }
1280
1281 static void packet_flush_mclist(struct sock *sk)
1282 {
1283         struct packet_sock *po = pkt_sk(sk);
1284         struct packet_mclist *ml;
1285
1286         if (!po->mclist)
1287                 return;
1288
1289         rtnl_lock();
1290         while ((ml = po->mclist) != NULL) {
1291                 struct net_device *dev;
1292
1293                 po->mclist = ml->next;
1294                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1295                         packet_dev_mc(dev, ml, -1);
1296                         dev_put(dev);
1297                 }
1298                 kfree(ml);
1299         }
1300         rtnl_unlock();
1301 }
1302 #endif
1303
/*
 *	setsockopt() handler for SOL_PACKET: multicast membership,
 *	RX ring setup and the mmap copy threshold.  Returns 0 or a
 *	negative errno.
 */
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch(optname) {
#ifdef CONFIG_PACKET_MULTICAST
	case PACKET_ADD_MEMBERSHIP:	
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq mreq;
		if (optlen<sizeof(mreq))
			return -EINVAL;
		if (copy_from_user(&mreq,optval,sizeof(mreq)))
			return -EFAULT;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}
#endif
#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	{
		/* Configure (or tear down) the mmap'ed receive ring. */
		struct tpacket_req req;

		if (optlen<sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req,optval,sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen!=sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val,optval,sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
#endif
	default:
		return -ENOPROTOOPT;
	}
}
1358
/*
 *	getsockopt() handler for SOL_PACKET.  Currently only
 *	PACKET_STATISTICS is supported; reading it atomically snapshots
 *	and resets the counters.  Returns 0 or a negative errno.
 */
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len,optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;
		
	switch(optname) {
	case PACKET_STATISTICS:
	{
		struct tpacket_stats st;

		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		/* Snapshot and zero the counters under the receive-queue
		 * lock so the pair stays consistent with the fast path. */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		/* Reported tp_packets counts all packets seen, including
		 * the dropped ones. */
		st.tp_packets += st.tp_drops;

		if (copy_to_user(optval, &st, len))
			return -EFAULT;
		break;
	}
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
1400
1401
/*
 *	Netdevice notifier: keep every packet socket consistent when a
 *	device changes state.  On DOWN/UNREGISTER the protocol hook is
 *	unhooked and the owner told ENETDOWN; on UP a previously bound
 *	socket is re-hooked.
 */
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = (struct net_device*)data;

	read_lock(&packet_sklist_lock);
	sk_for_each(sk, node, &packet_sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
#ifdef CONFIG_PACKET_MULTICAST
			/* Memberships die with the device. */
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			// fallthrough
#endif
		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					/* Drop the hook's socket reference
					 * taken when it was registered. */
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					/* Device is gone for good: forget it. */
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&packet_sklist_lock);
	return NOTIFY_DONE;
}
1452
1453
/*
 *	ioctl() handler: queue-size queries and timestamping are handled
 *	here; classic INET ioctls are forwarded to the inet dgram ops and
 *	everything else falls through to the generic device ioctl.
 */
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch(cmd) {
		case SIOCOUTQ:
		{
			/* Bytes queued for transmit on this socket. */
			int amount = atomic_read(&sk->sk_wmem_alloc);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCINQ:
		{
			/* Size of the next packet to be read (0 if none). */
			struct sk_buff *skb;
			int amount = 0;

			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				amount = skb->len;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCGSTAMP:
			return sock_get_timestamp(sk, (struct timeval __user *)arg);
			
#ifdef CONFIG_INET
		case SIOCADDRT:
		case SIOCDELRT:
		case SIOCDARP:
		case SIOCGARP:
		case SIOCSARP:
		case SIOCGIFADDR:
		case SIOCSIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCSIFBRDADDR:
		case SIOCGIFNETMASK:
		case SIOCSIFNETMASK:
		case SIOCGIFDSTADDR:
		case SIOCSIFDSTADDR:
		case SIOCSIFFLAGS:
			return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

		default:
			return dev_ioctl(cmd, (void __user *)arg);
	}
	return 0;
}
1503
1504 #ifndef CONFIG_PACKET_MMAP
1505 #define packet_mmap sock_no_mmap
1506 #define packet_poll datagram_poll
1507 #else
1508
/*
 *	poll() for mmap'ed sockets: in addition to the normal datagram
 *	poll, report readability when the most recently filled ring frame
 *	has been handed to user space (tp_status != TP_STATUS_KERNEL).
 */
static unsigned int packet_poll(struct file * file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		/* po->head is the next frame to fill, so head-1 (with
		 * wraparound to frame_max) is the last one written. */
		unsigned last = po->head ? po->head-1 : po->frame_max;
		struct tpacket_hdr *h;

		h = (struct tpacket_hdr *)packet_lookup_frame(po, last);

		if (h->tp_status)
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}
1529
1530
1531 /* Dirty? Well, I still did not learn better way to account
1532  * for user mmaps.
1533  */
1534
1535 static void packet_mm_open(struct vm_area_struct *vma)
1536 {
1537         struct file *file = vma->vm_file;
1538         struct inode *inode = file->f_dentry->d_inode;
1539         struct socket * sock = SOCKET_I(inode);
1540         struct sock *sk = sock->sk;
1541         
1542         if (sk)
1543                 atomic_inc(&pkt_sk(sk)->mapped);
1544 }
1545
1546 static void packet_mm_close(struct vm_area_struct *vma)
1547 {
1548         struct file *file = vma->vm_file;
1549         struct inode *inode = file->f_dentry->d_inode;
1550         struct socket * sock = SOCKET_I(inode);
1551         struct sock *sk = sock->sk;
1552         
1553         if (sk)
1554                 atomic_dec(&pkt_sk(sk)->mapped);
1555 }
1556
/* VMA callbacks for the mmap'ed ring: track how many mappings exist. */
static struct vm_operations_struct packet_mmap_ops = {
	.open = packet_mm_open,
	.close =packet_mm_close,
};
1561
1562 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1563 {
1564         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1565 }
1566
1567 static void free_pg_vec(char **pg_vec, unsigned order, unsigned len)
1568 {
1569         int i;
1570
1571         for (i=0; i<len; i++) {
1572                 if (pg_vec[i]) {
1573                         struct page *page, *pend;
1574
1575                         pend = pg_vec_endpage(pg_vec[i], order);
1576                         for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
1577                                 ClearPageReserved(page);
1578                         free_pages((unsigned long)pg_vec[i], order);
1579                 }
1580         }
1581         kfree(pg_vec);
1582 }
1583
1584
/*
 *	Install (tp_block_nr != 0) or tear down (tp_block_nr == 0) the
 *	mmap'ed receive ring described by req.  The socket is temporarily
 *	detached from the network while the ring is swapped; the old ring
 *	(or the new one, on failure) is freed via the XC swap trick at the
 *	bottom.  Returns 0, -EBUSY if the ring is mapped or already set,
 *	-EINVAL for bad geometry, or -ENOMEM.
 */
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, num, order = 0;
	int err = 0;
	
	if (req->tp_block_nr) {
		int i, l;

		/* Sanity tests and some calculations */

		if (po->pg_vec)
			return -EBUSY;

		/* Blocks must be positive, page-aligned multiples of the
		 * frame size; frames must be aligned and hold a header. */
		if ((int)req->tp_block_size <= 0)
			return -EINVAL;
		if (req->tp_block_size&(PAGE_SIZE-1))
			return -EINVAL;
		if (req->tp_frame_size < TPACKET_HDRLEN)
			return -EINVAL;
		if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (po->frames_per_block <= 0)
			return -EINVAL;
		if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
			return -EINVAL;
		/* OK! */

		/* Allocate page vector */
		while ((PAGE_SIZE<<order) < req->tp_block_size)
			order++;

		err = -ENOMEM;

		pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL);
		if (pg_vec == NULL)
			goto out;
		memset(pg_vec, 0, req->tp_block_nr*sizeof(char **));

		for (i=0; i<req->tp_block_nr; i++) {
			struct page *page, *pend;
			pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order);
			if (!pg_vec[i])
				goto out_free_pgvec;

			/* Mark the pages reserved so they can be safely
			 * mapped into user space by packet_mmap(). */
			pend = pg_vec_endpage(pg_vec[i], order);
			for (page = virt_to_page(pg_vec[i]); page <= pend; page++)
				SetPageReserved(page);
		}
		/* Page vector is allocated */

		/* Hand every frame to the kernel side of the protocol. */
		l = 0;
		for (i=0; i<req->tp_block_nr; i++) {
			char *ptr = pg_vec[i];
			struct tpacket_hdr *header;
			int k;

			for (k=0; k<po->frames_per_block; k++) {
				
				header = (struct tpacket_hdr*)ptr;
				header->tp_status = TP_STATUS_KERNEL;
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		if (req->tp_frame_nr)
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);
		
	/* Wait until no CPU can still be running the old hook. */
	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
/* Swap a and b, yielding the old value of a. */
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = req->tp_frame_nr-1;
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		/* After these swaps pg_vec/order/tp_block_nr describe the
		 * OLD ring, which is freed at out_free_pgvec below. */
		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
	}

	/* Re-attach the socket if it was running before. */
	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

out_free_pgvec:
	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
1714
1715 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1716 {
1717         struct sock *sk = sock->sk;
1718         struct packet_sock *po = pkt_sk(sk);
1719         unsigned long size;
1720         unsigned long start;
1721         int err = -EINVAL;
1722         int i;
1723
1724         if (vma->vm_pgoff)
1725                 return -EINVAL;
1726
1727         size = vma->vm_end - vma->vm_start;
1728
1729         lock_sock(sk);
1730         if (po->pg_vec == NULL)
1731                 goto out;
1732         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1733                 goto out;
1734
1735         atomic_inc(&po->mapped);
1736         start = vma->vm_start;
1737         err = -EAGAIN;
1738         for (i=0; i<po->pg_vec_len; i++) {
1739                 if (remap_pfn_range(vma, start,
1740                                      __pa(po->pg_vec[i]) >> PAGE_SHIFT,
1741                                      po->pg_vec_pages*PAGE_SIZE,
1742                                      vma->vm_page_prot))
1743                         goto out;
1744                 start += po->pg_vec_pages*PAGE_SIZE;
1745         }
1746         vma->vm_ops = &packet_mmap_ops;
1747         err = 0;
1748
1749 out:
1750         release_sock(sk);
1751         return err;
1752 }
1753 #endif
1754
1755
1756 #ifdef CONFIG_SOCK_PACKET
/* Socket operations for legacy SOCK_PACKET sockets: no setsockopt/
 * getsockopt/mmap support, and the simpler spkt bind/getname/sendmsg. */
static struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
1777 #endif
1778
/* Socket operations for SOCK_RAW/SOCK_DGRAM packet sockets; poll and
 * mmap resolve to the ring-aware versions when CONFIG_PACKET_MMAP. */
static struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname, 
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};
1799
/* Registered with sock_register() so socket(PF_PACKET, ...) reaches
 * packet_create(). */
static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};
1805
/* Hooked into the netdevice notifier chain to track device up/down. */
static struct notifier_block packet_netdev_notifier = {
	.notifier_call =packet_notifier,
};
1809
1810 #ifdef CONFIG_PROC_FS
1811 static inline struct sock *packet_seq_idx(loff_t off)
1812 {
1813         struct sock *s;
1814         struct hlist_node *node;
1815
1816         sk_for_each(s, node, &packet_sklist) {
1817                 if (!off--)
1818                         return s;
1819         }
1820         return NULL;
1821 }
1822
1823 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1824 {
1825         read_lock(&packet_sklist_lock);
1826         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1827 }
1828
1829 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1830 {
1831         ++*pos;
1832         return  (v == SEQ_START_TOKEN) 
1833                 ? sk_head(&packet_sklist) 
1834                 : sk_next((struct sock*)v) ;
1835 }
1836
/* seq_file stop: release the lock taken in packet_seq_start(). */
static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);		
}
1841
/* seq_file show: emit the /proc/net/packet header line or one socket
 * entry (columns must stay in sync with the header string). */
static int packet_seq_show(struct seq_file *seq, void *v) 
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s) );
	}

	return 0;
}
1865
/* Iterator callbacks for /proc/net/packet. */
static struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
1872
/* open() for /proc/net/packet: hook up the seq_file iterator. */
static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &packet_seq_ops);
}
1877
/* File operations backing the /proc/net/packet entry. */
static struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1885
1886 #endif
1887
/* Module unload: undo packet_init() in reverse order. */
static void __exit packet_exit(void)
{
	proc_net_remove("packet");
	unregister_netdevice_notifier(&packet_netdev_notifier);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}
1895
/* Module init: register the proto first (everything else depends on
 * it), then the socket family, the netdevice notifier and /proc entry.
 * Returns 0 or proto_register()'s error. */
static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
	proc_net_fops_create("packet", 0, &packet_seq_fops);
out:
	return rc;
}
1909
1910 module_init(packet_init);
1911 module_exit(packet_exit);
1912 MODULE_LICENSE("GPL");
1913 MODULE_ALIAS_NETPROTO(PF_PACKET);