/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@redhat.com>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting the vif table, the mrt cache and the mroute socket
   state. Note that changes to these are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and is protected by the
   weak lock mrt_lock. The queue of unresolved entries is protected by
   the strong spinlock mfc_unres_lock.

   As a result the data path is entirely free of exclusive locks.
 */
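
/* A minimal sketch (not called anywhere) of the resulting pattern: the
 * data path only ever takes the read side,
 *
 *      read_lock(&mrt_lock);
 *      c = ipmr_cache_find(saddr, daddr);
 *      ...
 *      read_unlock(&mrt_lock);
 *
 * while configuration changes from process context take
 * write_lock_bh(&mrt_lock); only the unresolved queue needs the
 * mfc_unres_lock spinlock on both paths.
 */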

static kmem_cache_t *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name("tunl0");

        if (dev) {
                int err;
                struct ifreq ifr;
                mm_segment_t    oldfs;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (void*)&p;

                oldfs = get_fs(); set_fs(KERNEL_DS);
                err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                set_fs(oldfs);

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
                                goto failure;
                        in_dev->cnf.rp_filter = 0;

                        if (dev_open(dev))
                                goto failure;
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
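
/* For reference (an assumption about equivalent tooling, not code used
 * here): the SIOCADDTUNNEL call above builds roughly the same IPIP
 * tunnel a user could create with iproute2,
 *
 *      ip tunnel add dvmrp0 mode ipip local <lcl> remote <rmt>
 *
 * except that it is issued from kernel context under KERNEL_DS.
 */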

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
        return (struct net_device_stats*)netdev_priv(dev);
}

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->hard_start_xmit    = reg_vif_xmit;
        dev->get_stats          = reg_vif_get_stats;
        dev->destructor         = free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
                           reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        if ((in_dev = inetdev_init(dev)) == NULL)
                goto failure;

        in_dev->cnf.rp_filter = 0;

        if (dev_open(dev))
                goto failure;

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 */

static int vif_delete(int vifi)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= maxvif)
                return -EADDRNOTAVAIL;

        v = &vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == reg_vif_num)
                reg_vif_num = -1;
#endif

        if (vifi+1 == maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(tmp))
                                break;
                }
                maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                in_dev->cnf.mc_forwarding--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting the error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&cache_resolve_queue_len);

        while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (skb->nh.iph->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));
                        netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
                } else
                        kfree_skb(skb);
        }

        kmem_cache_free(mrt_cachep, c);
}


/* Single timer process for all the unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (atomic_read(&cache_resolve_queue_len) == 0)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (atomic_read(&cache_resolve_queue_len))
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill the oifs list. It is called under a write-locked mrt_lock. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi=0; vifi<maxvif; vifi++) {
                if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}
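
/* Worked example (illustrative): with ttls[1] = 3 and ttls[4] = 1 the
 * entry ends up with minvif = 1 and maxvif = 5, and ip_mr_forward()
 * later duplicates a packet to vif 1 only when its TTL exceeds 3 and
 * to vif 4 when it exceeds 1; untouched slots stay at 255 and never
 * match.
 */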

static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;

        /* Is vif busy ? */
        if (VIF_EXISTS(vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                break;
        case 0:
                dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                dev_put(dev);
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        in_dev->cnf.mc_forwarding++;
        dev_set_allmulti(dev, +1);
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit=vifc->vifc_rate_limit;
        v->local=vifc->vifc_lcl_addr.s_addr;
        v->remote=vifc->vifc_rmt_addr.s_addr;
        v->flags=vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold=vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        dev_hold(dev);
        v->dev=dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                reg_vif_num = vifi;
#endif
        if (vifi+1 > maxvif)
                maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
{
        int line=MFC_HASH(mcastgrp,origin);
        struct mfc_cache *c;

        for (c=mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
        if(c==NULL)
                return NULL;
        memset(c, 0, sizeof(*c));
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
        if(c==NULL)
                return NULL;
        memset(c, 0, sizeof(*c));
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone from the unresolved queue into a resolved state
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (skb->nh.iph->version == 0) {
                        int err;
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = skb->tail - (u8*)nlh;
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }
                        err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this,
 *      but mrouted expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */
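
/* What the daemon sees (an illustrative assumption about the userspace
 * side, not code used here): mrouted reads these upcalls from its IGMP
 * raw socket and interprets the leading bytes as a struct igmpmsg,
 *
 *      char buf[1500];
 *      ssize_t n = read(igmp_fd, buf, sizeof(buf));
 *      struct igmpmsg *m = (struct igmpmsg *)buf;
 *      if (n > 0 && m->im_mbz == 0 && m->im_msgtype == IGMPMSG_NOCACHE)
 *              install_mfc(m->im_src, m->im_dst, m->im_vif);
 *
 * where igmp_fd and install_mfc() are hypothetical daemon-side names.
 */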

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        int ihl = pkt->nh.iph->ihl<<2;
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if(!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate the old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
                skb->nh.raw = skb->h.raw = (u8*)msg;
                memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
                skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
        } else
#endif
        {
                /*
                 *      Copy the IP header
                 */

                skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
                memcpy(skb->data,pkt->data,ihl);
                skb->nh.iph->protocol = 0;      /* Flag to the kernel this is a route add */
                msg = (struct igmpmsg*)skb->nh.iph;
                msg->im_vif = vifi;
                skb->dst = dst_clone(pkt->dst);

                /*
                 *      Add our header
                 */

                igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
                igmp->type      =
                msg->im_msgtype = assert;
                igmp->code      =       0;
                skb->nh.iph->tot_len=htons(skb->len);   /* Fix the length */
                skb->h.raw = skb->nh.raw;
        }

        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution. The cache entry is created (or found)
 *      and manipulated under mfc_unres_lock.
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
                    c->mfc_origin == skb->nh.iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&cache_resolve_queue_len)>=10 ||
                    (c=ipmr_cache_alloc_unres())==NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent=-1;
                c->mfc_origin=skb->nh.iph->saddr;
                c->mfc_mcastgrp=skb->nh.iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        kmem_cache_free(mrt_cachep, c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c=ipmr_cache_alloc();
        if (c==NULL)
                return -ENOMEM;

        c->mfc_origin=mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent=mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = mfc_cache_array[line];
        mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        if (atomic_dec_and_test(&cache_resolve_queue_len))
                                del_timer(&ipmr_expire_timer);
                        break;
                }
        }
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                kmem_cache_free(mrt_cachep, uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for(i=0; i<maxvif; i++) {
                if (!(vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i);
        }

        /*
         *      Wipe the cache
         */
        for (i=0;i<MFC_LINES;i++) {
                struct mfc_cache *c, **cp;

                cp = &mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                }
        }

        if (atomic_read(&cache_resolve_queue_len) != 0) {
                struct mfc_cache *c;

                spin_lock_bh(&mfc_unres_lock);
                while (mfc_unres_queue != NULL) {
                        c = mfc_unres_queue;
                        mfc_unres_queue = c->next;
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_destroy_unres(c);

                        spin_lock_bh(&mfc_unres_lock);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == mroute_socket) {
                ipv4_devconf.mc_forwarding--;

                write_lock_bh(&mrt_lock);
                mroute_socket=NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

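/* Usage sketch (userspace side; illustrative, error handling omitted):
 * a routing daemon becomes the mrouter and installs a vif plus an (S,G)
 * entry roughly like this, assuming a previously added vif 1 as oif:
 *
 *      int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *      setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *      struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
 *      vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *      setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 *      struct mfcctl mc = { .mfcc_parent = 0 };
 *      mc.mfcc_origin.s_addr   = inet_addr("192.0.2.2");
 *      mc.mfcc_mcastgrp.s_addr = inet_addr("239.1.1.1");
 *      mc.mfcc_ttls[1] = 1;
 *      setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *
 * The addresses and vif numbers above are made up for the example.
 */
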
int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if(optname!=MRT_INIT)
        {
                if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch(optname)
        {
                case MRT_INIT:
                        if (sk->sk_type != SOCK_RAW ||
                            inet_sk(sk)->num != IPPROTO_IGMP)
                                return -EOPNOTSUPP;
                        if(optlen!=sizeof(int))
                                return -ENOPROTOOPT;

                        rtnl_lock();
                        if (mroute_socket) {
                                rtnl_unlock();
                                return -EADDRINUSE;
                        }

                        ret = ip_ra_control(sk, 1, mrtsock_destruct);
                        if (ret == 0) {
                                write_lock_bh(&mrt_lock);
                                mroute_socket=sk;
                                write_unlock_bh(&mrt_lock);

                                ipv4_devconf.mc_forwarding++;
                        }
                        rtnl_unlock();
                        return ret;
                case MRT_DONE:
                        if (sk!=mroute_socket)
                                return -EACCES;
                        return ip_ra_control(sk, 0, NULL);
                case MRT_ADD_VIF:
                case MRT_DEL_VIF:
                        if(optlen!=sizeof(vif))
                                return -EINVAL;
                        if (copy_from_user(&vif,optval,sizeof(vif)))
                                return -EFAULT;
                        if(vif.vifc_vifi >= MAXVIFS)
                                return -ENFILE;
                        rtnl_lock();
                        if (optname==MRT_ADD_VIF) {
                                ret = vif_add(&vif, sk==mroute_socket);
                        } else {
                                ret = vif_delete(vif.vifc_vifi);
                        }
                        rtnl_unlock();
                        return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
                case MRT_ADD_MFC:
                case MRT_DEL_MFC:
                        if(optlen!=sizeof(mfc))
                                return -EINVAL;
                        if (copy_from_user(&mfc,optval, sizeof(mfc)))
                                return -EFAULT;
                        rtnl_lock();
                        if (optname==MRT_DEL_MFC)
                                ret = ipmr_mfc_delete(&mfc);
                        else
                                ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
                        rtnl_unlock();
                        return ret;
                /*
                 *      Control PIM assert.
                 */
                case MRT_ASSERT:
                {
                        int v;
                        if(get_user(v,(int __user *)optval))
                                return -EFAULT;
                        mroute_do_assert=(v)?1:0;
                        return 0;
                }
#ifdef CONFIG_IP_PIMSM
                case MRT_PIM:
                {
                        int v, ret;
                        if(get_user(v,(int __user *)optval))
                                return -EFAULT;
                        v = (v)?1:0;
                        rtnl_lock();
                        ret = 0;
                        if (v != mroute_do_pim) {
                                mroute_do_pim = v;
                                mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                                if (mroute_do_pim)
                                        ret = inet_add_protocol(&pim_protocol,
                                                                IPPROTO_PIM);
                                else
                                        ret = inet_del_protocol(&pim_protocol,
                                                                IPPROTO_PIM);
                                if (ret < 0)
                                        ret = -EAGAIN;
#endif
                        }
                        rtnl_unlock();
                        return ret;
                }
#endif
                /*
                 *      Spurious command, or MRT_VERSION which you cannot
                 *      set.
                 */
                default:
                        return -ENOPROTOOPT;
        }
}

/*
 *      Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
        int olr;
        int val;

        if(optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        /* Reject negative lengths before the unsigned clamp hides them. */
        if (olr < 0)
                return -EINVAL;

        olr = min_t(unsigned int, olr, sizeof(int));

        if(put_user(olr,optlen))
                return -EFAULT;
        if(optname==MRT_VERSION)
                val=0x0305;
#ifdef CONFIG_IP_PIMSM
        else if(optname==MRT_PIM)
                val=mroute_do_pim;
#endif
        else
                val=mroute_do_assert;
        if(copy_to_user(optval,&val,olr))
                return -EFAULT;
        return 0;
}
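
/* Illustrative query from userspace (same hypothetical socket s as in
 * the sketch above):
 *
 *      int ver;
 *      socklen_t len = sizeof(ver);
 *      getsockopt(s, IPPROTO_IP, MRT_VERSION, &ver, &len);
 *
 * after which ver is 0x0305 for this implementation.
 */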

/*
 *      The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch(cmd)
        {
                case SIOCGETVIFCNT:
                        if (copy_from_user(&vr,arg,sizeof(vr)))
                                return -EFAULT;
                        if(vr.vifi>=maxvif)
                                return -EINVAL;
                        read_lock(&mrt_lock);
                        vif=&vif_table[vr.vifi];
                        if(VIF_EXISTS(vr.vifi)) {
                                vr.icount=vif->pkt_in;
                                vr.ocount=vif->pkt_out;
                                vr.ibytes=vif->bytes_in;
                                vr.obytes=vif->bytes_out;
                                read_unlock(&mrt_lock);

                                if (copy_to_user(arg,&vr,sizeof(vr)))
                                        return -EFAULT;
                                return 0;
                        }
                        read_unlock(&mrt_lock);
                        return -EADDRNOTAVAIL;
                case SIOCGETSGCNT:
                        if (copy_from_user(&sr,arg,sizeof(sr)))
                                return -EFAULT;

                        read_lock(&mrt_lock);
                        c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                        if (c) {
                                sr.pktcnt = c->mfc_un.res.pkt;
                                sr.bytecnt = c->mfc_un.res.bytes;
                                sr.wrong_if = c->mfc_un.res.wrong_if;
                                read_unlock(&mrt_lock);

                                if (copy_to_user(arg,&sr,sizeof(sr)))
                                        return -EFAULT;
                                return 0;
                        }
                        read_unlock(&mrt_lock);
                        return -EADDRNOTAVAIL;
                default:
                        return -ENOIOCTLCMD;
        }
}
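
/* Illustrative counter fetch from userspace (hypothetical socket s as
 * in the earlier sketches):
 *
 *      struct sioc_vif_req vr = { .vifi = 0 };
 *      if (ioctl(s, SIOCGETVIFCNT, &vr) == 0)
 *              printf("vif0: %lu pkts in\n", vr.icount);
 */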


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct vif_device *v;
        int ct;
        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v=&vif_table[0];
        for(ct=0;ct<maxvif;ct++,v++) {
                if (v->dev==ptr)
                        vif_delete(ct);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
{
        struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));

        iph->version    =       4;
        iph->tos        =       skb->nh.iph->tos;
        iph->ttl        =       skb->nh.iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        skb->h.ipiph = skb->nh.iph;
        skb->nh.iph = iph;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}
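
/* After ip_encap() the frame carries a plain IPIP encapsulation:
 *
 *      | outer iphdr (IPPROTO_IPIP) | original iphdr | payload |
 *
 * with the outer TTL and TOS copied from the inner header.
 */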

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        struct iphdr *iph = skb->nh.iph;
        struct vif_device *vif = &vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out+=skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow sending ICMP errors here, so oversized packets
                   simply disappear into a black hole.
                 */

                IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out+=skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        iph = skb->nh.iph;
        ip_decrease_ttl(iph);

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
         * locally not only before forwarding, but also after forwarding on
         * all output interfaces. Clearly, if an mrouter runs a multicast
         * program, that program should receive packets regardless of the
         * interface it joined on; otherwise it would have to join on all
         * interfaces. On the other hand, a multihomed host (or a router,
         * but not an mrouter) cannot join on more than one interface
         * without receiving duplicate packets.
         */
        NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct=maxvif-1; ct>=0; ct--) {
                if (vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (((struct rtable*)skb->dst)->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           A very complicated situation...

                           The best workaround until the routing daemons are
                           fixed is not to redistribute a packet if it was
                           sent through the wrong interface. This means that
                           multicast applications WILL NOT work for (S,G)
                           entries whose default multicast route points to a
                           wrong oif. In any case, it is not a good idea to
                           run multicast applications on a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && mroute_do_assert &&
                    /* pimsm uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an oif.
                       This is bad, but the alternative would be moving a
                       pretty large chunk of pimd into the kernel. Ough... --ANK
                     */
                    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        vif_table[vif].pkt_in++;
        vif_table[vif].bytes_in+=skb->len;

        /*
         *      Forward the frame
         */
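        /* Note: psend trails the loop by one match, so every transmit but
         * the last uses a clone and the final one can consume the original
         * skb (unless a local copy must be preserved).
         */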
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend=ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

        /* A packet looped back after forwarding must not be forwarded a
           second time, but it can still be delivered locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                    if (IPCB(skb)->opt.router_alert) {
                            if (ip_call_ra_chain(skb))
                                    return 0;
                    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
                            /* IGMPv1 (and broken IGMPv2 implementations such as
                               Cisco IOS <= 11.2(8)) do not put the router alert
                               option into IGMP packets destined to routable
                               groups. This is very bad, because it means
                               that we can forward NO IGMP messages.
                             */
                            read_lock(&mrt_lock);
                            if (mroute_socket) {
                                    nf_reset(skb);
                                    raw_rcv(mroute_socket, skb);
                                    read_unlock(&mrt_lock);
                                    return 0;
                            }
                            read_unlock(&mrt_lock);
                    }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);

        /*
         *      No usable cache entry
         */
        if (cache==NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct igmphdr*)skb->h.raw;

        if (!mroute_do_pim ||
            skb->len < sizeof(*pim) + sizeof(*encap) ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
        /*
           Check that:
           a. the packet is really destined to a multicast group
           b. the packet is not a NULL-REGISTER
           c. the packet is not truncated
         */
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac.raw = skb->nh.raw;
        skb_pull(skb, (u8*)encap - skb->data);
        skb->nh.iph = (struct iphdr *)skb->data;
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = CHECKSUM_NONE;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        skb->dst = NULL;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff * skb)
{
        struct pimreghdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct pimreghdr*)skb->h.raw;
        if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
            (pim->flags&PIM_NULL_REGISTER) ||
            (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
             (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))))
                goto drop;

        /* check if the inner packet is destined to a mcast group */
        encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac.raw = skb->nh.raw;
        skb_pull(skb, (u8*)encap - skb->data);
        skb->nh.iph = (struct iphdr *)skb->data;
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = CHECKSUM_NONE;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
        skb->dst = NULL;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif
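
/* For reference, the register packet handled above looks like
 *
 *      | outer iphdr | pimreghdr | inner iphdr (multicast) | payload |
 *
 * and, per Pavlin Radoslavov's note in the header comment, only the
 * PIM header has to pass the checksum test, for interoperability with
 * older peers.
 */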

static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
        int ct;
        struct rtnexthop *nhp;
        struct net_device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb->tail;
        struct rtattr *mp_head;

        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

        mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
                if (c->mfc_un.res.ttls[ct] < 255) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = 0;
                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
                }
        }
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb->tail - (u8*)mp_head;
        rtm->rtm_type = RTN_MULTICAST;
        return 1;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -EMSGSIZE;
}
1570
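/*
 * Report the multicast route for an rtnetlink RTM_GETROUTE request;
 * called from the IPv4 routing code when the destination is multicast.
 * On a cache miss (and if the caller may block) a copy of the packet
 * is queued as an unresolved entry so the routing daemon is asked to
 * resolve it; the answer is delivered once the entry is complete.
 */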
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
        int err;
        struct mfc_cache *cache;
        struct rtable *rt = (struct rtable*)skb->dst;

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

        if (cache == NULL) {
                struct sk_buff *skb2;
                struct net_device *dev;
                int vif;

                if (nowait) {
                        read_unlock(&mrt_lock);
                        return -EAGAIN;
                }

                dev = skb->dev;
                if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
                        read_unlock(&mrt_lock);
                        return -ENODEV;
                }
                skb2 = skb_clone(skb, GFP_ATOMIC);
                if (!skb2) {
                        read_unlock(&mrt_lock);
                        return -ENOMEM;
                }

                /* Fake a minimal IP header on the clone; version 0 marks
                   it as a netlink request rather than a real packet when
                   the cache entry is eventually resolved. */
                skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr));
                skb2->nh.iph->ihl = sizeof(struct iphdr)>>2;
                skb2->nh.iph->saddr = rt->rt_src;
                skb2->nh.iph->daddr = rt->rt_dst;
                skb2->nh.iph->version = 0;
                err = ipmr_cache_unresolved(vif, skb2);
                read_unlock(&mrt_lock);
                return err;
        }

        if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
                cache->mfc_flags |= MFC_NOTIFY;
        err = ipmr_fill_mroute(skb, cache, rtm);
        read_unlock(&mrt_lock);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 *      The /proc interfaces to multicast routing:
 *      /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
        int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
                                           loff_t pos)
{
        for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                if (pos-- == 0)
                        return &vif_table[iter->ct];
        }
        return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
        read_lock(&mrt_lock);
        return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ipmr_vif_iter *iter = seq->private;

        ++*pos;
        if (v == SEQ_START_TOKEN)
                return ipmr_vif_seq_idx(iter, 0);

        while (++iter->ct < maxvif) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                return &vif_table[iter->ct];
        }
        return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
        } else {
                const struct vif_device *vif = v;
                const char *name = vif->dev ? vif->dev->name : "none";

                seq_printf(seq,
                           "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
                           vif - vif_table,
                           name, vif->bytes_in, vif->pkt_in,
                           vif->bytes_out, vif->pkt_out,
                           vif->flags, vif->local, vif->remote);
        }
        return 0;
}
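
/*
 * Illustrative /proc/net/ip_mr_vif output (the values are made up;
 * addresses are printed as raw hex words):
 *
 * Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *  0 eth0           1520      10      1520      10 00000 0100000A 00000000
 */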

static struct seq_operations ipmr_vif_seq_ops = {
        .start = ipmr_vif_seq_start,
        .next  = ipmr_vif_seq_next,
        .stop  = ipmr_vif_seq_stop,
        .show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_vif_seq_ops);
        if (rc)
                goto out_kfree;

        s->ct = 0;
        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations ipmr_vif_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_vif_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};

struct ipmr_mfc_iter {
        struct mfc_cache **cache;
        int ct;
};

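/*
 * The MFC dump walks the resolved entries in mfc_cache_array under
 * mrt_lock, then the unresolved queue under mfc_unres_lock.  Whichever
 * lock is held when iteration pauses is recorded via it->cache and
 * released in ipmr_mfc_seq_stop().
 */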
static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
        struct mfc_cache *mfc;

        it->cache = mfc_cache_array;
        read_lock(&mrt_lock);
        for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
                for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
                        if (pos-- == 0)
                                return mfc;     /* leaves mrt_lock held */
        read_unlock(&mrt_lock);

        it->cache = &mfc_unres_queue;
        spin_lock_bh(&mfc_unres_lock);
        for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
                if (pos-- == 0)
                        return mfc;     /* leaves mfc_unres_lock held */
        spin_unlock_bh(&mfc_unres_lock);

        it->cache = NULL;
        return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct ipmr_mfc_iter *it = seq->private;
        it->cache = NULL;
        it->ct = 0;
        return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct mfc_cache *mfc = v;
        struct ipmr_mfc_iter *it = seq->private;

        ++*pos;

        if (v == SEQ_START_TOKEN)
                return ipmr_mfc_seq_idx(seq->private, 0);

        if (mfc->next)
                return mfc->next;

        if (it->cache == &mfc_unres_queue)
                goto end_of_list;

        BUG_ON(it->cache != mfc_cache_array);

        while (++it->ct < MFC_LINES) {
                mfc = mfc_cache_array[it->ct];
                if (mfc)
                        return mfc;
        }

        /* Exhausted the cache array: hand over from mrt_lock to
           mfc_unres_lock and continue with the unresolved queue. */
        read_unlock(&mrt_lock);
        it->cache = &mfc_unres_queue;
        it->ct = 0;

        spin_lock_bh(&mfc_unres_lock);
        mfc = mfc_unres_queue;
        if (mfc)
                return mfc;

 end_of_list:
        spin_unlock_bh(&mfc_unres_lock);
        it->cache = NULL;

        return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
        struct ipmr_mfc_iter *it = seq->private;

        /* Release whichever lock the iteration stopped under. */
        if (it->cache == &mfc_unres_queue)
                spin_unlock_bh(&mfc_unres_lock);
        else if (it->cache == mfc_cache_array)
                read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
        int n;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
        } else {
                const struct mfc_cache *mfc = v;
                const struct ipmr_mfc_iter *it = seq->private;

                seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
                           (unsigned long) mfc->mfc_mcastgrp,
                           (unsigned long) mfc->mfc_origin,
                           mfc->mfc_parent,
                           mfc->mfc_un.res.pkt,
                           mfc->mfc_un.res.bytes,
                           mfc->mfc_un.res.wrong_if);

                /* Unresolved entries have no resolved oif list yet. */
                if (it->cache != &mfc_unres_queue) {
                        for (n = mfc->mfc_un.res.minvif;
                             n < mfc->mfc_un.res.maxvif; n++) {
                                if (VIF_EXISTS(n)
                                    && mfc->mfc_un.res.ttls[n] < 255)
                                        seq_printf(seq,
                                                   " %2d:%-3d",
                                                   n, mfc->mfc_un.res.ttls[n]);
                        }
                }
                seq_putc(seq, '\n');
        }
        return 0;
}
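
/*
 * Illustrative /proc/net/ip_mr_cache line (the values are made up;
 * group and origin are printed as raw hex words, oifs as vif:ttl):
 *
 * Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 * 010000E1 0100000A 0        123    98400        0  1:1   2:1
 */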

static struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = ipmr_mfc_seq_next,
        .stop  = ipmr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_mfc_seq_ops);
        if (rc)
                goto out_kfree;

        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations ipmr_mfc_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_mfc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
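/*
 * Input hook for IPPROTO_PIM; registered and unregistered on demand
 * (via inet_add_protocol/inet_del_protocol) when the routing daemon
 * toggles the MRT_PIM socket option.
 */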
static struct net_protocol pim_protocol = {
        .handler        =       pim_rcv,
};
#endif


/*
 *      Setup for IP multicast routing
 */

void __init ip_mr_init(void)
{
        mrt_cachep = kmem_cache_create("ip_mrt_cache",
                                       sizeof(struct mfc_cache),
                                       0, SLAB_HWCACHE_ALIGN,
                                       NULL, NULL);
        if (!mrt_cachep)
                panic("cannot allocate ip_mrt_cache");

        /* Timer that expires entries stuck on the unresolved queue. */
        init_timer(&ipmr_expire_timer);
        ipmr_expire_timer.function = ipmr_expire_process;
        /* Clean up vifs when their underlying device goes away. */
        register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
        proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
        proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}