1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <linux/config.h>
32 #include <asm/system.h>
33 #include <asm/uaccess.h>
34 #include <linux/types.h>
35 #include <linux/sched.h>
36 #include <linux/errno.h>
37 #include <linux/timer.h>
38 #include <linux/mm.h>
39 #include <linux/kernel.h>
40 #include <linux/fcntl.h>
41 #include <linux/stat.h>
42 #include <linux/socket.h>
43 #include <linux/in.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/inetdevice.h>
47 #include <linux/igmp.h>
48 #include <linux/proc_fs.h>
49 #include <linux/seq_file.h>
50 #include <linux/mroute.h>
51 #include <linux/init.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64
65 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
66 #define CONFIG_IP_PIMSM 1
67 #endif
68
69 static struct sock *mroute_socket;
70
71
72 /* Big lock protecting the vif table, mrt cache and mroute socket state.
73    Note that changes are serialized via rtnl_lock.
74  */
75
76 static DEFINE_RWLOCK(mrt_lock);
77
78 /*
79  *      Multicast router control variables
80  */
81
82 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
83 static int maxvif;
84
85 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
86
87 static int mroute_do_assert;                            /* Set in PIM assert    */
88 static int mroute_do_pim;
89
90 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
91
92 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
93 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
94
95 /* Special spinlock for queue of unresolved entries */
96 static DEFINE_SPINLOCK(mfc_unres_lock);
97
98 /* We revert to Alan's original scheme. The hash table of resolved
99    entries is changed only in process context and protected
100    by the weak lock mrt_lock. The queue of unresolved entries is
101    protected by the strong spinlock mfc_unres_lock.
102
103    This way the data path is entirely free of exclusive locks.
104  */
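/* Editor's note: a minimal sketch (not in the original file) of the
 * lock discipline described above. It assumes ipmr_cache_find(),
 * defined later in this file, hence the #if 0 guard. Data-path
 * readers take mrt_lock shared; control-path writers take it
 * exclusively with BHs disabled (write_lock_bh).
 */
#if 0
static unsigned long example_pkt_count(__u32 origin, __u32 mcastgrp)
{
	struct mfc_cache *c;
	unsigned long pkts = 0;

	read_lock(&mrt_lock);			/* cheap shared lock on the read side */
	c = ipmr_cache_find(origin, mcastgrp);	/* hash lookup, defined below */
	if (c)
		pkts = c->mfc_un.res.pkt;	/* copy out while the entry is pinned */
	read_unlock(&mrt_lock);
	return pkts;
}
#endif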
105
106 static kmem_cache_t *mrt_cachep;
107
108 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
109 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
110 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
111
112 #ifdef CONFIG_IP_PIMSM_V2
113 static struct net_protocol pim_protocol;
114 #endif
115
116 static struct timer_list ipmr_expire_timer;
117
118 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
119
120 static
121 struct net_device *ipmr_new_tunnel(struct vifctl *v)
122 {
123         struct net_device  *dev;
124
125         dev = __dev_get_by_name("tunl0");
126
127         if (dev) {
128                 int err;
129                 struct ifreq ifr;
130                 mm_segment_t    oldfs;
131                 struct ip_tunnel_parm p;
132                 struct in_device  *in_dev;
133
134                 memset(&p, 0, sizeof(p));
135                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
136                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
137                 p.iph.version = 4;
138                 p.iph.ihl = 5;
139                 p.iph.protocol = IPPROTO_IPIP;
140                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
141                 ifr.ifr_ifru.ifru_data = (void*)&p;
142
143                 oldfs = get_fs(); set_fs(KERNEL_DS);
144                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
145                 set_fs(oldfs);
146
147                 dev = NULL;
148
149                 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
150                         dev->flags |= IFF_MULTICAST;
151
152                         in_dev = __in_dev_get(dev);
153                         if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
154                                 goto failure;
155                         in_dev->cnf.rp_filter = 0;
156
157                         if (dev_open(dev))
158                                 goto failure;
159                 }
160         }
161         return dev;
162
163 failure:
164         /* allow the register to be completed before unregistering. */
165         rtnl_unlock();
166         rtnl_lock();
167
168         unregister_netdevice(dev);
169         return NULL;
170 }
171
172 #ifdef CONFIG_IP_PIMSM
173
174 static int reg_vif_num = -1;
175
176 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
177 {
178         read_lock(&mrt_lock);
179         ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
180         ((struct net_device_stats*)dev->priv)->tx_packets++;
181         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
182         read_unlock(&mrt_lock);
183         kfree_skb(skb);
184         return 0;
185 }
186
187 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
188 {
189         return (struct net_device_stats*)dev->priv;
190 }
191
192 static void reg_vif_setup(struct net_device *dev)
193 {
194         dev->type               = ARPHRD_PIMREG;
195         dev->mtu                = 1500 - sizeof(struct iphdr) - 8;
196         dev->flags              = IFF_NOARP;
197         dev->hard_start_xmit    = reg_vif_xmit;
198         dev->get_stats          = reg_vif_get_stats;
199         dev->destructor         = free_netdev;
200 }
201
202 static struct net_device *ipmr_reg_vif(void)
203 {
204         struct net_device *dev;
205         struct in_device *in_dev;
206
207         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
208                            reg_vif_setup);
209
210         if (dev == NULL)
211                 return NULL;
212
213         if (register_netdevice(dev)) {
214                 free_netdev(dev);
215                 return NULL;
216         }
217         dev->iflink = 0;
218
219         if ((in_dev = inetdev_init(dev)) == NULL)
220                 goto failure;
221
222         in_dev->cnf.rp_filter = 0;
223
224         if (dev_open(dev))
225                 goto failure;
226
227         return dev;
228
229 failure:
230         /* allow the register to be completed before unregistering. */
231         rtnl_unlock();
232         rtnl_lock();
233
234         unregister_netdevice(dev);
235         return NULL;
236 }
237 #endif
238
239 /*
240  *      Delete a VIF entry
241  */
242  
243 static int vif_delete(int vifi)
244 {
245         struct vif_device *v;
246         struct net_device *dev;
247         struct in_device *in_dev;
248
249         if (vifi < 0 || vifi >= maxvif)
250                 return -EADDRNOTAVAIL;
251
252         v = &vif_table[vifi];
253
254         write_lock_bh(&mrt_lock);
255         dev = v->dev;
256         v->dev = NULL;
257
258         if (!dev) {
259                 write_unlock_bh(&mrt_lock);
260                 return -EADDRNOTAVAIL;
261         }
262
263 #ifdef CONFIG_IP_PIMSM
264         if (vifi == reg_vif_num)
265                 reg_vif_num = -1;
266 #endif
267
268         if (vifi+1 == maxvif) {
269                 int tmp;
270                 for (tmp=vifi-1; tmp>=0; tmp--) {
271                         if (VIF_EXISTS(tmp))
272                                 break;
273                 }
274                 maxvif = tmp+1;
275         }
276
277         write_unlock_bh(&mrt_lock);
278
279         dev_set_allmulti(dev, -1);
280
281         if ((in_dev = __in_dev_get(dev)) != NULL) {
282                 in_dev->cnf.mc_forwarding--;
283                 ip_rt_multicast_event(in_dev);
284         }
285
286         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
287                 unregister_netdevice(dev);
288
289         dev_put(dev);
290         return 0;
291 }
292
293 /* Destroy an unresolved cache entry, killing queued skbs
294    and reporting error to netlink readers.
295  */
296
297 static void ipmr_destroy_unres(struct mfc_cache *c)
298 {
299         struct sk_buff *skb;
300
301         atomic_dec(&cache_resolve_queue_len);
302
303         while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
304                 if (skb->nh.iph->version == 0) {
305                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
306                         nlh->nlmsg_type = NLMSG_ERROR;
307                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308                         skb_trim(skb, nlh->nlmsg_len);
309                         ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
310                         netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311                 } else
312                         kfree_skb(skb);
313         }
314
315         kmem_cache_free(mrt_cachep, c);
316 }
317
318
319 /* A single timer handler services the whole unresolved queue. */
320
321 static void ipmr_expire_process(unsigned long dummy)
322 {
323         unsigned long now;
324         unsigned long expires;
325         struct mfc_cache *c, **cp;
326
327         if (!spin_trylock(&mfc_unres_lock)) {
328                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
329                 return;
330         }
331
332         if (atomic_read(&cache_resolve_queue_len) == 0)
333                 goto out;
334
335         now = jiffies;
336         expires = 10*HZ;
337         cp = &mfc_unres_queue;
338
339         while ((c=*cp) != NULL) {
340                 if (time_after(c->mfc_un.unres.expires, now)) {
341                         unsigned long interval = c->mfc_un.unres.expires - now;
342                         if (interval < expires)
343                                 expires = interval;
344                         cp = &c->next;
345                         continue;
346                 }
347
348                 *cp = c->next;
349
350                 ipmr_destroy_unres(c);
351         }
352
353         if (atomic_read(&cache_resolve_queue_len))
354                 mod_timer(&ipmr_expire_timer, jiffies + expires);
355
356 out:
357         spin_unlock(&mfc_unres_lock);
358 }
359
360 /* Fill the oifs list. Called with mrt_lock held for writing. */
361
362 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
363 {
364         int vifi;
365
366         cache->mfc_un.res.minvif = MAXVIFS;
367         cache->mfc_un.res.maxvif = 0;
368         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
369
370         for (vifi=0; vifi<maxvif; vifi++) {
371                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
372                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
373                         if (cache->mfc_un.res.minvif > vifi)
374                                 cache->mfc_un.res.minvif = vifi;
375                         if (cache->mfc_un.res.maxvif <= vifi)
376                                 cache->mfc_un.res.maxvif = vifi + 1;
377                 }
378         }
379 }
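/* Editor's note: the ttls[] array doubles as the set of output
 * interfaces. A value of 0 or 255 means "do not forward on this vif";
 * any value in 1..254 is a TTL threshold, and ip_mr_forward() only
 * transmits on a vif when the packet's TTL is strictly greater than
 * that threshold. For example, ttls[2] == 1 forwards almost all
 * traffic on vif 2, while ttls[2] == 64 forwards only packets that
 * arrive with TTL > 64.
 */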
380
381 static int vif_add(struct vifctl *vifc, int mrtsock)
382 {
383         int vifi = vifc->vifc_vifi;
384         struct vif_device *v = &vif_table[vifi];
385         struct net_device *dev;
386         struct in_device *in_dev;
387
388         /* Is vif busy ? */
389         if (VIF_EXISTS(vifi))
390                 return -EADDRINUSE;
391
392         switch (vifc->vifc_flags) {
393 #ifdef CONFIG_IP_PIMSM
394         case VIFF_REGISTER:
395                 /*
396                  * Special Purpose VIF in PIM
397                  * All the packets will be sent to the daemon
398                  */
399                 if (reg_vif_num >= 0)
400                         return -EADDRINUSE;
401                 dev = ipmr_reg_vif();
402                 if (!dev)
403                         return -ENOBUFS;
404                 break;
405 #endif
406         case VIFF_TUNNEL:       
407                 dev = ipmr_new_tunnel(vifc);
408                 if (!dev)
409                         return -ENOBUFS;
410                 break;
411         case 0:
412                 dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
413                 if (!dev)
414                         return -EADDRNOTAVAIL;
415                 __dev_put(dev);
416                 break;
417         default:
418                 return -EINVAL;
419         }
420
421         if ((in_dev = __in_dev_get(dev)) == NULL)
422                 return -EADDRNOTAVAIL;
423         in_dev->cnf.mc_forwarding++;
424         dev_set_allmulti(dev, +1);
425         ip_rt_multicast_event(in_dev);
426
427         /*
428          *      Fill in the VIF structures
429          */
430         v->rate_limit=vifc->vifc_rate_limit;
431         v->local=vifc->vifc_lcl_addr.s_addr;
432         v->remote=vifc->vifc_rmt_addr.s_addr;
433         v->flags=vifc->vifc_flags;
434         if (!mrtsock)
435                 v->flags |= VIFF_STATIC;
436         v->threshold=vifc->vifc_threshold;
437         v->bytes_in = 0;
438         v->bytes_out = 0;
439         v->pkt_in = 0;
440         v->pkt_out = 0;
441         v->link = dev->ifindex;
442         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
443                 v->link = dev->iflink;
444
445         /* And finish update writing critical data */
446         write_lock_bh(&mrt_lock);
447         dev_hold(dev);
448         v->dev=dev;
449 #ifdef CONFIG_IP_PIMSM
450         if (v->flags&VIFF_REGISTER)
451                 reg_vif_num = vifi;
452 #endif
453         if (vifi+1 > maxvif)
454                 maxvif = vifi+1;
455         write_unlock_bh(&mrt_lock);
456         return 0;
457 }
458
459 static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
460 {
461         int line=MFC_HASH(mcastgrp,origin);
462         struct mfc_cache *c;
463
464         for (c=mfc_cache_array[line]; c; c = c->next) {
465                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
466                         break;
467         }
468         return c;
469 }
470
471 /*
472  *      Allocate a multicast cache entry
473  */
474 static struct mfc_cache *ipmr_cache_alloc(void)
475 {
476         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
477         if(c==NULL)
478                 return NULL;
479         memset(c, 0, sizeof(*c));
480         c->mfc_un.res.minvif = MAXVIFS;
481         return c;
482 }
483
484 static struct mfc_cache *ipmr_cache_alloc_unres(void)
485 {
486         struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
487         if(c==NULL)
488                 return NULL;
489         memset(c, 0, sizeof(*c));
490         skb_queue_head_init(&c->mfc_un.unres.unresolved);
491         c->mfc_un.unres.expires = jiffies + 10*HZ;
492         return c;
493 }
494
495 /*
496  *      A cache entry has gone into a resolved state from queued
497  */
498  
499 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500 {
501         struct sk_buff *skb;
502
503         /*
504          *      Play the pending entries through our router
505          */
506
507         while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
508                 if (skb->nh.iph->version == 0) {
509                         int err;
510                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
511
512                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
513                                 nlh->nlmsg_len = skb->tail - (u8*)nlh;
514                         } else {
515                                 nlh->nlmsg_type = NLMSG_ERROR;
516                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517                                 skb_trim(skb, nlh->nlmsg_len);
518                                 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
519                         }
520                         err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521                 } else
522                         ip_mr_forward(skb, c, 0);
523         }
524 }
525
526 /*
527  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
528  *      expects the following bizarre scheme.
529  *
530  *      Called under mrt_lock.
531  */
532  
533 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
534 {
535         struct sk_buff *skb;
536         int ihl = pkt->nh.iph->ihl<<2;
537         struct igmphdr *igmp;
538         struct igmpmsg *msg;
539         int ret;
540
541 #ifdef CONFIG_IP_PIMSM
542         if (assert == IGMPMSG_WHOLEPKT)
543                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
544         else
545 #endif
546                 skb = alloc_skb(128, GFP_ATOMIC);
547
548         if(!skb)
549                 return -ENOBUFS;
550
551 #ifdef CONFIG_IP_PIMSM
552         if (assert == IGMPMSG_WHOLEPKT) {
553                 /* Ugly, but we have no choice with this interface.
554                    Duplicate old header, fix ihl, length etc.
555                    And all this only to mangle msg->im_msgtype and
556                    to set msg->im_mbz to "mbz" :-)
557                  */
558                 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
559                 skb->nh.raw = skb->h.raw = (u8*)msg;
560                 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
561                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
562                 msg->im_mbz = 0;
563                 msg->im_vif = reg_vif_num;
564                 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
565                 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
566         } else 
567 #endif
568         {       
569                 
570         /*
571          *      Copy the IP header
572          */
573
574         skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
575         memcpy(skb->data,pkt->data,ihl);
576         skb->nh.iph->protocol = 0;                      /* Flag to the kernel this is a route add */
577         msg = (struct igmpmsg*)skb->nh.iph;
578         msg->im_vif = vifi;
579         skb->dst = dst_clone(pkt->dst);
580
581         /*
582          *      Add our header
583          */
584
585         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
586         igmp->type      =
587         msg->im_msgtype = assert;
588         igmp->code      =       0;
589         skb->nh.iph->tot_len=htons(skb->len);                   /* Fix the length */
590         skb->h.raw = skb->nh.raw;
591         }
592
593         if (mroute_socket == NULL) {
594                 kfree_skb(skb);
595                 return -EINVAL;
596         }
597
598         /*
599          *      Deliver to mrouted
600          */
601         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
602                 if (net_ratelimit())
603                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
604                 kfree_skb(skb);
605         }
606
607         return ret;
608 }
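/* Editor's note: a hedged userspace-side sketch (not part of this
 * file) of how a daemon such as mrouted consumes the upcalls queued
 * above. They arrive on the raw IGMP socket that issued MRT_INIT;
 * struct igmpmsg overlays the copied IP header, and im_msgtype
 * distinguishes IGMPMSG_NOCACHE, IGMPMSG_WRONGVIF and
 * IGMPMSG_WHOLEPKT. The fd name mroute_fd is hypothetical.
 */
#if 0
	char buf[2048];
	ssize_t n = read(mroute_fd, buf, sizeof(buf));	/* mroute_fd: the MRT_INIT socket */

	if (n >= (ssize_t)sizeof(struct igmpmsg)) {
		struct igmpmsg *m = (struct igmpmsg *)buf;

		if (m->im_msgtype == IGMPMSG_NOCACHE) {
			/* m->im_src/m->im_dst give (S,G); the daemon
			 * answers with setsockopt(MRT_ADD_MFC). */
		}
	}
#endif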
609
610 /*
611  *      Queue a packet for resolution; it is attached to an unresolved cache entry.
612  */
613  
614 static int
615 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
616 {
617         int err;
618         struct mfc_cache *c;
619
620         spin_lock_bh(&mfc_unres_lock);
621         for (c=mfc_unres_queue; c; c=c->next) {
622                 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
623                     c->mfc_origin == skb->nh.iph->saddr)
624                         break;
625         }
626
627         if (c == NULL) {
628                 /*
629                  *      Create a new entry if allowable
630                  */
631
632                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
633                     (c=ipmr_cache_alloc_unres())==NULL) {
634                         spin_unlock_bh(&mfc_unres_lock);
635
636                         kfree_skb(skb);
637                         return -ENOBUFS;
638                 }
639
640                 /*
641                  *      Fill in the new cache entry
642                  */
643                 c->mfc_parent=-1;
644                 c->mfc_origin=skb->nh.iph->saddr;
645                 c->mfc_mcastgrp=skb->nh.iph->daddr;
646
647                 /*
648                  *      Reflect first query at mrouted.
649                  */
650                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
651                         /* If the report failed throw the cache entry 
652                            out - Brad Parker
653                          */
654                         spin_unlock_bh(&mfc_unres_lock);
655
656                         kmem_cache_free(mrt_cachep, c);
657                         kfree_skb(skb);
658                         return err;
659                 }
660
661                 atomic_inc(&cache_resolve_queue_len);
662                 c->next = mfc_unres_queue;
663                 mfc_unres_queue = c;
664
665                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
666         }
667
668         /*
669          *      See if we can append the packet
670          */
671         if (c->mfc_un.unres.unresolved.qlen>3) {
672                 kfree_skb(skb);
673                 err = -ENOBUFS;
674         } else {
675                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
676                 err = 0;
677         }
678
679         spin_unlock_bh(&mfc_unres_lock);
680         return err;
681 }
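/* Editor's note: the unresolved queue is deliberately small: at most
 * 10 pending (S,G) entries, at most 4 queued skbs per entry, and a
 * 10 second expiry armed via ipmr_expire_timer, which bounds the
 * memory an unresponsive daemon can pin down.
 */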
682
683 /*
684  *      MFC cache manipulation by user space mroute daemon
685  */
686
687 static int ipmr_mfc_delete(struct mfcctl *mfc)
688 {
689         int line;
690         struct mfc_cache *c, **cp;
691
692         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
693
694         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
695                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
696                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
697                         write_lock_bh(&mrt_lock);
698                         *cp = c->next;
699                         write_unlock_bh(&mrt_lock);
700
701                         kmem_cache_free(mrt_cachep, c);
702                         return 0;
703                 }
704         }
705         return -ENOENT;
706 }
707
708 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
709 {
710         int line;
711         struct mfc_cache *uc, *c, **cp;
712
713         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
714
715         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
716                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
717                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
718                         break;
719         }
720
721         if (c != NULL) {
722                 write_lock_bh(&mrt_lock);
723                 c->mfc_parent = mfc->mfcc_parent;
724                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
725                 if (!mrtsock)
726                         c->mfc_flags |= MFC_STATIC;
727                 write_unlock_bh(&mrt_lock);
728                 return 0;
729         }
730
731         if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
732                 return -EINVAL;
733
734         c=ipmr_cache_alloc();
735         if (c==NULL)
736                 return -ENOMEM;
737
738         c->mfc_origin=mfc->mfcc_origin.s_addr;
739         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
740         c->mfc_parent=mfc->mfcc_parent;
741         ipmr_update_thresholds(c, mfc->mfcc_ttls);
742         if (!mrtsock)
743                 c->mfc_flags |= MFC_STATIC;
744
745         write_lock_bh(&mrt_lock);
746         c->next = mfc_cache_array[line];
747         mfc_cache_array[line] = c;
748         write_unlock_bh(&mrt_lock);
749
750         /*
751          *      Check to see if we resolved a queued (unresolved) entry.
752          *      If so we need to send on the pending frames and tidy up.
753          */
754         spin_lock_bh(&mfc_unres_lock);
755         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
756              cp = &uc->next) {
757                 if (uc->mfc_origin == c->mfc_origin &&
758                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
759                         *cp = uc->next;
760                         if (atomic_dec_and_test(&cache_resolve_queue_len))
761                                 del_timer(&ipmr_expire_timer);
762                         break;
763                 }
764         }
765         spin_unlock_bh(&mfc_unres_lock);
766
767         if (uc) {
768                 ipmr_cache_resolve(uc, c);
769                 kmem_cache_free(mrt_cachep, uc);
770         }
771         return 0;
772 }
773
774 /*
775  *      Close the multicast socket, and clear the vif tables etc
776  */
777  
778 static void mroute_clean_tables(struct sock *sk)
779 {
780         int i;
781                 
782         /*
783          *      Shut down all active vif entries
784          */
785         for(i=0; i<maxvif; i++) {
786                 if (!(vif_table[i].flags&VIFF_STATIC))
787                         vif_delete(i);
788         }
789
790         /*
791          *      Wipe the cache
792          */
793         for (i=0;i<MFC_LINES;i++) {
794                 struct mfc_cache *c, **cp;
795
796                 cp = &mfc_cache_array[i];
797                 while ((c = *cp) != NULL) {
798                         if (c->mfc_flags&MFC_STATIC) {
799                                 cp = &c->next;
800                                 continue;
801                         }
802                         write_lock_bh(&mrt_lock);
803                         *cp = c->next;
804                         write_unlock_bh(&mrt_lock);
805
806                         kmem_cache_free(mrt_cachep, c);
807                 }
808         }
809
810         if (atomic_read(&cache_resolve_queue_len) != 0) {
811                 struct mfc_cache *c;
812
813                 spin_lock_bh(&mfc_unres_lock);
814                 while (mfc_unres_queue != NULL) {
815                         c = mfc_unres_queue;
816                         mfc_unres_queue = c->next;
817                         spin_unlock_bh(&mfc_unres_lock);
818
819                         ipmr_destroy_unres(c);
820
821                         spin_lock_bh(&mfc_unres_lock);
822                 }
823                 spin_unlock_bh(&mfc_unres_lock);
824         }
825 }
826
827 static void mrtsock_destruct(struct sock *sk)
828 {
829         rtnl_lock();
830         if (sk == mroute_socket) {
831                 ipv4_devconf.mc_forwarding--;
832
833                 write_lock_bh(&mrt_lock);
834                 mroute_socket=NULL;
835                 write_unlock_bh(&mrt_lock);
836
837                 mroute_clean_tables(sk);
838         }
839         rtnl_unlock();
840 }
841
842 /*
843  *      Socket options and virtual interface manipulation. The whole
844  *      virtual interface system is a complete heap, but unfortunately
845  *      that's how BSD mrouted happens to think. Maybe one day with a proper
846  *      MOSPF/PIM router set up we can clean this up.
847  */
848  
849 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
850 {
851         int ret;
852         struct vifctl vif;
853         struct mfcctl mfc;
854         
855         if(optname!=MRT_INIT)
856         {
857                 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
858                         return -EACCES;
859         }
860
861         switch(optname)
862         {
863                 case MRT_INIT:
864                         if (sk->sk_type != SOCK_RAW ||
865                             inet_sk(sk)->num != IPPROTO_IGMP)
866                                 return -EOPNOTSUPP;
867                         if(optlen!=sizeof(int))
868                                 return -ENOPROTOOPT;
869
870                         rtnl_lock();
871                         if (mroute_socket) {
872                                 rtnl_unlock();
873                                 return -EADDRINUSE;
874                         }
875
876                         ret = ip_ra_control(sk, 1, mrtsock_destruct);
877                         if (ret == 0) {
878                                 write_lock_bh(&mrt_lock);
879                                 mroute_socket=sk;
880                                 write_unlock_bh(&mrt_lock);
881
882                                 ipv4_devconf.mc_forwarding++;
883                         }
884                         rtnl_unlock();
885                         return ret;
886                 case MRT_DONE:
887                         if (sk!=mroute_socket)
888                                 return -EACCES;
889                         return ip_ra_control(sk, 0, NULL);
890                 case MRT_ADD_VIF:
891                 case MRT_DEL_VIF:
892                         if(optlen!=sizeof(vif))
893                                 return -EINVAL;
894                         if (copy_from_user(&vif,optval,sizeof(vif)))
895                                 return -EFAULT; 
896                         if(vif.vifc_vifi >= MAXVIFS)
897                                 return -ENFILE;
898                         rtnl_lock();
899                         if (optname==MRT_ADD_VIF) {
900                                 ret = vif_add(&vif, sk==mroute_socket);
901                         } else {
902                                 ret = vif_delete(vif.vifc_vifi);
903                         }
904                         rtnl_unlock();
905                         return ret;
906
907                 /*
908                  *      Manipulate the forwarding caches. These live
909                  *      in a sort of kernel/user symbiosis.
910                  */
911                 case MRT_ADD_MFC:
912                 case MRT_DEL_MFC:
913                         if(optlen!=sizeof(mfc))
914                                 return -EINVAL;
915                         if (copy_from_user(&mfc,optval, sizeof(mfc)))
916                                 return -EFAULT;
917                         rtnl_lock();
918                         if (optname==MRT_DEL_MFC)
919                                 ret = ipmr_mfc_delete(&mfc);
920                         else
921                                 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
922                         rtnl_unlock();
923                         return ret;
924                 /*
925                  *      Control PIM assert.
926                  */
927                 case MRT_ASSERT:
928                 {
929                         int v;
930                         if(get_user(v,(int __user *)optval))
931                                 return -EFAULT;
932                         mroute_do_assert=(v)?1:0;
933                         return 0;
934                 }
935 #ifdef CONFIG_IP_PIMSM
936                 case MRT_PIM:
937                 {
938                         int v, ret;
939                         if(get_user(v,(int __user *)optval))
940                                 return -EFAULT;
941                         v = (v)?1:0;
942                         rtnl_lock();
943                         ret = 0;
944                         if (v != mroute_do_pim) {
945                                 mroute_do_pim = v;
946                                 mroute_do_assert = v;
947 #ifdef CONFIG_IP_PIMSM_V2
948                                 if (mroute_do_pim)
949                                         ret = inet_add_protocol(&pim_protocol,
950                                                                 IPPROTO_PIM);
951                                 else
952                                         ret = inet_del_protocol(&pim_protocol,
953                                                                 IPPROTO_PIM);
954                                 if (ret < 0)
955                                         ret = -EAGAIN;
956 #endif
957                         }
958                         rtnl_unlock();
959                         return ret;
960                 }
961 #endif
962                 /*
963                  *      Spurious command, or MRT_VERSION which you cannot
964                  *      set.
965                  */
966                 default:
967                         return -ENOPROTOOPT;
968         }
969 }
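/* Editor's note: a hedged userspace sketch (not part of this file) of
 * the control sequence the handler above implements: MRT_INIT on a raw
 * IGMP socket claims the mroute socket, vifs are added with
 * MRT_ADD_VIF, (S,G) routes with MRT_ADD_MFC, and MRT_DONE tears it
 * all down again. The address used is an example only.
 */
#if 0
	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	int one = 1;
	struct vifctl vc;

	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));

	memset(&vc, 0, sizeof(vc));
	vc.vifc_vifi = 0;
	vc.vifc_threshold = 1;				/* forward anything with TTL > 1 */
	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");	/* example address */
	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));

	/* ... one MRT_ADD_MFC per (S,G), MRT_DONE on shutdown ... */
#endif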
970
971 /*
972  *      Getsock opt support for the multicast routing system.
973  */
974  
975 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
976 {
977         int olr;
978         int val;
979
980         if(optname!=MRT_VERSION && 
981 #ifdef CONFIG_IP_PIMSM
982            optname!=MRT_PIM &&
983 #endif
984            optname!=MRT_ASSERT)
985                 return -ENOPROTOOPT;
986
987         if (get_user(olr, optlen))
988                 return -EFAULT;
989
990         if (olr < 0)
991                 return -EINVAL;
992         olr = min_t(unsigned int, olr, sizeof(int));
993                 
994         if(put_user(olr,optlen))
995                 return -EFAULT;
996         if(optname==MRT_VERSION)
997                 val=0x0305;
998 #ifdef CONFIG_IP_PIMSM
999         else if(optname==MRT_PIM)
1000                 val=mroute_do_pim;
1001 #endif
1002         else
1003                 val=mroute_do_assert;
1004         if(copy_to_user(optval,&val,olr))
1005                 return -EFAULT;
1006         return 0;
1007 }
1008
1009 /*
1010  *      The IP multicast ioctl support routines.
1011  */
1012  
1013 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1014 {
1015         struct sioc_sg_req sr;
1016         struct sioc_vif_req vr;
1017         struct vif_device *vif;
1018         struct mfc_cache *c;
1019         
1020         switch(cmd)
1021         {
1022                 case SIOCGETVIFCNT:
1023                         if (copy_from_user(&vr,arg,sizeof(vr)))
1024                                 return -EFAULT; 
1025                         if(vr.vifi>=maxvif)
1026                                 return -EINVAL;
1027                         read_lock(&mrt_lock);
1028                         vif=&vif_table[vr.vifi];
1029                         if(VIF_EXISTS(vr.vifi)) {
1030                                 vr.icount=vif->pkt_in;
1031                                 vr.ocount=vif->pkt_out;
1032                                 vr.ibytes=vif->bytes_in;
1033                                 vr.obytes=vif->bytes_out;
1034                                 read_unlock(&mrt_lock);
1035
1036                                 if (copy_to_user(arg,&vr,sizeof(vr)))
1037                                         return -EFAULT;
1038                                 return 0;
1039                         }
1040                         read_unlock(&mrt_lock);
1041                         return -EADDRNOTAVAIL;
1042                 case SIOCGETSGCNT:
1043                         if (copy_from_user(&sr,arg,sizeof(sr)))
1044                                 return -EFAULT;
1045
1046                         read_lock(&mrt_lock);
1047                         c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1048                         if (c) {
1049                                 sr.pktcnt = c->mfc_un.res.pkt;
1050                                 sr.bytecnt = c->mfc_un.res.bytes;
1051                                 sr.wrong_if = c->mfc_un.res.wrong_if;
1052                                 read_unlock(&mrt_lock);
1053
1054                                 if (copy_to_user(arg,&sr,sizeof(sr)))
1055                                         return -EFAULT;
1056                                 return 0;
1057                         }
1058                         read_unlock(&mrt_lock);
1059                         return -EADDRNOTAVAIL;
1060                 default:
1061                         return -ENOIOCTLCMD;
1062         }
1063 }
1064
1065
1066 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1067 {
1068         struct vif_device *v;
1069         int ct;
1070         if (event != NETDEV_UNREGISTER)
1071                 return NOTIFY_DONE;
1072         v=&vif_table[0];
1073         for(ct=0;ct<maxvif;ct++,v++) {
1074                 if (v->dev==ptr)
1075                         vif_delete(ct);
1076         }
1077         return NOTIFY_DONE;
1078 }
1079
1080
1081 static struct notifier_block ip_mr_notifier={
1082         .notifier_call = ipmr_device_event,
1083 };
1084
1085 /*
1086  *      Encapsulate a packet by attaching a valid IPIP header to it.
1087  *      This avoids tunnel drivers and other mess and gives us the speed so
1088  *      important for multicast video.
1089  */
1090  
1091 static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
1092 {
1093         struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1094
1095         iph->version    =       4;
1096         iph->tos        =       skb->nh.iph->tos;
1097         iph->ttl        =       skb->nh.iph->ttl;
1098         iph->frag_off   =       0;
1099         iph->daddr      =       daddr;
1100         iph->saddr      =       saddr;
1101         iph->protocol   =       IPPROTO_IPIP;
1102         iph->ihl        =       5;
1103         iph->tot_len    =       htons(skb->len);
1104         ip_select_ident(iph, skb->dst, NULL);
1105         ip_send_check(iph);
1106
1107         skb->h.ipiph = skb->nh.iph;
1108         skb->nh.iph = iph;
1109         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1110         nf_reset(skb);
1111 }
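/* Editor's note: after ip_encap() the frame looks like
 *
 *	[ new outer IPIP header | original IP header | payload ]
 *
 * with skb->nh.iph pointing at the outer header and skb->h.ipiph at
 * the original one, so the tunnel peer can strip the outer header and
 * route the inner multicast packet normally.
 */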
1112
1113 static inline int ipmr_forward_finish(struct sk_buff *skb)
1114 {
1115         struct ip_options * opt = &(IPCB(skb)->opt);
1116
1117         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1118
1119         if (unlikely(opt->optlen))
1120                 ip_forward_options(skb);
1121
1122         return dst_output(skb);
1123 }
1124
1125 /*
1126  *      Processing handlers for ipmr_forward
1127  */
1128
1129 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1130 {
1131         struct iphdr *iph = skb->nh.iph;
1132         struct vif_device *vif = &vif_table[vifi];
1133         struct net_device *dev;
1134         struct rtable *rt;
1135         int    encap = 0;
1136
1137         if (vif->dev == NULL)
1138                 goto out_free;
1139
1140 #ifdef CONFIG_IP_PIMSM
1141         if (vif->flags & VIFF_REGISTER) {
1142                 vif->pkt_out++;
1143                 vif->bytes_out+=skb->len;
1144                 ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
1145                 ((struct net_device_stats*)vif->dev->priv)->tx_packets++;
1146                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1147                 kfree_skb(skb);
1148                 return;
1149         }
1150 #endif
1151
1152         if (vif->flags&VIFF_TUNNEL) {
1153                 struct flowi fl = { .oif = vif->link,
1154                                     .nl_u = { .ip4_u =
1155                                               { .daddr = vif->remote,
1156                                                 .saddr = vif->local,
1157                                                 .tos = RT_TOS(iph->tos) } },
1158                                     .proto = IPPROTO_IPIP };
1159                 if (ip_route_output_key(&rt, &fl))
1160                         goto out_free;
1161                 encap = sizeof(struct iphdr);
1162         } else {
1163                 struct flowi fl = { .oif = vif->link,
1164                                     .nl_u = { .ip4_u =
1165                                               { .daddr = iph->daddr,
1166                                                 .tos = RT_TOS(iph->tos) } },
1167                                     .proto = IPPROTO_IPIP };
1168                 if (ip_route_output_key(&rt, &fl))
1169                         goto out_free;
1170         }
1171
1172         dev = rt->u.dst.dev;
1173
1174         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1175                 /* Do not fragment multicasts. Alas, IPv4 does not
1176                    allow us to send ICMP here, so such packets will
1177                    disappear into a black hole.
1178                  */
1179
1180                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1181                 ip_rt_put(rt);
1182                 goto out_free;
1183         }
1184
1185         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1186
1187         if (skb_cow(skb, encap)) {
1188                 ip_rt_put(rt);
1189                 goto out_free;
1190         }
1191
1192         vif->pkt_out++;
1193         vif->bytes_out+=skb->len;
1194
1195         dst_release(skb->dst);
1196         skb->dst = &rt->u.dst;
1197         iph = skb->nh.iph;
1198         ip_decrease_ttl(iph);
1199
1200         /* FIXME: forward and output firewalls used to be called here.
1201          * What do we do with netfilter? -- RR */
1202         if (vif->flags & VIFF_TUNNEL) {
1203                 ip_encap(skb, vif->local, vif->remote);
1204                 /* FIXME: extra output firewall step used to be here. --RR */
1205                 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
1206                 ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
1207         }
1208
1209         IPCB(skb)->flags |= IPSKB_FORWARDED;
1210
1211         /*
1212          * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
1213          * locally not only before forwarding, but after forwarding on all
1214          * output interfaces. Clearly, if the mrouter runs a multicast
1215          * application, it should receive packets regardless of the
1216          * interface the application joined on.
1217          * If we did not do this, the application would have to join on
1218          * all interfaces. On the other hand, a multihomed host (or router,
1219          * but not mrouter) cannot join on more than one interface, as that
1220          * would result in receiving duplicate packets.
1221          */
1222         NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, 
1223                 ipmr_forward_finish);
1224         return;
1225
1226 out_free:
1227         kfree_skb(skb);
1228         return;
1229 }
1230
1231 static int ipmr_find_vif(struct net_device *dev)
1232 {
1233         int ct;
1234         for (ct=maxvif-1; ct>=0; ct--) {
1235                 if (vif_table[ct].dev == dev)
1236                         break;
1237         }
1238         return ct;
1239 }
1240
1241 /* "local" means that we should preserve one skb (for local delivery) */
1242
1243 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1244 {
1245         int psend = -1;
1246         int vif, ct;
1247
1248         vif = cache->mfc_parent;
1249         cache->mfc_un.res.pkt++;
1250         cache->mfc_un.res.bytes += skb->len;
1251
1252         /*
1253          * Wrong interface: drop packet and (maybe) send PIM assert.
1254          */
1255         if (vif_table[vif].dev != skb->dev) {
1256                 int true_vifi;
1257
1258                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1259                         /* It is our own packet, looped back.
1260                            A very complicated situation...
1261
1262                            The best workaround until routing daemons are
1263                            fixed is not to redistribute a packet if it was
1264                            sent through the wrong interface. This means
1265                            multicast applications WILL NOT work for
1266                            (S,G) entries whose default multicast route points
1267                            to the wrong oif. In any case, it is not a good
1268                            idea to run multicast applications on a router.
1269                          */
1270                         goto dont_forward;
1271                 }
1272
1273                 cache->mfc_un.res.wrong_if++;
1274                 true_vifi = ipmr_find_vif(skb->dev);
1275
1276                 if (true_vifi >= 0 && mroute_do_assert &&
1277                     /* PIM-SM uses asserts when switching from RPT to SPT,
1278                        so we cannot check that the packet arrived on an oif.
1279                        It is bad, but otherwise we would need to move a pretty
1280                        large chunk of pimd into the kernel. Ough... --ANK
1281                      */
1282                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1283                     time_after(jiffies, 
1284                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1285                         cache->mfc_un.res.last_assert = jiffies;
1286                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1287                 }
1288                 goto dont_forward;
1289         }
1290
1291         vif_table[vif].pkt_in++;
1292         vif_table[vif].bytes_in+=skb->len;
1293
1294         /*
1295          *      Forward the frame
1296          */
1297         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1298                 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1299                         if (psend != -1) {
1300                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1301                                 if (skb2)
1302                                         ipmr_queue_xmit(skb2, cache, psend);
1303                         }
1304                         psend=ct;
1305                 }
1306         }
1307         if (psend != -1) {
1308                 if (local) {
1309                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1310                         if (skb2)
1311                                 ipmr_queue_xmit(skb2, cache, psend);
1312                 } else {
1313                         ipmr_queue_xmit(skb, cache, psend);
1314                         return 0;
1315                 }
1316         }
1317
1318 dont_forward:
1319         if (!local)
1320                 kfree_skb(skb);
1321         return 0;
1322 }
1323
1324
1325 /*
1326  *      Multicast packets for forwarding arrive here
1327  */
1328
1329 int ip_mr_input(struct sk_buff *skb)
1330 {
1331         struct mfc_cache *cache;
1332         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1333
1334         /* A packet looped back after forwarding should not be
1335            forwarded a second time, but it can still be delivered locally.
1336          */
1337         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1338                 goto dont_forward;
1339
1340         if (!local) {
1341                     if (IPCB(skb)->opt.router_alert) {
1342                             if (ip_call_ra_chain(skb))
1343                                     return 0;
1344                     } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1345                             /* IGMPv1 (and broken IGMPv2 implementations such as
1346                                Cisco IOS <= 11.2(8)) do not put the router alert
1347                                option in IGMP packets destined to routable
1348                                groups. This is very bad, because it means
1349                                we would otherwise forward NO IGMP messages.
1350                              */
1351                             read_lock(&mrt_lock);
1352                             if (mroute_socket) {
1353                                     nf_reset(skb);
1354                                     raw_rcv(mroute_socket, skb);
1355                                     read_unlock(&mrt_lock);
1356                                     return 0;
1357                             }
1358                             read_unlock(&mrt_lock);
1359                     }
1360         }
1361
1362         read_lock(&mrt_lock);
1363         cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1364
1365         /*
1366          *      No usable cache entry
1367          */
1368         if (cache==NULL) {
1369                 int vif;
1370
1371                 if (local) {
1372                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1373                         ip_local_deliver(skb);
1374                         if (skb2 == NULL) {
1375                                 read_unlock(&mrt_lock);
1376                                 return -ENOBUFS;
1377                         }
1378                         skb = skb2;
1379                 }
1380
1381                 vif = ipmr_find_vif(skb->dev);
1382                 if (vif >= 0) {
1383                         int err = ipmr_cache_unresolved(vif, skb);
1384                         read_unlock(&mrt_lock);
1385
1386                         return err;
1387                 }
1388                 read_unlock(&mrt_lock);
1389                 kfree_skb(skb);
1390                 return -ENODEV;
1391         }
1392
1393         ip_mr_forward(skb, cache, local);
1394
1395         read_unlock(&mrt_lock);
1396
1397         if (local)
1398                 return ip_local_deliver(skb);
1399
1400         return 0;
1401
1402 dont_forward:
1403         if (local)
1404                 return ip_local_deliver(skb);
1405         kfree_skb(skb);
1406         return 0;
1407 }
1408
1409 #ifdef CONFIG_IP_PIMSM_V1
1410 /*
1411  * Handle IGMP messages of PIMv1
1412  */
1413
1414 int pim_rcv_v1(struct sk_buff * skb)
1415 {
1416         struct igmphdr *pim;
1417         struct iphdr   *encap;
1418         struct net_device  *reg_dev = NULL;
1419
1420         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
1421                 goto drop;
1422
1423         pim = (struct igmphdr*)skb->h.raw;
1424
1425         if (!mroute_do_pim ||
1426             skb->len < sizeof(*pim) + sizeof(*encap) ||
1427             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 
1428                 goto drop;
1429
1430         encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1431         /*
1432            Check that:
1433            a. packet is really destined to a multicast group
1434            b. packet is not a NULL-REGISTER
1435            c. packet is not truncated
1436          */
1437         if (!MULTICAST(encap->daddr) ||
1438             encap->tot_len == 0 ||
1439             ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
1440                 goto drop;
1441
1442         read_lock(&mrt_lock);
1443         if (reg_vif_num >= 0)
1444                 reg_dev = vif_table[reg_vif_num].dev;
1445         if (reg_dev)
1446                 dev_hold(reg_dev);
1447         read_unlock(&mrt_lock);
1448
1449         if (reg_dev == NULL) 
1450                 goto drop;
1451
1452         skb->mac.raw = skb->nh.raw;
1453         skb_pull(skb, (u8*)encap - skb->data);
1454         skb->nh.iph = (struct iphdr *)skb->data;
1455         skb->dev = reg_dev;
1456         memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1457         skb->protocol = htons(ETH_P_IP);
1458         skb->ip_summed = 0;
1459         skb->pkt_type = PACKET_HOST;
1460         dst_release(skb->dst);
1461         skb->dst = NULL;
1462         ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1463         ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1464         nf_reset(skb);
1465         netif_rx(skb);
1466         dev_put(reg_dev);
1467         return 0;
1468  drop:
1469         kfree_skb(skb);
1470         return 0;
1471 }
1472 #endif
1473
1474 #ifdef CONFIG_IP_PIMSM_V2
1475 static int pim_rcv(struct sk_buff * skb)
1476 {
1477         struct pimreghdr *pim;
1478         struct iphdr   *encap;
1479         struct net_device  *reg_dev = NULL;
1480
1481         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
1482                 goto drop;
1483
1484         pim = (struct pimreghdr*)skb->h.raw;
1485         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1486             (pim->flags&PIM_NULL_REGISTER) ||
1487             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 
1488              (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 
1489                 goto drop;
1490
1491         /* check if the inner packet is destined to mcast group */
1492         encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1493         if (!MULTICAST(encap->daddr) ||
1494             encap->tot_len == 0 ||
1495             ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
1496                 goto drop;
1497
1498         read_lock(&mrt_lock);
1499         if (reg_vif_num >= 0)
1500                 reg_dev = vif_table[reg_vif_num].dev;
1501         if (reg_dev)
1502                 dev_hold(reg_dev);
1503         read_unlock(&mrt_lock);
1504
1505         if (reg_dev == NULL) 
1506                 goto drop;
1507
1508         skb->mac.raw = skb->nh.raw;
1509         skb_pull(skb, (u8*)encap - skb->data);
1510         skb->nh.iph = (struct iphdr *)skb->data;
1511         skb->dev = reg_dev;
1512         memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
1513         skb->protocol = htons(ETH_P_IP);
1514         skb->ip_summed = 0;
1515         skb->pkt_type = PACKET_HOST;
1516         dst_release(skb->dst);
1517         ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
1518         ((struct net_device_stats*)reg_dev->priv)->rx_packets++;
1519         skb->dst = NULL;
1520         nf_reset(skb);
1521         netif_rx(skb);
1522         dev_put(reg_dev);
1523         return 0;
1524  drop:
1525         kfree_skb(skb);
1526         return 0;
1527 }
1528 #endif
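/* Editor's note: both PIM register handlers above decapsulate the
 * inner multicast packet and re-inject it via netif_rx() as if it had
 * arrived on the "pimreg" device, so it passes through ip_mr_input()
 * again and is forwarded using the normal (S,G) cache.
 */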
1529
static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
        int ct;
        struct rtnexthop *nhp;
        struct net_device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb->tail;
        struct rtattr *mp_head;

        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

        mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));

        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
                /* Skip vifs that have been deleted, otherwise the dev
                 * dereference below may fault; a ttl of 255 marks an
                 * unused slot.
                 */
                if (VIF_EXISTS(ct) && c->mfc_un.res.ttls[ct] < 255) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = 0;
                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
                }
        }
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb->tail - (u8 *)mp_head;
        rtm->rtm_type = RTN_MULTICAST;
        return 1;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -EMSGSIZE;
}

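/*
 * Resolve an rtnetlink route request against the multicast forwarding
 * cache.  If no cache entry exists yet, the request is queued as an
 * unresolved entry (unless the caller asked not to wait) and answered
 * once mrouted has resolved it.
 */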
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
        int err;
        struct mfc_cache *cache;
        struct rtable *rt = (struct rtable *)skb->dst;

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

        if (cache == NULL) {
                struct net_device *dev;
                int vif;

                if (nowait) {
                        read_unlock(&mrt_lock);
                        return -EAGAIN;
                }

                dev = skb->dev;
                if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
                        read_unlock(&mrt_lock);
                        return -ENODEV;
                }
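                /* Queue the request as an unresolved cache entry.  The
                 * dummy IP header pushed here has version 0, which lets
                 * the resolve/expire paths tell this rtnetlink request
                 * apart from a real queued multicast packet.
                 */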
                skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
                skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
                skb->nh.iph->saddr = rt->rt_src;
                skb->nh.iph->daddr = rt->rt_dst;
                skb->nh.iph->version = 0;
                err = ipmr_cache_unresolved(vif, skb);
                read_unlock(&mrt_lock);
                return err;
        }

        if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
                cache->mfc_flags |= MFC_NOTIFY;
        err = ipmr_fill_mroute(skb, cache, rtm);
        read_unlock(&mrt_lock);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 *      The /proc interfaces to multicast routing:
 *      /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
        int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
                                           loff_t pos)
{
        for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                if (pos-- == 0)
                        return &vif_table[iter->ct];
        }
        return NULL;
}

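/* The vif table is walked under mrt_lock, taken in ->start and
 * dropped in ->stop, so entries cannot change under a dump in
 * progress.
 */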
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
        read_lock(&mrt_lock);
        return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ipmr_vif_iter *iter = seq->private;

        ++*pos;
        if (v == SEQ_START_TOKEN)
                return ipmr_vif_seq_idx(iter, 0);

        while (++iter->ct < maxvif) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                return &vif_table[iter->ct];
        }
        return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&mrt_lock);
}

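/* One line per vif: index, device name, input and output byte/packet
 * counters, VIFF_* flags in hex, and the local and remote addresses
 * as raw hex IPv4 values.
 */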
static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
        } else {
                const struct vif_device *vif = v;
                const char *name = vif->dev ? vif->dev->name : "none";

                seq_printf(seq,
                           "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
                           vif - vif_table,
                           name, vif->bytes_in, vif->pkt_in,
                           vif->bytes_out, vif->pkt_out,
                           vif->flags, vif->local, vif->remote);
        }
        return 0;
}

static struct seq_operations ipmr_vif_seq_ops = {
        .start = ipmr_vif_seq_start,
        .next  = ipmr_vif_seq_next,
        .stop  = ipmr_vif_seq_stop,
        .show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_vif_seq_ops);
        if (rc)
                goto out_kfree;

        s->ct = 0;
        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations ipmr_vif_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_vif_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};

struct ipmr_mfc_iter {
        struct mfc_cache **cache;
        int ct;
};

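/* The MFC dump walks two structures in turn: the resolved entries in
 * mfc_cache_array under mrt_lock, then the unresolved queue under
 * mfc_unres_lock.  it->cache records which of the two (and therefore
 * which lock) is currently held, so ->stop can release the right one.
 */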
static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
        struct mfc_cache *mfc;

        it->cache = mfc_cache_array;
        read_lock(&mrt_lock);
        for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
                for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
                        if (pos-- == 0)
                                return mfc;
        read_unlock(&mrt_lock);

        it->cache = &mfc_unres_queue;
        spin_lock_bh(&mfc_unres_lock);
        for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
                if (pos-- == 0)
                        return mfc;
        spin_unlock_bh(&mfc_unres_lock);

        it->cache = NULL;
        return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct ipmr_mfc_iter *it = seq->private;

        it->cache = NULL;
        it->ct = 0;
        return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct mfc_cache *mfc = v;
        struct ipmr_mfc_iter *it = seq->private;

        ++*pos;

        if (v == SEQ_START_TOKEN)
                return ipmr_mfc_seq_idx(seq->private, 0);

        if (mfc->next)
                return mfc->next;

        if (it->cache == &mfc_unres_queue)
                goto end_of_list;

        BUG_ON(it->cache != mfc_cache_array);

        while (++it->ct < MFC_LINES) {
                mfc = mfc_cache_array[it->ct];
                if (mfc)
                        return mfc;
        }

        /* exhausted cache_array, show unresolved */
        read_unlock(&mrt_lock);
        it->cache = &mfc_unres_queue;
        it->ct = 0;

        spin_lock_bh(&mfc_unres_lock);
        mfc = mfc_unres_queue;
        if (mfc)
                return mfc;

 end_of_list:
        spin_unlock_bh(&mfc_unres_lock);
        it->cache = NULL;

        return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
        struct ipmr_mfc_iter *it = seq->private;

        if (it->cache == &mfc_unres_queue)
                spin_unlock_bh(&mfc_unres_lock);
        else if (it->cache == mfc_cache_array)
                read_unlock(&mrt_lock);
}

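/* One line per cache entry: group and origin as hex IPv4 values, the
 * parent (input) vif, packet/byte/wrong-interface counters and, for
 * resolved entries only, a "vif:ttl" pair per active output interface.
 */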
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
        int n;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
        } else {
                const struct mfc_cache *mfc = v;
                const struct ipmr_mfc_iter *it = seq->private;

                seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
                           (unsigned long) mfc->mfc_mcastgrp,
                           (unsigned long) mfc->mfc_origin,
                           mfc->mfc_parent,
                           mfc->mfc_un.res.pkt,
                           mfc->mfc_un.res.bytes,
                           mfc->mfc_un.res.wrong_if);

                if (it->cache != &mfc_unres_queue) {
                        for (n = mfc->mfc_un.res.minvif;
                             n < mfc->mfc_un.res.maxvif; n++) {
                                if (VIF_EXISTS(n)
                                    && mfc->mfc_un.res.ttls[n] < 255)
                                        seq_printf(seq,
                                                   " %2d:%-3d",
                                                   n, mfc->mfc_un.res.ttls[n]);
                        }
                }
                seq_putc(seq, '\n');
        }
        return 0;
}

static struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = ipmr_mfc_seq_next,
        .stop  = ipmr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_mfc_seq_ops);
        if (rc)
                goto out_kfree;

        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations ipmr_mfc_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_mfc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
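/* Input handler for IPPROTO_PIM.  It is not registered at boot;
 * inet_add_protocol()/inet_del_protocol() hook it in and out as user
 * space toggles the MRT_PIM socket option elsewhere in this file.
 */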
static struct net_protocol pim_protocol = {
        .handler        =       pim_rcv,
};
#endif

/*
 *      Setup for IP multicast routing
 */

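/* Called once during boot-time initialisation of the IPv4 stack:
 * allocate the cache entry slab, prepare the expiry timer for
 * unresolved entries, watch for device unregistration through the
 * netdevice notifier and, when configured, create the /proc/net files.
 */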
void __init ip_mr_init(void)
{
        mrt_cachep = kmem_cache_create("ip_mrt_cache",
                                       sizeof(struct mfc_cache),
                                       0, SLAB_HWCACHE_ALIGN,
                                       NULL, NULL);
        if (!mrt_cachep)
                panic("cannot allocate ip_mrt_cache");

        init_timer(&ipmr_expire_timer);
        ipmr_expire_timer.function = ipmr_expire_process;
        register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
        proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
        proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}