1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <linux/types.h>
34 #include <linux/sched.h>
35 #include <linux/capability.h>
36 #include <linux/errno.h>
37 #include <linux/timer.h>
38 #include <linux/mm.h>
39 #include <linux/kernel.h>
40 #include <linux/fcntl.h>
41 #include <linux/stat.h>
42 #include <linux/socket.h>
43 #include <linux/in.h>
44 #include <linux/inet.h>
45 #include <linux/netdevice.h>
46 #include <linux/inetdevice.h>
47 #include <linux/igmp.h>
48 #include <linux/proc_fs.h>
49 #include <linux/seq_file.h>
50 #include <linux/mroute.h>
51 #include <linux/init.h>
52 #include <linux/if_ether.h>
53 #include <net/ip.h>
54 #include <net/protocol.h>
55 #include <linux/skbuff.h>
56 #include <net/route.h>
57 #include <net/sock.h>
58 #include <net/icmp.h>
59 #include <net/udp.h>
60 #include <net/raw.h>
61 #include <linux/notifier.h>
62 #include <linux/if_arp.h>
63 #include <linux/netfilter_ipv4.h>
64 #include <net/ipip.h>
65 #include <net/checksum.h>
66
67 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
68 #define CONFIG_IP_PIMSM 1
69 #endif
70
71 static struct sock *mroute_socket;
72
73
74 /* Big lock, protecting vif table, mrt cache and mroute socket state.
75    Note that changes are serialized via rtnl_lock.
76  */
77
78 static DEFINE_RWLOCK(mrt_lock);
79
80 /*
81  *      Multicast router control variables
82  */
83
84 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
85 static int maxvif;
86
87 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
88
89 static int mroute_do_assert;                            /* Set in PIM assert    */
90 static int mroute_do_pim;
91
92 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
93
94 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
95 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
96
97 /* Special spinlock for queue of unresolved entries */
98 static DEFINE_SPINLOCK(mfc_unres_lock);
99
100 /* We return to Alan's original scheme. The hash table of resolved
101    entries is changed only in process context and is protected by
102    the weak lock mrt_lock. The queue of unresolved entries is protected
103    by the strong spinlock mfc_unres_lock.
104
105    In this case the data path is entirely free of exclusive locks.
106  */
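
/* A minimal sketch of the resulting locking pattern (an illustrative
   summary of the code below, not additional rules):

        data path (softirq):      read_lock(&mrt_lock);
                                  ... look up vif_table / mfc_cache_array ...
                                  read_unlock(&mrt_lock);

        updates (process ctx):    write_lock_bh(&mrt_lock);
                                  ... modify vif_table / mfc_cache_array ...
                                  write_unlock_bh(&mrt_lock);

        unresolved queue:         spin_lock_bh(&mfc_unres_lock);
                                  ... walk/modify mfc_unres_queue ...
                                  spin_unlock_bh(&mfc_unres_lock);
 */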
107
108 static struct kmem_cache *mrt_cachep __read_mostly;
109
110 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
111 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
112 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
113
114 #ifdef CONFIG_IP_PIMSM_V2
115 static struct net_protocol pim_protocol;
116 #endif
117
118 static struct timer_list ipmr_expire_timer;
119
120 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
121
122 static
123 struct net_device *ipmr_new_tunnel(struct vifctl *v)
124 {
125         struct net_device  *dev;
126
127         dev = __dev_get_by_name("tunl0");
128
129         if (dev) {
130                 int err;
131                 struct ifreq ifr;
132                 mm_segment_t    oldfs;
133                 struct ip_tunnel_parm p;
134                 struct in_device  *in_dev;
135
136                 memset(&p, 0, sizeof(p));
137                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
138                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
139                 p.iph.version = 4;
140                 p.iph.ihl = 5;
141                 p.iph.protocol = IPPROTO_IPIP;
142                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
143                 ifr.ifr_ifru.ifru_data = (void*)&p;
144
145                 oldfs = get_fs(); set_fs(KERNEL_DS);
146                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
147                 set_fs(oldfs);
148
149                 dev = NULL;
150
151                 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
152                         dev->flags |= IFF_MULTICAST;
153
154                         in_dev = __in_dev_get_rtnl(dev);
155                         if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
156                                 goto failure;
157                         in_dev->cnf.rp_filter = 0;
158
159                         if (dev_open(dev))
160                                 goto failure;
161                 }
162         }
163         return dev;
164
165 failure:
166         /* allow the register to be completed before unregistering. */
167         rtnl_unlock();
168         rtnl_lock();
169
170         unregister_netdevice(dev);
171         return NULL;
172 }
173
174 #ifdef CONFIG_IP_PIMSM
175
176 static int reg_vif_num = -1;
177
178 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
179 {
180         read_lock(&mrt_lock);
181         ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
182         ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
183         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
184         read_unlock(&mrt_lock);
185         kfree_skb(skb);
186         return 0;
187 }
188
189 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
190 {
191         return (struct net_device_stats*)netdev_priv(dev);
192 }
193
194 static void reg_vif_setup(struct net_device *dev)
195 {
196         dev->type               = ARPHRD_PIMREG;
197         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
198         dev->flags              = IFF_NOARP;
199         dev->hard_start_xmit    = reg_vif_xmit;
200         dev->get_stats          = reg_vif_get_stats;
201         dev->destructor         = free_netdev;
202 }
203
204 static struct net_device *ipmr_reg_vif(void)
205 {
206         struct net_device *dev;
207         struct in_device *in_dev;
208
209         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
210                            reg_vif_setup);
211
212         if (dev == NULL)
213                 return NULL;
214
215         if (register_netdevice(dev)) {
216                 free_netdev(dev);
217                 return NULL;
218         }
219         dev->iflink = 0;
220
221         if ((in_dev = inetdev_init(dev)) == NULL)
222                 goto failure;
223
224         in_dev->cnf.rp_filter = 0;
225
226         if (dev_open(dev))
227                 goto failure;
228
229         return dev;
230
231 failure:
232         /* allow the register to be completed before unregistering. */
233         rtnl_unlock();
234         rtnl_lock();
235
236         unregister_netdevice(dev);
237         return NULL;
238 }
239 #endif
240
241 /*
242  *      Delete a VIF entry
243  */
244
245 static int vif_delete(int vifi)
246 {
247         struct vif_device *v;
248         struct net_device *dev;
249         struct in_device *in_dev;
250
251         if (vifi < 0 || vifi >= maxvif)
252                 return -EADDRNOTAVAIL;
253
254         v = &vif_table[vifi];
255
256         write_lock_bh(&mrt_lock);
257         dev = v->dev;
258         v->dev = NULL;
259
260         if (!dev) {
261                 write_unlock_bh(&mrt_lock);
262                 return -EADDRNOTAVAIL;
263         }
264
265 #ifdef CONFIG_IP_PIMSM
266         if (vifi == reg_vif_num)
267                 reg_vif_num = -1;
268 #endif
269
270         if (vifi+1 == maxvif) {
271                 int tmp;
272                 for (tmp=vifi-1; tmp>=0; tmp--) {
273                         if (VIF_EXISTS(tmp))
274                                 break;
275                 }
276                 maxvif = tmp+1;
277         }
278
279         write_unlock_bh(&mrt_lock);
280
281         dev_set_allmulti(dev, -1);
282
283         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
284                 in_dev->cnf.mc_forwarding--;
285                 ip_rt_multicast_event(in_dev);
286         }
287
288         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
289                 unregister_netdevice(dev);
290
291         dev_put(dev);
292         return 0;
293 }
294
295 /* Destroy an unresolved cache entry, killing queued skbs
296    and reporting error to netlink readers.
297  */
298
299 static void ipmr_destroy_unres(struct mfc_cache *c)
300 {
301         struct sk_buff *skb;
302         struct nlmsgerr *e;
303
304         atomic_dec(&cache_resolve_queue_len);
305
306         while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
307                 if (skb->nh.iph->version == 0) {
308                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
309                         nlh->nlmsg_type = NLMSG_ERROR;
310                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
311                         skb_trim(skb, nlh->nlmsg_len);
312                         e = NLMSG_DATA(nlh);
313                         e->error = -ETIMEDOUT;
314                         memset(&e->msg, 0, sizeof(e->msg));
315
316                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
317                 } else
318                         kfree_skb(skb);
319         }
320
321         kmem_cache_free(mrt_cachep, c);
322 }
323
324
325 /* A single timer process handles the whole unresolved queue. */
326
327 static void ipmr_expire_process(unsigned long dummy)
328 {
329         unsigned long now;
330         unsigned long expires;
331         struct mfc_cache *c, **cp;
332
333         if (!spin_trylock(&mfc_unres_lock)) {
334                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
335                 return;
336         }
337
338         if (atomic_read(&cache_resolve_queue_len) == 0)
339                 goto out;
340
341         now = jiffies;
342         expires = 10*HZ;
343         cp = &mfc_unres_queue;
344
345         while ((c=*cp) != NULL) {
346                 if (time_after(c->mfc_un.unres.expires, now)) {
347                         unsigned long interval = c->mfc_un.unres.expires - now;
348                         if (interval < expires)
349                                 expires = interval;
350                         cp = &c->next;
351                         continue;
352                 }
353
354                 *cp = c->next;
355
356                 ipmr_destroy_unres(c);
357         }
358
359         if (atomic_read(&cache_resolve_queue_len))
360                 mod_timer(&ipmr_expire_timer, jiffies + expires);
361
362 out:
363         spin_unlock(&mfc_unres_lock);
364 }
365
366 /* Fill the oif list. Called with mrt_lock write-locked. */
367
368 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
369 {
370         int vifi;
371
372         cache->mfc_un.res.minvif = MAXVIFS;
373         cache->mfc_un.res.maxvif = 0;
374         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
375
376         for (vifi=0; vifi<maxvif; vifi++) {
377                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
378                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
379                         if (cache->mfc_un.res.minvif > vifi)
380                                 cache->mfc_un.res.minvif = vifi;
381                         if (cache->mfc_un.res.maxvif <= vifi)
382                                 cache->mfc_un.res.maxvif = vifi + 1;
383                 }
384         }
385 }
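
/* Worked example (illustrative, assuming vifs 0 and 2 exist): with
   ttls = {1, 0, 3, 255, ...}, ipmr_update_thresholds() leaves
   res.ttls[1] at 255 (a zero TTL disables the vif) and sets
   minvif = 0, maxvif = 3. ip_mr_forward() below then forwards on
   vif 0 only if the packet TTL is > 1, and on vif 2 only if it is > 3;
   entries left at 255 never match, since the TTL cannot exceed 255.
 */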
386
387 static int vif_add(struct vifctl *vifc, int mrtsock)
388 {
389         int vifi = vifc->vifc_vifi;
390         struct vif_device *v = &vif_table[vifi];
391         struct net_device *dev;
392         struct in_device *in_dev;
393
394         /* Is vif busy? */
395         if (VIF_EXISTS(vifi))
396                 return -EADDRINUSE;
397
398         switch (vifc->vifc_flags) {
399 #ifdef CONFIG_IP_PIMSM
400         case VIFF_REGISTER:
401                 /*
402                  * Special Purpose VIF in PIM
403                  * All the packets will be sent to the daemon
404                  */
405                 if (reg_vif_num >= 0)
406                         return -EADDRINUSE;
407                 dev = ipmr_reg_vif();
408                 if (!dev)
409                         return -ENOBUFS;
410                 break;
411 #endif
412         case VIFF_TUNNEL:
413                 dev = ipmr_new_tunnel(vifc);
414                 if (!dev)
415                         return -ENOBUFS;
416                 break;
417         case 0:
418                 dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
419                 if (!dev)
420                         return -EADDRNOTAVAIL;
421                 dev_put(dev);
422                 break;
423         default:
424                 return -EINVAL;
425         }
426
427         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
428                 return -EADDRNOTAVAIL;
429         in_dev->cnf.mc_forwarding++;
430         dev_set_allmulti(dev, +1);
431         ip_rt_multicast_event(in_dev);
432
433         /*
434          *      Fill in the VIF structures
435          */
436         v->rate_limit=vifc->vifc_rate_limit;
437         v->local=vifc->vifc_lcl_addr.s_addr;
438         v->remote=vifc->vifc_rmt_addr.s_addr;
439         v->flags=vifc->vifc_flags;
440         if (!mrtsock)
441                 v->flags |= VIFF_STATIC;
442         v->threshold=vifc->vifc_threshold;
443         v->bytes_in = 0;
444         v->bytes_out = 0;
445         v->pkt_in = 0;
446         v->pkt_out = 0;
447         v->link = dev->ifindex;
448         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
449                 v->link = dev->iflink;
450
451         /* And finish update writing critical data */
452         write_lock_bh(&mrt_lock);
453         dev_hold(dev);
454         v->dev=dev;
455 #ifdef CONFIG_IP_PIMSM
456         if (v->flags&VIFF_REGISTER)
457                 reg_vif_num = vifi;
458 #endif
459         if (vifi+1 > maxvif)
460                 maxvif = vifi+1;
461         write_unlock_bh(&mrt_lock);
462         return 0;
463 }
464
465 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
466 {
467         int line=MFC_HASH(mcastgrp,origin);
468         struct mfc_cache *c;
469
470         for (c=mfc_cache_array[line]; c; c = c->next) {
471                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
472                         break;
473         }
474         return c;
475 }
476
477 /*
478  *      Allocate a multicast cache entry
479  */
480 static struct mfc_cache *ipmr_cache_alloc(void)
481 {
482         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
483         if(c==NULL)
484                 return NULL;
485         c->mfc_un.res.minvif = MAXVIFS;
486         return c;
487 }
488
489 static struct mfc_cache *ipmr_cache_alloc_unres(void)
490 {
491         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
492         if(c==NULL)
493                 return NULL;
494         skb_queue_head_init(&c->mfc_un.unres.unresolved);
495         c->mfc_un.unres.expires = jiffies + 10*HZ;
496         return c;
497 }
498
499 /*
500  *      A cache entry has gone from the queued to the resolved state
501  */
502
503 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
504 {
505         struct sk_buff *skb;
506         struct nlmsgerr *e;
507
508         /*
509          *      Play the pending entries through our router
510          */
511
512         while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
513                 if (skb->nh.iph->version == 0) {
514                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
515
516                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
517                                 nlh->nlmsg_len = skb->tail - (u8*)nlh;
518                         } else {
519                                 nlh->nlmsg_type = NLMSG_ERROR;
520                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
521                                 skb_trim(skb, nlh->nlmsg_len);
522                                 e = NLMSG_DATA(nlh);
523                                 e->error = -EMSGSIZE;
524                                 memset(&e->msg, 0, sizeof(e->msg));
525                         }
526
527                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
528                 } else
529                         ip_mr_forward(skb, c, 0);
530         }
531 }
532
533 /*
534  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
535  *      expects the following bizarre scheme.
536  *
537  *      Called under mrt_lock.
538  */
539
540 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
541 {
542         struct sk_buff *skb;
543         int ihl = pkt->nh.iph->ihl<<2;
544         struct igmphdr *igmp;
545         struct igmpmsg *msg;
546         int ret;
547
548 #ifdef CONFIG_IP_PIMSM
549         if (assert == IGMPMSG_WHOLEPKT)
550                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
551         else
552 #endif
553                 skb = alloc_skb(128, GFP_ATOMIC);
554
555         if(!skb)
556                 return -ENOBUFS;
557
558 #ifdef CONFIG_IP_PIMSM
559         if (assert == IGMPMSG_WHOLEPKT) {
560                 /* Ugly, but we have no choice with this interface.
561                    Duplicate old header, fix ihl, length etc.
562                    And all this only to mangle msg->im_msgtype and
563                    to set msg->im_mbz to "mbz" :-)
564                  */
565                 msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
566                 skb->nh.raw = skb->h.raw = (u8*)msg;
567                 memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
568                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
569                 msg->im_mbz = 0;
570                 msg->im_vif = reg_vif_num;
571                 skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
572                 skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
573         } else
574 #endif
575         {
576
577         /*
578          *      Copy the IP header
579          */
580
581         skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
582         memcpy(skb->data,pkt->data,ihl);
583         skb->nh.iph->protocol = 0;                      /* Flag to the kernel this is a route add */
584         msg = (struct igmpmsg*)skb->nh.iph;
585         msg->im_vif = vifi;
586         skb->dst = dst_clone(pkt->dst);
587
588         /*
589          *      Add our header
590          */
591
592         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
593         igmp->type      =
594         msg->im_msgtype = assert;
595         igmp->code      =       0;
596         skb->nh.iph->tot_len=htons(skb->len);                   /* Fix the length */
597         skb->h.raw = skb->nh.raw;
598         }
599
600         if (mroute_socket == NULL) {
601                 kfree_skb(skb);
602                 return -EINVAL;
603         }
604
605         /*
606          *      Deliver to mrouted
607          */
608         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
609                 if (net_ratelimit())
610                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
611                 kfree_skb(skb);
612         }
613
614         return ret;
615 }
616
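
/* On the userspace side, the daemon simply read()s its raw IGMP socket;
   an upcall is distinguished from a real IGMP packet by the zero
   protocol field (im_mbz overlays iph->protocol). A minimal sketch,
   with hypothetical handler names (error handling omitted):

        #include <linux/mroute.h>

        char buf[2048];
        int n = read(fd, buf, sizeof(buf));     // fd from MRT_INIT
        struct igmpmsg *im = (struct igmpmsg *)buf;

        if (n >= (int)sizeof(*im) && im->im_mbz == 0) {
                switch (im->im_msgtype) {
                case IGMPMSG_NOCACHE:           // resolve (S,G), then MRT_ADD_MFC
                        add_mfc_for(im->im_src, im->im_dst, im->im_vif);
                        break;
                case IGMPMSG_WRONGVIF:          // PIM assert processing
                        handle_wrong_vif(im);
                        break;
                case IGMPMSG_WHOLEPKT:          // PIMv2 register path
                        pim_register(buf, n);
                        break;
                }
        }
 */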
617 /*
618  *      Queue a packet for resolution. It gets attached to a locked cache entry!
619  */
620
621 static int
622 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
623 {
624         int err;
625         struct mfc_cache *c;
626
627         spin_lock_bh(&mfc_unres_lock);
628         for (c=mfc_unres_queue; c; c=c->next) {
629                 if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
630                     c->mfc_origin == skb->nh.iph->saddr)
631                         break;
632         }
633
634         if (c == NULL) {
635                 /*
636                  *      Create a new entry if allowable
637                  */
638
639                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
640                     (c=ipmr_cache_alloc_unres())==NULL) {
641                         spin_unlock_bh(&mfc_unres_lock);
642
643                         kfree_skb(skb);
644                         return -ENOBUFS;
645                 }
646
647                 /*
648                  *      Fill in the new cache entry
649                  */
650                 c->mfc_parent=-1;
651                 c->mfc_origin=skb->nh.iph->saddr;
652                 c->mfc_mcastgrp=skb->nh.iph->daddr;
653
654                 /*
655                  *      Reflect the first query at mrouted.
656                  */
657                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
658                         /* If the report failed throw the cache entry
659                            out - Brad Parker
660                          */
661                         spin_unlock_bh(&mfc_unres_lock);
662
663                         kmem_cache_free(mrt_cachep, c);
664                         kfree_skb(skb);
665                         return err;
666                 }
667
668                 atomic_inc(&cache_resolve_queue_len);
669                 c->next = mfc_unres_queue;
670                 mfc_unres_queue = c;
671
672                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
673         }
674
675         /*
676          *      See if we can append the packet
677          */
678         if (c->mfc_un.unres.unresolved.qlen>3) {
679                 kfree_skb(skb);
680                 err = -ENOBUFS;
681         } else {
682                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
683                 err = 0;
684         }
685
686         spin_unlock_bh(&mfc_unres_lock);
687         return err;
688 }
689
690 /*
691  *      MFC cache manipulation by user space mroute daemon
692  */
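
/* Illustrative mfcctl as a daemon might fill it for MRT_ADD_MFC
   (all values are examples; error handling omitted):

        struct mfcctl mc;

        memset(&mc, 0, sizeof(mc));
        mc.mfcc_origin.s_addr   = inet_addr("10.1.2.3");    // source S
        mc.mfcc_mcastgrp.s_addr = inet_addr("224.1.2.3");   // group G
        mc.mfcc_parent          = 0;                        // incoming vif
        mc.mfcc_ttls[1]         = 1;                        // forward on vif 1
        setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */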
693
694 static int ipmr_mfc_delete(struct mfcctl *mfc)
695 {
696         int line;
697         struct mfc_cache *c, **cp;
698
699         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
700
701         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
702                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
703                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
704                         write_lock_bh(&mrt_lock);
705                         *cp = c->next;
706                         write_unlock_bh(&mrt_lock);
707
708                         kmem_cache_free(mrt_cachep, c);
709                         return 0;
710                 }
711         }
712         return -ENOENT;
713 }
714
715 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
716 {
717         int line;
718         struct mfc_cache *uc, *c, **cp;
719
720         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
721
722         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
723                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
724                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
725                         break;
726         }
727
728         if (c != NULL) {
729                 write_lock_bh(&mrt_lock);
730                 c->mfc_parent = mfc->mfcc_parent;
731                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
732                 if (!mrtsock)
733                         c->mfc_flags |= MFC_STATIC;
734                 write_unlock_bh(&mrt_lock);
735                 return 0;
736         }
737
738         if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
739                 return -EINVAL;
740
741         c=ipmr_cache_alloc();
742         if (c==NULL)
743                 return -ENOMEM;
744
745         c->mfc_origin=mfc->mfcc_origin.s_addr;
746         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
747         c->mfc_parent=mfc->mfcc_parent;
748         ipmr_update_thresholds(c, mfc->mfcc_ttls);
749         if (!mrtsock)
750                 c->mfc_flags |= MFC_STATIC;
751
752         write_lock_bh(&mrt_lock);
753         c->next = mfc_cache_array[line];
754         mfc_cache_array[line] = c;
755         write_unlock_bh(&mrt_lock);
756
757         /*
758          *      Check to see if we resolved a queued entry. If so we
759          *      need to send the queued frames on and tidy up.
760          */
761         spin_lock_bh(&mfc_unres_lock);
762         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
763              cp = &uc->next) {
764                 if (uc->mfc_origin == c->mfc_origin &&
765                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
766                         *cp = uc->next;
767                         if (atomic_dec_and_test(&cache_resolve_queue_len))
768                                 del_timer(&ipmr_expire_timer);
769                         break;
770                 }
771         }
772         spin_unlock_bh(&mfc_unres_lock);
773
774         if (uc) {
775                 ipmr_cache_resolve(uc, c);
776                 kmem_cache_free(mrt_cachep, uc);
777         }
778         return 0;
779 }
780
781 /*
782  *      Close the multicast socket, and clear the vif tables etc
783  */
784
785 static void mroute_clean_tables(struct sock *sk)
786 {
787         int i;
788
789         /*
790          *      Shut down all active vif entries
791          */
792         for(i=0; i<maxvif; i++) {
793                 if (!(vif_table[i].flags&VIFF_STATIC))
794                         vif_delete(i);
795         }
796
797         /*
798          *      Wipe the cache
799          */
800         for (i=0;i<MFC_LINES;i++) {
801                 struct mfc_cache *c, **cp;
802
803                 cp = &mfc_cache_array[i];
804                 while ((c = *cp) != NULL) {
805                         if (c->mfc_flags&MFC_STATIC) {
806                                 cp = &c->next;
807                                 continue;
808                         }
809                         write_lock_bh(&mrt_lock);
810                         *cp = c->next;
811                         write_unlock_bh(&mrt_lock);
812
813                         kmem_cache_free(mrt_cachep, c);
814                 }
815         }
816
817         if (atomic_read(&cache_resolve_queue_len) != 0) {
818                 struct mfc_cache *c;
819
820                 spin_lock_bh(&mfc_unres_lock);
821                 while (mfc_unres_queue != NULL) {
822                         c = mfc_unres_queue;
823                         mfc_unres_queue = c->next;
824                         spin_unlock_bh(&mfc_unres_lock);
825
826                         ipmr_destroy_unres(c);
827
828                         spin_lock_bh(&mfc_unres_lock);
829                 }
830                 spin_unlock_bh(&mfc_unres_lock);
831         }
832 }
833
834 static void mrtsock_destruct(struct sock *sk)
835 {
836         rtnl_lock();
837         if (sk == mroute_socket) {
838                 ipv4_devconf.mc_forwarding--;
839
840                 write_lock_bh(&mrt_lock);
841                 mroute_socket=NULL;
842                 write_unlock_bh(&mrt_lock);
843
844                 mroute_clean_tables(sk);
845         }
846         rtnl_unlock();
847 }
848
849 /*
850  *      Socket options and virtual interface manipulation. The whole
851  *      virtual interface system is a complete heap, but unfortunately
852  *      that's how BSD mrouted happens to think. Maybe one day with a proper
853  *      MOSPF/PIM router set up we can clean this up.
854  */
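
/* A minimal userspace sketch of the expected call sequence (illustrative
   only; requires CAP_NET_ADMIN, error handling omitted):

        #include <string.h>
        #include <sys/socket.h>
        #include <netinet/in.h>
        #include <arpa/inet.h>
        #include <linux/mroute.h>

        int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
        int one = 1;
        struct vifctl vc;

        setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));

        memset(&vc, 0, sizeof(vc));
        vc.vifc_vifi = 0;                       // index, must be < MAXVIFS
        vc.vifc_threshold = 1;                  // TTL threshold (forward if TTL > 1)
        vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
        setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));

        ... MRT_ADD_MFC / MRT_DEL_MFC take a struct mfcctl; MRT_DONE
        tears everything down via mrtsock_destruct() above ...
 */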
855
856 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
857 {
858         int ret;
859         struct vifctl vif;
860         struct mfcctl mfc;
861
862         if(optname!=MRT_INIT)
863         {
864                 if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
865                         return -EACCES;
866         }
867
868         switch(optname)
869         {
870                 case MRT_INIT:
871                         if (sk->sk_type != SOCK_RAW ||
872                             inet_sk(sk)->num != IPPROTO_IGMP)
873                                 return -EOPNOTSUPP;
874                         if(optlen!=sizeof(int))
875                                 return -ENOPROTOOPT;
876
877                         rtnl_lock();
878                         if (mroute_socket) {
879                                 rtnl_unlock();
880                                 return -EADDRINUSE;
881                         }
882
883                         ret = ip_ra_control(sk, 1, mrtsock_destruct);
884                         if (ret == 0) {
885                                 write_lock_bh(&mrt_lock);
886                                 mroute_socket=sk;
887                                 write_unlock_bh(&mrt_lock);
888
889                                 ipv4_devconf.mc_forwarding++;
890                         }
891                         rtnl_unlock();
892                         return ret;
893                 case MRT_DONE:
894                         if (sk!=mroute_socket)
895                                 return -EACCES;
896                         return ip_ra_control(sk, 0, NULL);
897                 case MRT_ADD_VIF:
898                 case MRT_DEL_VIF:
899                         if(optlen!=sizeof(vif))
900                                 return -EINVAL;
901                         if (copy_from_user(&vif,optval,sizeof(vif)))
902                                 return -EFAULT;
903                         if(vif.vifc_vifi >= MAXVIFS)
904                                 return -ENFILE;
905                         rtnl_lock();
906                         if (optname==MRT_ADD_VIF) {
907                                 ret = vif_add(&vif, sk==mroute_socket);
908                         } else {
909                                 ret = vif_delete(vif.vifc_vifi);
910                         }
911                         rtnl_unlock();
912                         return ret;
913
914                 /*
915                  *      Manipulate the forwarding caches. These live
916                  *      in a sort of kernel/user symbiosis.
917                  */
918                 case MRT_ADD_MFC:
919                 case MRT_DEL_MFC:
920                         if(optlen!=sizeof(mfc))
921                                 return -EINVAL;
922                         if (copy_from_user(&mfc,optval, sizeof(mfc)))
923                                 return -EFAULT;
924                         rtnl_lock();
925                         if (optname==MRT_DEL_MFC)
926                                 ret = ipmr_mfc_delete(&mfc);
927                         else
928                                 ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
929                         rtnl_unlock();
930                         return ret;
931                 /*
932                  *      Control PIM assert.
933                  */
934                 case MRT_ASSERT:
935                 {
936                         int v;
937                         if(get_user(v,(int __user *)optval))
938                                 return -EFAULT;
939                         mroute_do_assert=(v)?1:0;
940                         return 0;
941                 }
942 #ifdef CONFIG_IP_PIMSM
943                 case MRT_PIM:
944                 {
945                         int v, ret;
946                         if(get_user(v,(int __user *)optval))
947                                 return -EFAULT;
948                         v = (v)?1:0;
949                         rtnl_lock();
950                         ret = 0;
951                         if (v != mroute_do_pim) {
952                                 mroute_do_pim = v;
953                                 mroute_do_assert = v;
954 #ifdef CONFIG_IP_PIMSM_V2
955                                 if (mroute_do_pim)
956                                         ret = inet_add_protocol(&pim_protocol,
957                                                                 IPPROTO_PIM);
958                                 else
959                                         ret = inet_del_protocol(&pim_protocol,
960                                                                 IPPROTO_PIM);
961                                 if (ret < 0)
962                                         ret = -EAGAIN;
963 #endif
964                         }
965                         rtnl_unlock();
966                         return ret;
967                 }
968 #endif
969                 /*
970                  *      Spurious command, or MRT_VERSION which you cannot
971                  *      set.
972                  */
973                 default:
974                         return -ENOPROTOOPT;
975         }
976 }
977
978 /*
979  *      Getsock opt support for the multicast routing system.
980  */
981
982 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
983 {
984         int olr;
985         int val;
986
987         if(optname!=MRT_VERSION &&
988 #ifdef CONFIG_IP_PIMSM
989            optname!=MRT_PIM &&
990 #endif
991            optname!=MRT_ASSERT)
992                 return -ENOPROTOOPT;
993
994         if (get_user(olr, optlen))
995                 return -EFAULT;
996
997         if (olr < 0)
998                 return -EINVAL;
999         olr = min_t(unsigned int, olr, sizeof(int));
1000
1001         if(put_user(olr,optlen))
1002                 return -EFAULT;
1003         if(optname==MRT_VERSION)
1004                 val=0x0305;
1005 #ifdef CONFIG_IP_PIMSM
1006         else if(optname==MRT_PIM)
1007                 val=mroute_do_pim;
1008 #endif
1009         else
1010                 val=mroute_do_assert;
1011         if(copy_to_user(optval,&val,olr))
1012                 return -EFAULT;
1013         return 0;
1014 }
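
/* Illustrative userspace counterpart (on the mroute socket from the
   MRT_INIT sketch above; error handling omitted):

        int ver = 0;
        socklen_t len = sizeof(ver);

        getsockopt(fd, IPPROTO_IP, MRT_VERSION, &ver, &len);
        // ver is now 0x0305 on this kernel
 */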
1015
1016 /*
1017  *      The IP multicast ioctl support routines.
1018  */
1019
1020 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1021 {
1022         struct sioc_sg_req sr;
1023         struct sioc_vif_req vr;
1024         struct vif_device *vif;
1025         struct mfc_cache *c;
1026
1027         switch(cmd)
1028         {
1029                 case SIOCGETVIFCNT:
1030                         if (copy_from_user(&vr,arg,sizeof(vr)))
1031                                 return -EFAULT;
1032                         if(vr.vifi>=maxvif)
1033                                 return -EINVAL;
1034                         read_lock(&mrt_lock);
1035                         vif=&vif_table[vr.vifi];
1036                         if(VIF_EXISTS(vr.vifi)) {
1037                                 vr.icount=vif->pkt_in;
1038                                 vr.ocount=vif->pkt_out;
1039                                 vr.ibytes=vif->bytes_in;
1040                                 vr.obytes=vif->bytes_out;
1041                                 read_unlock(&mrt_lock);
1042
1043                                 if (copy_to_user(arg,&vr,sizeof(vr)))
1044                                         return -EFAULT;
1045                                 return 0;
1046                         }
1047                         read_unlock(&mrt_lock);
1048                         return -EADDRNOTAVAIL;
1049                 case SIOCGETSGCNT:
1050                         if (copy_from_user(&sr,arg,sizeof(sr)))
1051                                 return -EFAULT;
1052
1053                         read_lock(&mrt_lock);
1054                         c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1055                         if (c) {
1056                                 sr.pktcnt = c->mfc_un.res.pkt;
1057                                 sr.bytecnt = c->mfc_un.res.bytes;
1058                                 sr.wrong_if = c->mfc_un.res.wrong_if;
1059                                 read_unlock(&mrt_lock);
1060
1061                                 if (copy_to_user(arg,&sr,sizeof(sr)))
1062                                         return -EFAULT;
1063                                 return 0;
1064                         }
1065                         read_unlock(&mrt_lock);
1066                         return -EADDRNOTAVAIL;
1067                 default:
1068                         return -ENOIOCTLCMD;
1069         }
1070 }
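
/* Illustrative userspace use of the counter ioctls above (fd is the
   mroute socket; field names are from <linux/mroute.h>, headers and
   error handling as in the MRT_INIT sketch above):

        struct sioc_sg_req sr;

        memset(&sr, 0, sizeof(sr));
        sr.src.s_addr = inet_addr("10.1.2.3");   // source S
        sr.grp.s_addr = inet_addr("224.1.2.3");  // group G
        if (ioctl(fd, SIOCGETSGCNT, &sr) == 0)
                printf("pkt=%lu bytes=%lu wrong_if=%lu\n",
                       sr.pktcnt, sr.bytecnt, sr.wrong_if);
 */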
1071
1072
1073 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1074 {
1075         struct vif_device *v;
1076         int ct;
1077         if (event != NETDEV_UNREGISTER)
1078                 return NOTIFY_DONE;
1079         v=&vif_table[0];
1080         for(ct=0;ct<maxvif;ct++,v++) {
1081                 if (v->dev==ptr)
1082                         vif_delete(ct);
1083         }
1084         return NOTIFY_DONE;
1085 }
1086
1087
1088 static struct notifier_block ip_mr_notifier={
1089         .notifier_call = ipmr_device_event,
1090 };
1091
1092 /*
1093  *      Encapsulate a packet by attaching a valid IPIP header to it.
1094  *      This avoids tunnel drivers and other mess and gives us the speed so
1095  *      important for multicast video.
1096  */
1097
1098 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1099 {
1100         struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
1101
1102         iph->version    =       4;
1103         iph->tos        =       skb->nh.iph->tos;
1104         iph->ttl        =       skb->nh.iph->ttl;
1105         iph->frag_off   =       0;
1106         iph->daddr      =       daddr;
1107         iph->saddr      =       saddr;
1108         iph->protocol   =       IPPROTO_IPIP;
1109         iph->ihl        =       5;
1110         iph->tot_len    =       htons(skb->len);
1111         ip_select_ident(iph, skb->dst, NULL);
1112         ip_send_check(iph);
1113
1114         skb->h.ipiph = skb->nh.iph;
1115         skb->nh.iph = iph;
1116         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1117         nf_reset(skb);
1118 }
1119
1120 static inline int ipmr_forward_finish(struct sk_buff *skb)
1121 {
1122         struct ip_options * opt = &(IPCB(skb)->opt);
1123
1124         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1125
1126         if (unlikely(opt->optlen))
1127                 ip_forward_options(skb);
1128
1129         return dst_output(skb);
1130 }
1131
1132 /*
1133  *      Processing handlers for ipmr_forward
1134  */
1135
1136 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1137 {
1138         struct iphdr *iph = skb->nh.iph;
1139         struct vif_device *vif = &vif_table[vifi];
1140         struct net_device *dev;
1141         struct rtable *rt;
1142         int    encap = 0;
1143
1144         if (vif->dev == NULL)
1145                 goto out_free;
1146
1147 #ifdef CONFIG_IP_PIMSM
1148         if (vif->flags & VIFF_REGISTER) {
1149                 vif->pkt_out++;
1150                 vif->bytes_out+=skb->len;
1151                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1152                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1153                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1154                 kfree_skb(skb);
1155                 return;
1156         }
1157 #endif
1158
1159         if (vif->flags&VIFF_TUNNEL) {
1160                 struct flowi fl = { .oif = vif->link,
1161                                     .nl_u = { .ip4_u =
1162                                               { .daddr = vif->remote,
1163                                                 .saddr = vif->local,
1164                                                 .tos = RT_TOS(iph->tos) } },
1165                                     .proto = IPPROTO_IPIP };
1166                 if (ip_route_output_key(&rt, &fl))
1167                         goto out_free;
1168                 encap = sizeof(struct iphdr);
1169         } else {
1170                 struct flowi fl = { .oif = vif->link,
1171                                     .nl_u = { .ip4_u =
1172                                               { .daddr = iph->daddr,
1173                                                 .tos = RT_TOS(iph->tos) } },
1174                                     .proto = IPPROTO_IPIP };
1175                 if (ip_route_output_key(&rt, &fl))
1176                         goto out_free;
1177         }
1178
1179         dev = rt->u.dst.dev;
1180
1181         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1182                 /* Do not fragment multicasts. Alas, IPv4 does not
1183                    allow us to send ICMP here, so oversized packets
1184                    disappear into a black hole.
1185                  */
1186
1187                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1188                 ip_rt_put(rt);
1189                 goto out_free;
1190         }
1191
1192         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1193
1194         if (skb_cow(skb, encap)) {
1195                 ip_rt_put(rt);
1196                 goto out_free;
1197         }
1198
1199         vif->pkt_out++;
1200         vif->bytes_out+=skb->len;
1201
1202         dst_release(skb->dst);
1203         skb->dst = &rt->u.dst;
1204         iph = skb->nh.iph;
1205         ip_decrease_ttl(iph);
1206
1207         /* FIXME: forward and output firewalls used to be called here.
1208          * What do we do with netfilter? -- RR */
1209         if (vif->flags & VIFF_TUNNEL) {
1210                 ip_encap(skb, vif->local, vif->remote);
1211                 /* FIXME: extra output firewall step used to be here. --RR */
1212                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1213                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1214         }
1215
1216         IPCB(skb)->flags |= IPSKB_FORWARDED;
1217
1218         /*
1219          * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1220          * not only before forwarding, but also after forwarding on all output
1221          * interfaces. Clearly, if the mrouter runs a multicasting
1222          * program, it should receive packets regardless of which
1223          * interface the program has joined on.
1224          * If we did not do this, the program would have to join on all
1225          * interfaces. On the other hand, a multihomed host (or router, but
1226          * not mrouter) cannot join on more than one interface - it would
1227          * receive multiple copies of each packet.
1228          */
1229         NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1230                 ipmr_forward_finish);
1231         return;
1232
1233 out_free:
1234         kfree_skb(skb);
1235         return;
1236 }
1237
1238 static int ipmr_find_vif(struct net_device *dev)
1239 {
1240         int ct;
1241         for (ct=maxvif-1; ct>=0; ct--) {
1242                 if (vif_table[ct].dev == dev)
1243                         break;
1244         }
1245         return ct;
1246 }
1247
1248 /* "local" means that we should preserve one skb (for local delivery) */
1249
1250 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1251 {
1252         int psend = -1;
1253         int vif, ct;
1254
1255         vif = cache->mfc_parent;
1256         cache->mfc_un.res.pkt++;
1257         cache->mfc_un.res.bytes += skb->len;
1258
1259         /*
1260          * Wrong interface: drop packet and (maybe) send PIM assert.
1261          */
1262         if (vif_table[vif].dev != skb->dev) {
1263                 int true_vifi;
1264
1265                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1266                         /* It is our own packet, looped back.
1267                            Very complicated situation...
1268
1269                            The best workaround until routing daemons are
1270                            fixed is not to redistribute a packet if it was
1271                            sent through the wrong interface. It means that
1272                            multicast applications WILL NOT work for
1273                            (S,G) entries whose default multicast route points
1274                            to the wrong oif. In any case, it is not a good
1275                            idea to run multicasting applications on a router.
1276                          */
1277                         goto dont_forward;
1278                 }
1279
1280                 cache->mfc_un.res.wrong_if++;
1281                 true_vifi = ipmr_find_vif(skb->dev);
1282
1283                 if (true_vifi >= 0 && mroute_do_assert &&
1284                     /* PIM-SM uses asserts when switching from the RPT to the SPT,
1285                        so we cannot check that the packet arrived on an oif.
1286                        It is bad, but otherwise we would need to move a pretty
1287                        large chunk of pimd into the kernel. Ough... --ANK
1288                      */
1289                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1290                     time_after(jiffies,
1291                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1292                         cache->mfc_un.res.last_assert = jiffies;
1293                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1294                 }
1295                 goto dont_forward;
1296         }
1297
1298         vif_table[vif].pkt_in++;
1299         vif_table[vif].bytes_in+=skb->len;
1300
1301         /*
1302          *      Forward the frame
1303          */
1304         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1305                 if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
1306                         if (psend != -1) {
1307                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1308                                 if (skb2)
1309                                         ipmr_queue_xmit(skb2, cache, psend);
1310                         }
1311                         psend=ct;
1312                 }
1313         }
1314         if (psend != -1) {
1315                 if (local) {
1316                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1317                         if (skb2)
1318                                 ipmr_queue_xmit(skb2, cache, psend);
1319                 } else {
1320                         ipmr_queue_xmit(skb, cache, psend);
1321                         return 0;
1322                 }
1323         }
1324
1325 dont_forward:
1326         if (!local)
1327                 kfree_skb(skb);
1328         return 0;
1329 }
1330
1331
1332 /*
1333  *      Multicast packets for forwarding arrive here
1334  */
1335
1336 int ip_mr_input(struct sk_buff *skb)
1337 {
1338         struct mfc_cache *cache;
1339         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1340
1341         /* Packet is looped back after forwarding; it should not be
1342            forwarded a second time, but it can still be delivered locally.
1343          */
1344         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1345                 goto dont_forward;
1346
1347         if (!local) {
1348                     if (IPCB(skb)->opt.router_alert) {
1349                             if (ip_call_ra_chain(skb))
1350                                     return 0;
1351                     } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
1352                             /* IGMPv1 (and broken IGMPv2 implementations such as
1353                                Cisco IOS <= 11.2(8)) do not put the router alert
1354                                option into IGMP packets destined for routable
1355                                groups. This is very bad, because it means that
1356                                otherwise we could forward NO IGMP messages.
1357                              */
1358                             read_lock(&mrt_lock);
1359                             if (mroute_socket) {
1360                                     nf_reset(skb);
1361                                     raw_rcv(mroute_socket, skb);
1362                                     read_unlock(&mrt_lock);
1363                                     return 0;
1364                             }
1365                             read_unlock(&mrt_lock);
1366                     }
1367         }
1368
1369         read_lock(&mrt_lock);
1370         cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
1371
1372         /*
1373          *      No usable cache entry
1374          */
1375         if (cache==NULL) {
1376                 int vif;
1377
1378                 if (local) {
1379                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1380                         ip_local_deliver(skb);
1381                         if (skb2 == NULL) {
1382                                 read_unlock(&mrt_lock);
1383                                 return -ENOBUFS;
1384                         }
1385                         skb = skb2;
1386                 }
1387
1388                 vif = ipmr_find_vif(skb->dev);
1389                 if (vif >= 0) {
1390                         int err = ipmr_cache_unresolved(vif, skb);
1391                         read_unlock(&mrt_lock);
1392
1393                         return err;
1394                 }
1395                 read_unlock(&mrt_lock);
1396                 kfree_skb(skb);
1397                 return -ENODEV;
1398         }
1399
1400         ip_mr_forward(skb, cache, local);
1401
1402         read_unlock(&mrt_lock);
1403
1404         if (local)
1405                 return ip_local_deliver(skb);
1406
1407         return 0;
1408
1409 dont_forward:
1410         if (local)
1411                 return ip_local_deliver(skb);
1412         kfree_skb(skb);
1413         return 0;
1414 }
1415
1416 #ifdef CONFIG_IP_PIMSM_V1
1417 /*
1418  * Handle IGMP messages of PIMv1
1419  */
1420
1421 int pim_rcv_v1(struct sk_buff * skb)
1422 {
1423         struct igmphdr *pim;
1424         struct iphdr   *encap;
1425         struct net_device  *reg_dev = NULL;
1426
1427         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1428                 goto drop;
1429
1430         pim = (struct igmphdr*)skb->h.raw;
1431
1432         if (!mroute_do_pim ||
1433             skb->len < sizeof(*pim) + sizeof(*encap) ||
1434             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1435                 goto drop;
1436
1437         encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
1438         /*
1439            Check that:
1440            a. packet is really destined to a multicast group
1441            b. packet is not a NULL-REGISTER
1442            c. packet is not truncated
1443          */
1444         if (!MULTICAST(encap->daddr) ||
1445             encap->tot_len == 0 ||
1446             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1447                 goto drop;
1448
1449         read_lock(&mrt_lock);
1450         if (reg_vif_num >= 0)
1451                 reg_dev = vif_table[reg_vif_num].dev;
1452         if (reg_dev)
1453                 dev_hold(reg_dev);
1454         read_unlock(&mrt_lock);
1455
1456         if (reg_dev == NULL)
1457                 goto drop;
1458
1459         skb->mac.raw = skb->nh.raw;
1460         skb_pull(skb, (u8*)encap - skb->data);
1461         skb->nh.iph = (struct iphdr *)skb->data;
1462         skb->dev = reg_dev;
1463         skb->protocol = htons(ETH_P_IP);
1464         skb->ip_summed = 0;
1465         skb->pkt_type = PACKET_HOST;
1466         dst_release(skb->dst);
1467         skb->dst = NULL;
1468         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1469         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1470         nf_reset(skb);
1471         netif_rx(skb);
1472         dev_put(reg_dev);
1473         return 0;
1474  drop:
1475         kfree_skb(skb);
1476         return 0;
1477 }
1478 #endif
1479
1480 #ifdef CONFIG_IP_PIMSM_V2
1481 static int pim_rcv(struct sk_buff * skb)
1482 {
1483         struct pimreghdr *pim;
1484         struct iphdr   *encap;
1485         struct net_device  *reg_dev = NULL;
1486
1487         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1488                 goto drop;
1489
1490         pim = (struct pimreghdr*)skb->h.raw;
1491         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1492             (pim->flags&PIM_NULL_REGISTER) ||
1493             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1494              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1495                 goto drop;
1496
1497         /* check if the inner packet is destined to mcast group */
1498         encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
1499         if (!MULTICAST(encap->daddr) ||
1500             encap->tot_len == 0 ||
1501             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1502                 goto drop;
1503
1504         read_lock(&mrt_lock);
1505         if (reg_vif_num >= 0)
1506                 reg_dev = vif_table[reg_vif_num].dev;
1507         if (reg_dev)
1508                 dev_hold(reg_dev);
1509         read_unlock(&mrt_lock);
1510
1511         if (reg_dev == NULL)
1512                 goto drop;
1513
1514         skb->mac.raw = skb->nh.raw;
1515         skb_pull(skb, (u8*)encap - skb->data);
1516         skb->nh.iph = (struct iphdr *)skb->data;
1517         skb->dev = reg_dev;
1518         skb->protocol = htons(ETH_P_IP);
1519         skb->ip_summed = 0;
1520         skb->pkt_type = PACKET_HOST;
1521         dst_release(skb->dst);
1522         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1523         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1524         skb->dst = NULL;
1525         nf_reset(skb);
1526         netif_rx(skb);
1527         dev_put(reg_dev);
1528         return 0;
1529  drop:
1530         kfree_skb(skb);
1531         return 0;
1532 }
1533 #endif
1534
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
        int ct;
        struct rtnexthop *nhp;
        struct net_device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb->tail;
        struct rtattr *mp_head;

        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

        mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));

        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
                if (c->mfc_un.res.ttls[ct] < 255) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = 0;
                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
                }
        }
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb->tail - (u8*)mp_head;
        rtm->rtm_type = RTN_MULTICAST;
        return 1;

rtattr_failure:
        skb_trim(skb, b - skb->data);
        return -EMSGSIZE;
}

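/*
 *      Resolve a multicast route on behalf of rtnetlink.  A cache hit
 *      is answered directly; on a miss (unless the caller cannot wait)
 *      a clone of the skb is queued as an unresolved entry so that the
 *      routing daemon is asked to install a route.
 */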
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
        int err;
        struct mfc_cache *cache;
        struct rtable *rt = (struct rtable*)skb->dst;

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

        if (cache == NULL) {
                struct sk_buff *skb2;
                struct net_device *dev;
                int vif;

                if (nowait) {
                        read_unlock(&mrt_lock);
                        return -EAGAIN;
                }

                dev = skb->dev;
                if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
                        read_unlock(&mrt_lock);
                        return -ENODEV;
                }
                skb2 = skb_clone(skb, GFP_ATOMIC);
                if (!skb2) {
                        read_unlock(&mrt_lock);
                        return -ENOMEM;
                }

                /* Push a minimal IP header carrying just the addresses;
                 * version 0 makes it recognisable as a fake header.
                 */
                skb2->nh.raw = skb_push(skb2, sizeof(struct iphdr));
                skb2->nh.iph->ihl = sizeof(struct iphdr)>>2;
                skb2->nh.iph->saddr = rt->rt_src;
                skb2->nh.iph->daddr = rt->rt_dst;
                skb2->nh.iph->version = 0;
                err = ipmr_cache_unresolved(vif, skb2);
                read_unlock(&mrt_lock);
                return err;
        }

        if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
                cache->mfc_flags |= MFC_NOTIFY;
        err = ipmr_fill_mroute(skb, cache, rtm);
        read_unlock(&mrt_lock);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 *      The /proc interfaces to multicast routing:
 *      /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
        int ct;
};

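/*
 *      Plain seq_file iteration over the vif table: start() takes
 *      mrt_lock and positions the iterator, next() skips empty slots,
 *      stop() drops the lock, and show() emits one row (or the header
 *      for SEQ_START_TOKEN).
 */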
static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
                                           loff_t pos)
{
        for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                if (pos-- == 0)
                        return &vif_table[iter->ct];
        }
        return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
        read_lock(&mrt_lock);
        return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ipmr_vif_iter *iter = seq->private;

        ++*pos;
        if (v == SEQ_START_TOKEN)
                return ipmr_vif_seq_idx(iter, 0);

        while (++iter->ct < maxvif) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                return &vif_table[iter->ct];
        }
        return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
        } else {
                const struct vif_device *vif = v;
                const char *name = vif->dev ? vif->dev->name : "none";

                seq_printf(seq,
                           "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
                           vif - vif_table,
                           name, vif->bytes_in, vif->pkt_in,
                           vif->bytes_out, vif->pkt_out,
                           vif->flags, vif->local, vif->remote);
        }
        return 0;
}

static struct seq_operations ipmr_vif_seq_ops = {
        .start = ipmr_vif_seq_start,
        .next  = ipmr_vif_seq_next,
        .stop  = ipmr_vif_seq_stop,
        .show  = ipmr_vif_seq_show,
};

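/*
 *      open() hangs the iterator off the seq_file as private data;
 *      seq_release_private() frees it again on close.
 */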
static int ipmr_vif_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_vif_seq_ops);
        if (rc)
                goto out_kfree;

        s->ct = 0;
        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations ipmr_vif_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_vif_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};

struct ipmr_mfc_iter {
        struct mfc_cache **cache;
        int ct;
};

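/*
 *      The mfc dump walks two structures under two different locks:
 *      the resolved entries in mfc_cache_array under mrt_lock, then
 *      the unresolved queue under mfc_unres_lock.  it->cache records
 *      which structure (and therefore which lock) the iterator is
 *      currently in, so stop() can release the right one.
 */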
static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
        struct mfc_cache *mfc;

        it->cache = mfc_cache_array;
        read_lock(&mrt_lock);
        for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
                for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
                        if (pos-- == 0)
                                return mfc;
        read_unlock(&mrt_lock);

        it->cache = &mfc_unres_queue;
        spin_lock_bh(&mfc_unres_lock);
        for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
                if (pos-- == 0)
                        return mfc;
        spin_unlock_bh(&mfc_unres_lock);

        it->cache = NULL;
        return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct ipmr_mfc_iter *it = seq->private;
        it->cache = NULL;
        it->ct = 0;
        return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

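/*
 *      next() advances within a hash chain first, then across chains.
 *      Once the resolved cache is exhausted it drops mrt_lock and
 *      takes mfc_unres_lock before walking the unresolved queue, so
 *      exactly one of the two locks is held at any point of the dump.
 */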
static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct mfc_cache *mfc = v;
        struct ipmr_mfc_iter *it = seq->private;

        ++*pos;

        if (v == SEQ_START_TOKEN)
                return ipmr_mfc_seq_idx(seq->private, 0);

        if (mfc->next)
                return mfc->next;

        if (it->cache == &mfc_unres_queue)
                goto end_of_list;

        BUG_ON(it->cache != mfc_cache_array);

        while (++it->ct < MFC_LINES) {
                mfc = mfc_cache_array[it->ct];
                if (mfc)
                        return mfc;
        }

        /* exhausted cache_array, show unresolved */
        read_unlock(&mrt_lock);
        it->cache = &mfc_unres_queue;
        it->ct = 0;

        spin_lock_bh(&mfc_unres_lock);
        mfc = mfc_unres_queue;
        if (mfc)
                return mfc;

 end_of_list:
        spin_unlock_bh(&mfc_unres_lock);
        it->cache = NULL;

        return NULL;
}

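/*
 *      stop() releases whichever lock the iterator still holds;
 *      it->cache is NULL once the walk has run off the end.
 */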
static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
        struct ipmr_mfc_iter *it = seq->private;

        if (it->cache == &mfc_unres_queue)
                spin_unlock_bh(&mfc_unres_lock);
        else if (it->cache == mfc_cache_array)
                read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
        int n;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
        } else {
                const struct mfc_cache *mfc = v;
                const struct ipmr_mfc_iter *it = seq->private;

                seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
                           (unsigned long) mfc->mfc_mcastgrp,
                           (unsigned long) mfc->mfc_origin,
                           mfc->mfc_parent,
                           mfc->mfc_un.res.pkt,
                           mfc->mfc_un.res.bytes,
                           mfc->mfc_un.res.wrong_if);

                /* unresolved entries have no valid oif list yet */
                if (it->cache != &mfc_unres_queue) {
                        for (n = mfc->mfc_un.res.minvif;
                             n < mfc->mfc_un.res.maxvif; n++) {
                                if (VIF_EXISTS(n)
                                    && mfc->mfc_un.res.ttls[n] < 255)
                                        seq_printf(seq,
                                                   " %2d:%-3d",
                                                   n, mfc->mfc_un.res.ttls[n]);
                        }
                }
                seq_putc(seq, '\n');
        }
        return 0;
}

static struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = ipmr_mfc_seq_next,
        .stop  = ipmr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_mfc_seq_ops);
        if (rc)
                goto out_kfree;

        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static struct file_operations ipmr_mfc_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_mfc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
#endif

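/*
 *      IPPROTO_PIM receive handler; it is added with inet_add_protocol()
 *      (and removed again) when the daemon toggles the MRT_PIM socket
 *      option.
 */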
#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol = {
        .handler        =       pim_rcv,
};
#endif

/*
 *      Setup for IP multicast routing
 */

void __init ip_mr_init(void)
{
        /* slab cache for resolved and unresolved mfc entries */
        mrt_cachep = kmem_cache_create("ip_mrt_cache",
                                       sizeof(struct mfc_cache),
                                       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                       NULL, NULL);
        /* timer that reaps entries stuck on the unresolved queue */
        init_timer(&ipmr_expire_timer);
        ipmr_expire_timer.function = ipmr_expire_process;
        /* drop vifs when their underlying device is unregistered */
        register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
        proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
        proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}