gre: Move MTU setting out of ipgre_tunnel_bind_dev
[pandora-kernel.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/if_ether.h>
31
32 #include <net/sock.h>
33 #include <net/ip.h>
34 #include <net/icmp.h>
35 #include <net/protocol.h>
36 #include <net/ipip.h>
37 #include <net/arp.h>
38 #include <net/checksum.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
41 #include <net/xfrm.h>
42 #include <net/net_namespace.h>
43 #include <net/netns/generic.h>
44
45 #ifdef CONFIG_IPV6
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #endif
50
51 /*
52    Problems & solutions
53    --------------------
54
55    1. The most important issue is detecting local dead loops.
56    They would cause complete host lockup in transmit, which
57    would be "resolved" by stack overflow or, if queueing is enabled,
58    with infinite looping in net_bh.
59
60    We cannot track such dead loops during route installation,
61    it is infeasible task. The most general solutions would be
62    to keep skb->encapsulation counter (sort of local ttl),
63    and silently drop packet when it expires. It is the best
64    solution, but it supposes maintaining new variable in ALL
65    skb, even if no tunneling is used.
66
67    Current solution: t->recursion lock breaks dead loops. It looks
68    like dev->tbusy flag, but I preferred new variable, because
69    the semantics is different. One day, when hard_start_xmit
70    will be multithreaded we will have to use skb->encapsulation.
71
72
73
74    2. Networking dead loops would not kill routers, but would really
75    kill network. IP hop limit plays role of "t->recursion" in this case,
76    if we copy it from packet being encapsulated to upper header.
77    It is very good solution, but it introduces two problems:
78
79    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80      do not work over tunnels.
81    - traceroute does not work. I planned to relay ICMP from tunnel,
82      so that this problem would be solved and traceroute output
83      would be even more informative. This idea appeared to be wrong:
84      only Linux complies to rfc1812 now (yes, guys, Linux is the only
85      true router now :-)), all routers (at least, in neighbourhood of mine)
86      return only 8 bytes of payload. It is the end.
87
88    Hence, if we want that OSPF worked or traceroute said something reasonable,
89    we should search for another solution.
90
91    One of them is to parse packet trying to detect inner encapsulation
92    made by our node. It is difficult or even impossible, especially,
93    taking into account fragmentation. To be short, it is not a solution at all.
94
95    Current solution: The solution was UNEXPECTEDLY SIMPLE.
96    We force DF flag on tunnels with preconfigured hop limit,
97    that is ALL. :-) Well, it does not remove the problem completely,
98    but exponential growth of network traffic is changed to linear
99    (branches, that exceed pmtu are pruned) and tunnel mtu
100    quickly degrades to value <68, where looping stops.
101    Yes, it is not good if there exists a router in the loop,
102    which does not force DF, even when encapsulating packets have DF set.
103    But it is not our problem! Nobody could accuse us, we made
104    all that we could make. Even if it is your gated who injected
105    fatal route to network, even if it were you who configured
106    fatal static route: you are innocent. :-)
107
108
109
110    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111    practically identical code. It would be good to glue them
112    together, but it is not very evident, how to make them modular.
113    sit is integral part of IPv6, ipip and gre are naturally modular.
114    We could extract common parts (hash table, ioctl etc)
115    to a separate module (ip_tunnel.c).
116
117    Alexey Kuznetsov.
118  */
119
/* Forward declarations for functions that are referenced before their definitions. */
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122 static int ipgre_tunnel_bind_dev(struct net_device *dev);
123
124 /* Fallback tunnel: no source, no destination, no key, no options */
125
126 static int ipgre_fb_tunnel_init(struct net_device *dev);
127
/* Number of buckets per tunnel hash table; HASH() masks with 0xF, i.e. HASH_SIZE - 1. */
128 #define HASH_SIZE  16
129
/* Identifier passed to net_generic() to fetch this module's struct ipgre_net. */
130 static int ipgre_net_id;
/* GRE state looked up per struct net via net_generic(net, ipgre_net_id). */
131 struct ipgre_net {
            /*
             * Four hash tables of tunnels, selected by which endpoint
             * addresses are configured (see __ipgre_bucket()); each bucket
             * is a singly linked list chained through ip_tunnel::next.
             */
132         struct ip_tunnel *tunnels[4][HASH_SIZE];
133
            /* Fallback device returned by ipgre_tunnel_lookup() when nothing else matches. */
134         struct net_device *fb_tunnel_dev;
135 };
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if not matched configured keyless tunnels
152    will match fallback tunnel.
153  */
154
/*
 * Fold a 32-bit address or key into a bucket index in the range 0..15
 * (HASH_SIZE buckets).  The argument is fully parenthesized so that the
 * cast and the shift apply to the whole expression, whatever the caller
 * passes in.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)

/*
 * Named views of ipgre_net::tunnels, one per (remote, local) wildcard
 * combination.  The index is the "prio" computed by __ipgre_bucket():
 * bit 0 set when a local address is configured, bit 1 set when a unicast
 * remote address is configured.
 */
#define tunnels_r_l     tunnels[3]
#define tunnels_r       tunnels[2]
#define tunnels_l       tunnels[1]
#define tunnels_wc      tunnels[0]

/*
 * Protects the tunnel hash chains: taken for writing when a tunnel is
 * linked or unlinked, and for reading around ipgre_tunnel_lookup() in the
 * receive and ICMP error paths.
 */
static DEFINE_RWLOCK(ipgre_lock);
163
164 /* Given src, dst and key, find appropriate for input tunnel. */
165
166 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
167                 __be32 remote, __be32 local, __be32 key)
168 {
169         unsigned h0 = HASH(remote);
170         unsigned h1 = HASH(key);
171         struct ip_tunnel *t;
172         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
173
174         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
175                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
176                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
177                                 return t;
178                 }
179         }
180         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
181                 if (remote == t->parms.iph.daddr) {
182                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
183                                 return t;
184                 }
185         }
186         for (t = ign->tunnels_l[h1]; t; t = t->next) {
187                 if (local == t->parms.iph.saddr ||
188                      (local == t->parms.iph.daddr &&
189                       ipv4_is_multicast(local))) {
190                         if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
191                                 return t;
192                 }
193         }
194         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
195                 if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
196                         return t;
197         }
198
199         if (ign->fb_tunnel_dev->flags&IFF_UP)
200                 return netdev_priv(ign->fb_tunnel_dev);
201         return NULL;
202 }
203
204 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
205                 struct ip_tunnel_parm *parms)
206 {
207         __be32 remote = parms->iph.daddr;
208         __be32 local = parms->iph.saddr;
209         __be32 key = parms->i_key;
210         unsigned h = HASH(key);
211         int prio = 0;
212
213         if (local)
214                 prio |= 1;
215         if (remote && !ipv4_is_multicast(remote)) {
216                 prio |= 2;
217                 h ^= HASH(remote);
218         }
219
220         return &ign->tunnels[prio][h];
221 }
222
223 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
224                 struct ip_tunnel *t)
225 {
226         return __ipgre_bucket(ign, &t->parms);
227 }
228
229 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
230 {
231         struct ip_tunnel **tp = ipgre_bucket(ign, t);
232
233         t->next = *tp;
234         write_lock_bh(&ipgre_lock);
235         *tp = t;
236         write_unlock_bh(&ipgre_lock);
237 }
238
239 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
240 {
241         struct ip_tunnel **tp;
242
243         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
244                 if (t == *tp) {
245                         write_lock_bh(&ipgre_lock);
246                         *tp = t->next;
247                         write_unlock_bh(&ipgre_lock);
248                         break;
249                 }
250         }
251 }
252
253 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
254                 struct ip_tunnel_parm *parms, int create)
255 {
256         __be32 remote = parms->iph.daddr;
257         __be32 local = parms->iph.saddr;
258         __be32 key = parms->i_key;
259         struct ip_tunnel *t, **tp, *nt;
260         struct net_device *dev;
261         char name[IFNAMSIZ];
262         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
263
264         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
265                 if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
266                         if (key == t->parms.i_key)
267                                 return t;
268                 }
269         }
270         if (!create)
271                 return NULL;
272
273         if (parms->name[0])
274                 strlcpy(name, parms->name, IFNAMSIZ);
275         else
276                 sprintf(name, "gre%%d");
277
278         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
279         if (!dev)
280           return NULL;
281
282         dev_net_set(dev, net);
283
284         if (strchr(name, '%')) {
285                 if (dev_alloc_name(dev, name) < 0)
286                         goto failed_free;
287         }
288
289         dev->init = ipgre_tunnel_init;
290         nt = netdev_priv(dev);
291         nt->parms = *parms;
292
293         dev->mtu = ipgre_tunnel_bind_dev(dev);
294
295         if (register_netdevice(dev) < 0)
296                 goto failed_free;
297
298         dev_hold(dev);
299         ipgre_tunnel_link(ign, nt);
300         return nt;
301
302 failed_free:
303         free_netdev(dev);
304         return NULL;
305 }
306
307 static void ipgre_tunnel_uninit(struct net_device *dev)
308 {
309         struct net *net = dev_net(dev);
310         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
311
312         ipgre_tunnel_unlink(ign, netdev_priv(dev));
313         dev_put(dev);
314 }
315
316
317 static void ipgre_err(struct sk_buff *skb, u32 info)
318 {
319
320 /* All the routers (except for Linux) return only
321    8 bytes of packet payload. It means, that precise relaying of
322    ICMP in the real Internet is absolutely infeasible.
323
324    Moreover, Cisco "wise men" put GRE key to the third word
325    in GRE header. It makes impossible maintaining even soft state for keyed
326    GRE tunnels with enabled checksum. Tell them "thank you".
327
328    Well, I wonder, rfc1812 was written by Cisco employee,
329    what the hell these idiots break standrads established
330    by themself???
331  */
332
333         struct iphdr *iph = (struct iphdr*)skb->data;
334         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
335         int grehlen = (iph->ihl<<2) + 4;
336         const int type = icmp_hdr(skb)->type;
337         const int code = icmp_hdr(skb)->code;
338         struct ip_tunnel *t;
339         __be16 flags;
340
341         flags = p[0];
342         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
343                 if (flags&(GRE_VERSION|GRE_ROUTING))
344                         return;
345                 if (flags&GRE_KEY) {
346                         grehlen += 4;
347                         if (flags&GRE_CSUM)
348                                 grehlen += 4;
349                 }
350         }
351
352         /* If only 8 bytes returned, keyed message will be dropped here */
353         if (skb_headlen(skb) < grehlen)
354                 return;
355
356         switch (type) {
357         default:
358         case ICMP_PARAMETERPROB:
359                 return;
360
361         case ICMP_DEST_UNREACH:
362                 switch (code) {
363                 case ICMP_SR_FAILED:
364                 case ICMP_PORT_UNREACH:
365                         /* Impossible event. */
366                         return;
367                 case ICMP_FRAG_NEEDED:
368                         /* Soft state for pmtu is maintained by IP core. */
369                         return;
370                 default:
371                         /* All others are translated to HOST_UNREACH.
372                            rfc2003 contains "deep thoughts" about NET_UNREACH,
373                            I believe they are just ether pollution. --ANK
374                          */
375                         break;
376                 }
377                 break;
378         case ICMP_TIME_EXCEEDED:
379                 if (code != ICMP_EXC_TTL)
380                         return;
381                 break;
382         }
383
384         read_lock(&ipgre_lock);
385         t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
386                         (flags&GRE_KEY) ?
387                         *(((__be32*)p) + (grehlen>>2) - 1) : 0);
388         if (t == NULL || t->parms.iph.daddr == 0 ||
389             ipv4_is_multicast(t->parms.iph.daddr))
390                 goto out;
391
392         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
393                 goto out;
394
395         if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
396                 t->err_count++;
397         else
398                 t->err_count = 1;
399         t->err_time = jiffies;
400 out:
401         read_unlock(&ipgre_lock);
402         return;
403 }
404
405 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
406 {
407         if (INET_ECN_is_ce(iph->tos)) {
408                 if (skb->protocol == htons(ETH_P_IP)) {
409                         IP_ECN_set_ce(ip_hdr(skb));
410                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
411                         IP6_ECN_set_ce(ipv6_hdr(skb));
412                 }
413         }
414 }
415
416 static inline u8
417 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
418 {
419         u8 inner = 0;
420         if (skb->protocol == htons(ETH_P_IP))
421                 inner = old_iph->tos;
422         else if (skb->protocol == htons(ETH_P_IPV6))
423                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
424         return INET_ECN_encapsulate(tos, inner);
425 }
426
/*
 * Receive handler for incoming GRE packets.
 *
 * Parses the GRE header found at skb->data, looks up the tunnel matching
 * the outer addresses and key, strips the GRE header and hands the inner
 * packet to netif_rx() on the tunnel device.  If no tunnel matches, an ICMP
 * port-unreachable is sent and the packet is freed.
 *
 * Always returns 0; on every drop path the skb is freed here.
 */
427 static int ipgre_rcv(struct sk_buff *skb)
428 {
429         struct iphdr *iph;
430         u8     *h;
431         __be16    flags;
432         __sum16   csum = 0;
433         __be32 key = 0;
434         u32    seqno = 0;
435         struct ip_tunnel *tunnel;
            /* Length of the GRE header parsed so far; starts at the 4-byte base header. */
436         int    offset = 4;
437
            /* 16 = base header plus the optional checksum, key and sequence words. */
438         if (!pskb_may_pull(skb, 16))
439                 goto drop_nolock;
440
441         iph = ip_hdr(skb);
442         h = skb->data;
443         flags = *(__be16*)h;
444
445         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
446                 /* - Version must be 0.
447                    - We do not support routing headers.
448                  */
449                 if (flags&(GRE_VERSION|GRE_ROUTING))
450                         goto drop_nolock;
451
452                 if (flags&GRE_CSUM) {
                            /*
                             * Verify the checksum: trust a CHECKSUM_COMPLETE value
                             * that folds to zero, otherwise recompute it in software.
                             * A non-zero csum is turned into a drop further down.
                             */
453                         switch (skb->ip_summed) {
454                         case CHECKSUM_COMPLETE:
455                                 csum = csum_fold(skb->csum);
456                                 if (!csum)
457                                         break;
458                                 /* fall through */
459                         case CHECKSUM_NONE:
460                                 skb->csum = 0;
461                                 csum = __skb_checksum_complete(skb);
462                                 skb->ip_summed = CHECKSUM_COMPLETE;
463                         }
464                         offset += 4;
465                 }
466                 if (flags&GRE_KEY) {
467                         key = *(__be32*)(h + offset);
468                         offset += 4;
469                 }
470                 if (flags&GRE_SEQ) {
471                         seqno = ntohl(*(__be32*)(h + offset));
472                         offset += 4;
473                 }
474         }
475
            /* The lock is held from the lookup until the tunnel is no longer used. */
476         read_lock(&ipgre_lock);
477         if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
478                                         iph->saddr, iph->daddr, key)) != NULL) {
479                 struct net_device_stats *stats = &tunnel->dev->stats;
480
481                 secpath_reset(skb);
482
                    /* The second 16-bit word of the GRE header is the payload protocol. */
483                 skb->protocol = *(__be16*)(h + 2);
484                 /* WCCP version 1 and 2 protocol decoding.
485                  * - Change protocol to IP
486                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
487                  */
488                 if (flags == 0 &&
489                     skb->protocol == htons(ETH_P_WCCP)) {
490                         skb->protocol = htons(ETH_P_IP);
                            /* No IPv4 version nibble right after the header: skip 4 more bytes. */
491                         if ((*(h + offset) & 0xF0) != 0x40)
492                                 offset += 4;
493                 }
494
                    /* Strip the GRE header; the inner packet becomes the network header. */
495                 skb->mac_header = skb->network_header;
496                 __pskb_pull(skb, offset);
497                 skb_reset_network_header(skb);
498                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
499                 skb->pkt_type = PACKET_HOST;
500 #ifdef CONFIG_NET_IPGRE_BROADCAST
501                 if (ipv4_is_multicast(iph->daddr)) {
502                         /* Looped back packet, drop it! */
503                         if (skb->rtable->fl.iif == 0)
504                                 goto drop;
505                         stats->multicast++;
506                         skb->pkt_type = PACKET_BROADCAST;
507                 }
508 #endif
509
                    /*
                     * Drop if the packet's checksum was bad, or if the tunnel is
                     * configured to expect a checksum and the packet carries none.
                     */
510                 if (((flags&GRE_CSUM) && csum) ||
511                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
512                         stats->rx_crc_errors++;
513                         stats->rx_errors++;
514                         goto drop;
515                 }
                    /*
                     * With sequencing enabled on the tunnel, drop packets that lack
                     * a sequence number or whose number is behind the expected one
                     * (signed difference, so wrap-around is tolerated).
                     */
516                 if (tunnel->parms.i_flags&GRE_SEQ) {
517                         if (!(flags&GRE_SEQ) ||
518                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
519                                 stats->rx_fifo_errors++;
520                                 stats->rx_errors++;
521                                 goto drop;
522                         }
523                         tunnel->i_seqno = seqno + 1;
524                 }
525                 stats->rx_packets++;
526                 stats->rx_bytes += skb->len;
                    /* Re-inject the inner packet as if it had arrived on the tunnel device. */
527                 skb->dev = tunnel->dev;
528                 dst_release(skb->dst);
529                 skb->dst = NULL;
530                 nf_reset(skb);
531                 ipgre_ecn_decapsulate(iph, skb);
532                 netif_rx(skb);
533                 read_unlock(&ipgre_lock);
534                 return(0);
535         }
            /* No tunnel (not even the fallback) accepted the packet. */
536         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
537
538 drop:
539         read_unlock(&ipgre_lock);
540 drop_nolock:
541         kfree_skb(skb);
542         return(0);
543 }
544
/*
 * Transmit handler of a GRE tunnel device: encapsulate @skb in an outer
 * IPv4 + GRE header and send it towards the tunnel's remote endpoint.
 *
 * Steps, in order: guard against re-entry, pick the outer destination
 * (configured, or derived from the inner route/neighbour for NBMA tunnels),
 * route the outer packet, apply path-MTU handling, make sure there is
 * enough writable headroom, then build the outer IP and GRE headers.
 *
 * Always returns 0.  On every error path the skb is freed and tx_errors
 * (or tx_dropped for the allocation failure) is incremented.
 */
545 static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
546 {
547         struct ip_tunnel *tunnel = netdev_priv(dev);
548         struct net_device_stats *stats = &tunnel->dev->stats;
549         struct iphdr  *old_iph = ip_hdr(skb);
550         struct iphdr  *tiph;
551         u8     tos;
552         __be16 df;
553         struct rtable *rt;                      /* Route to the other host */
554         struct net_device *tdev;                        /* Device to other host */
555         struct iphdr  *iph;                     /* Our new IP header */
556         unsigned int max_headroom;              /* The extra header space needed */
557         int    gre_hlen;
558         __be32 dst;
559         int    mtu;
560
            /*
             * Dead-loop protection (see "Problems & solutions" at the top of the
             * file): a nested entry for the same tunnel is counted as a collision
             * and dropped.  Every exit path below decrements the counter again.
             */
561         if (tunnel->recursion++) {
562                 stats->collisions++;
563                 goto tx_error;
564         }
565
            /*
             * With header_ops set, the outer header template is read from the
             * packet itself and no extra header is pushed here (gre_hlen = 0).
             * NOTE(review): presumably the header_ops callback already built the
             * IP+GRE header at skb->data -- confirm against the header_ops code.
             */
566         if (dev->header_ops) {
567                 gre_hlen = 0;
568                 tiph = (struct iphdr*)skb->data;
569         } else {
570                 gre_hlen = tunnel->hlen;
571                 tiph = &tunnel->parms.iph;
572         }
573
574         if ((dst = tiph->daddr) == 0) {
575                 /* NBMA tunnel */
576
577                 if (skb->dst == NULL) {
578                         stats->tx_fifo_errors++;
579                         goto tx_error;
580                 }
581
                    /* IPv4 payload: the outer destination is the inner route's gateway. */
582                 if (skb->protocol == htons(ETH_P_IP)) {
583                         rt = skb->rtable;
584                         if ((dst = rt->rt_gateway) == 0)
585                                 goto tx_error_icmp;
586                 }
587 #ifdef CONFIG_IPV6
                    /*
                     * IPv6 payload: use the neighbour's address (or the packet's own
                     * destination if that is unspecified); it must be an
                     * IPv4-compatible address, whose last 32 bits give the endpoint.
                     */
588                 else if (skb->protocol == htons(ETH_P_IPV6)) {
589                         struct in6_addr *addr6;
590                         int addr_type;
591                         struct neighbour *neigh = skb->dst->neighbour;
592
593                         if (neigh == NULL)
594                                 goto tx_error;
595
596                         addr6 = (struct in6_addr*)&neigh->primary_key;
597                         addr_type = ipv6_addr_type(addr6);
598
599                         if (addr_type == IPV6_ADDR_ANY) {
600                                 addr6 = &ipv6_hdr(skb)->daddr;
601                                 addr_type = ipv6_addr_type(addr6);
602                         }
603
604                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
605                                 goto tx_error_icmp;
606
607                         dst = addr6->s6_addr32[3];
608                 }
609 #endif
610                 else
611                         goto tx_error;
612         }
613
            /*
             * Bit 0 of the configured TOS is used as an "inherit" flag: when set,
             * the TOS is copied from an inner IPv4 packet; the flag bit itself is
             * always cleared before use.
             */
614         tos = tiph->tos;
615         if (tos&1) {
616                 if (skb->protocol == htons(ETH_P_IP))
617                         tos = old_iph->tos;
618                 tos &= ~1;
619         }
620
            /* Route the outer (encapsulating) packet. */
621         {
622                 struct flowi fl = { .oif = tunnel->parms.link,
623                                     .nl_u = { .ip4_u =
624                                               { .daddr = dst,
625                                                 .saddr = tiph->saddr,
626                                                 .tos = RT_TOS(tos) } },
627                                     .proto = IPPROTO_GRE };
628                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
629                         stats->tx_carrier_errors++;
630                         goto tx_error;
631                 }
632         }
633         tdev = rt->u.dst.dev;
634
            /* The outer route points back at this very tunnel: refuse to loop. */
635         if (tdev == dev) {
636                 ip_rt_put(rt);
637                 stats->collisions++;
638                 goto tx_error;
639         }
640
            /*
             * With DF configured on the tunnel, the usable MTU is the outer path
             * MTU minus the headers added here; otherwise take the MTU of the
             * inner route, or of the tunnel device if there is no inner route.
             */
641         df = tiph->frag_off;
642         if (df)
643                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
644         else
645                 mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
646
647         if (skb->dst)
648                 skb->dst->ops->update_pmtu(skb->dst, mtu);
649
650         if (skb->protocol == htons(ETH_P_IP)) {
                    /* An inner DF bit is propagated to the outer header. */
651                 df |= (old_iph->frag_off&htons(IP_DF));
652
                    /* Inner packet has DF set and does not fit: tell the sender the MTU. */
653                 if ((old_iph->frag_off&htons(IP_DF)) &&
654                     mtu < ntohs(old_iph->tot_len)) {
655                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
656                         ip_rt_put(rt);
657                         goto tx_error;
658                 }
659         }
660 #ifdef CONFIG_IPV6
661         else if (skb->protocol == htons(ETH_P_IPV6)) {
662                 struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
663
                    /*
                     * Lower the MTU metric on the inner IPv6 route, but only for
                     * unicast point-to-point tunnels or host (/128) routes.
                     */
664                 if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
665                         if ((tunnel->parms.iph.daddr &&
666                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
667                             rt6->rt6i_dst.plen == 128) {
668                                 rt6->rt6i_flags |= RTF_MODIFIED;
669                                 skb->dst->metrics[RTAX_MTU-1] = mtu;
670                         }
671                 }
672
673                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
674                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
675                         ip_rt_put(rt);
676                         goto tx_error;
677                 }
678         }
679 #endif
680
            /*
             * err_count/err_time are set by ipgre_err() when ICMP errors come
             * back for this tunnel.  While recent errors are outstanding, report
             * a link failure for one outgoing packet per recorded error; the
             * packet itself is still transmitted.
             */
681         if (tunnel->err_count > 0) {
682                 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
683                         tunnel->err_count--;
684
685                         dst_link_failure(skb);
686                 } else
687                         tunnel->err_count = 0;
688         }
689
690         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
691
            /*
             * Get a private copy with enough headroom if the skb is too small,
             * shared, or a clone whose data is not writable.
             */
692         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
693             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
694                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
695                 if (!new_skb) {
696                         ip_rt_put(rt);
697                         stats->tx_dropped++;
698                         dev_kfree_skb(skb);
699                         tunnel->recursion--;
700                         return 0;
701                 }
                    /* Keep the new skb charged to the same socket as the old one. */
702                 if (skb->sk)
703                         skb_set_owner_w(new_skb, skb->sk);
704                 dev_kfree_skb(skb);
705                 skb = new_skb;
                    /* The data moved, so the inner header pointer must be refreshed. */
706                 old_iph = ip_hdr(skb);
707         }
708
            /*
             * The inner header becomes the transport header; make room for the
             * outer IP+GRE header in front of it and attach the outer route.
             */
709         skb->transport_header = skb->network_header;
710         skb_push(skb, gre_hlen);
711         skb_reset_network_header(skb);
712         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
713         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
714                               IPSKB_REROUTED);
715         dst_release(skb->dst);
716         skb->dst = &rt->u.dst;
717
718         /*
719          *      Push down and install the outer IP header.
720          */
721
722         iph                     =       ip_hdr(skb);
723         iph->version            =       4;
724         iph->ihl                =       sizeof(struct iphdr) >> 2;
725         iph->frag_off           =       df;
726         iph->protocol           =       IPPROTO_GRE;
727         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
728         iph->daddr              =       rt->rt_dst;
729         iph->saddr              =       rt->rt_src;
730
            /*
             * A configured TTL of 0 means "inherit": copy the inner TTL / hop
             * limit, or fall back to the route's hop-limit metric.
             */
731         if ((iph->ttl = tiph->ttl) == 0) {
732                 if (skb->protocol == htons(ETH_P_IP))
733                         iph->ttl = old_iph->ttl;
734 #ifdef CONFIG_IPV6
735                 else if (skb->protocol == htons(ETH_P_IPV6))
736                         iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
737 #endif
738                 else
739                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
740         }
741
            /* Base GRE header: flags word followed by the payload protocol. */
742         ((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
743         ((__be16*)(iph+1))[1] = skb->protocol;
744
            /*
             * Optional GRE words are filled in from the last one backwards:
             * ptr starts at the final 32-bit word of the header (sequence
             * number), then moves back to the key and finally the checksum.
             */
745         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
746                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
747
748                 if (tunnel->parms.o_flags&GRE_SEQ) {
749                         ++tunnel->o_seqno;
750                         *ptr = htonl(tunnel->o_seqno);
751                         ptr--;
752                 }
753                 if (tunnel->parms.o_flags&GRE_KEY) {
754                         *ptr = tunnel->parms.o_key;
755                         ptr--;
756                 }
757                 if (tunnel->parms.o_flags&GRE_CSUM) {
                            /* Zero the field first, then checksum everything after the IP header. */
758                         *ptr = 0;
759                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
760                 }
761         }
762
763         nf_reset(skb);
764
            /*
             * NOTE(review): IPTUNNEL_XMIT() is a macro defined outside this file
             * (presumably in net/ipip.h) that hands the packet to the IP output
             * path; it appears to rely on local names such as skb, iph and stats
             * -- confirm before renaming any of them.
             */
765         IPTUNNEL_XMIT();
766         tunnel->recursion--;
767         return 0;
768
769 tx_error_icmp:
770         dst_link_failure(skb);
771
772 tx_error:
773         stats->tx_errors++;
774         dev_kfree_skb(skb);
775         tunnel->recursion--;
776         return 0;
777 }
778
779 static int ipgre_tunnel_bind_dev(struct net_device *dev)
780 {
781         struct net_device *tdev = NULL;
782         struct ip_tunnel *tunnel;
783         struct iphdr *iph;
784         int hlen = LL_MAX_HEADER;
785         int mtu = ETH_DATA_LEN;
786         int addend = sizeof(struct iphdr) + 4;
787
788         tunnel = netdev_priv(dev);
789         iph = &tunnel->parms.iph;
790
791         /* Guess output device to choose reasonable mtu and needed_headroom */
792
793         if (iph->daddr) {
794                 struct flowi fl = { .oif = tunnel->parms.link,
795                                     .nl_u = { .ip4_u =
796                                               { .daddr = iph->daddr,
797                                                 .saddr = iph->saddr,
798                                                 .tos = RT_TOS(iph->tos) } },
799                                     .proto = IPPROTO_GRE };
800                 struct rtable *rt;
801                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
802                         tdev = rt->u.dst.dev;
803                         ip_rt_put(rt);
804                 }
805                 dev->flags |= IFF_POINTOPOINT;
806         }
807
808         if (!tdev && tunnel->parms.link)
809                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
810
811         if (tdev) {
812                 hlen = tdev->hard_header_len + tdev->needed_headroom;
813                 mtu = tdev->mtu;
814         }
815         dev->iflink = tunnel->parms.link;
816
817         /* Precalculate GRE options length */
818         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
819                 if (tunnel->parms.o_flags&GRE_CSUM)
820                         addend += 4;
821                 if (tunnel->parms.o_flags&GRE_KEY)
822                         addend += 4;
823                 if (tunnel->parms.o_flags&GRE_SEQ)
824                         addend += 4;
825         }
826         dev->needed_headroom = addend + hlen;
827         mtu -= dev->hard_header_len - addend;
828
829         if (mtu < 68)
830                 mtu = 68;
831
832         tunnel->hlen = addend;
833
834         return mtu;
835 }
836
837 static int
838 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
839 {
840         int err = 0;
841         struct ip_tunnel_parm p;
842         struct ip_tunnel *t;
843         struct net *net = dev_net(dev);
844         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
845
846         switch (cmd) {
847         case SIOCGETTUNNEL:
848                 t = NULL;
849                 if (dev == ign->fb_tunnel_dev) {
850                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
851                                 err = -EFAULT;
852                                 break;
853                         }
854                         t = ipgre_tunnel_locate(net, &p, 0);
855                 }
856                 if (t == NULL)
857                         t = netdev_priv(dev);
858                 memcpy(&p, &t->parms, sizeof(p));
859                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
860                         err = -EFAULT;
861                 break;
862
863         case SIOCADDTUNNEL:
864         case SIOCCHGTUNNEL:
865                 err = -EPERM;
866                 if (!capable(CAP_NET_ADMIN))
867                         goto done;
868
869                 err = -EFAULT;
870                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
871                         goto done;
872
873                 err = -EINVAL;
874                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
875                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
876                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
877                         goto done;
878                 if (p.iph.ttl)
879                         p.iph.frag_off |= htons(IP_DF);
880
881                 if (!(p.i_flags&GRE_KEY))
882                         p.i_key = 0;
883                 if (!(p.o_flags&GRE_KEY))
884                         p.o_key = 0;
885
886                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
887
888                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
889                         if (t != NULL) {
890                                 if (t->dev != dev) {
891                                         err = -EEXIST;
892                                         break;
893                                 }
894                         } else {
895                                 unsigned nflags=0;
896
897                                 t = netdev_priv(dev);
898
899                                 if (ipv4_is_multicast(p.iph.daddr))
900                                         nflags = IFF_BROADCAST;
901                                 else if (p.iph.daddr)
902                                         nflags = IFF_POINTOPOINT;
903
904                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
905                                         err = -EINVAL;
906                                         break;
907                                 }
908                                 ipgre_tunnel_unlink(ign, t);
909                                 t->parms.iph.saddr = p.iph.saddr;
910                                 t->parms.iph.daddr = p.iph.daddr;
911                                 t->parms.i_key = p.i_key;
912                                 t->parms.o_key = p.o_key;
913                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
914                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
915                                 ipgre_tunnel_link(ign, t);
916                                 netdev_state_change(dev);
917                         }
918                 }
919
920                 if (t) {
921                         err = 0;
922                         if (cmd == SIOCCHGTUNNEL) {
923                                 t->parms.iph.ttl = p.iph.ttl;
924                                 t->parms.iph.tos = p.iph.tos;
925                                 t->parms.iph.frag_off = p.iph.frag_off;
926                                 if (t->parms.link != p.link) {
927                                         t->parms.link = p.link;
928                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
929                                         netdev_state_change(dev);
930                                 }
931                         }
932                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
933                                 err = -EFAULT;
934                 } else
935                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
936                 break;
937
938         case SIOCDELTUNNEL:
939                 err = -EPERM;
940                 if (!capable(CAP_NET_ADMIN))
941                         goto done;
942
943                 if (dev == ign->fb_tunnel_dev) {
944                         err = -EFAULT;
945                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
946                                 goto done;
947                         err = -ENOENT;
948                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
949                                 goto done;
950                         err = -EPERM;
951                         if (t == netdev_priv(ign->fb_tunnel_dev))
952                                 goto done;
953                         dev = t->dev;
954                 }
955                 unregister_netdevice(dev);
956                 err = 0;
957                 break;
958
959         default:
960                 err = -EINVAL;
961         }
962
963 done:
964         return err;
965 }
966
967 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
968 {
969         struct ip_tunnel *tunnel = netdev_priv(dev);
970         if (new_mtu < 68 ||
971             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
972                 return -EINVAL;
973         dev->mtu = new_mtu;
974         return 0;
975 }
976
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
986
987    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
988    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
989
990    ping -t 255 224.66.66.66
991
992    If nobody answers, mbone does not work.
993
994    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
995    ip addr add 10.66.66.<somewhat>/24 dev Universe
996    ifconfig Universe up
997    ifconfig Universe add fe80::<Your_real_addr>/10
998    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
999    ftp 10.66.66.66
1000    ...
1001    ftp fec0:6666:6666::193.233.7.65
1002    ...
1003
1004  */
1005
1006 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1007                         unsigned short type,
1008                         const void *daddr, const void *saddr, unsigned len)
1009 {
1010         struct ip_tunnel *t = netdev_priv(dev);
1011         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1012         __be16 *p = (__be16*)(iph+1);
1013
1014         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1015         p[0]            = t->parms.o_flags;
1016         p[1]            = htons(type);
1017
1018         /*
1019          *      Set the source hardware address.
1020          */
1021
1022         if (saddr)
1023                 memcpy(&iph->saddr, saddr, 4);
1024
1025         if (daddr) {
1026                 memcpy(&iph->daddr, daddr, 4);
1027                 return t->hlen;
1028         }
1029         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1030                 return t->hlen;
1031
1032         return -t->hlen;
1033 }
1034
1035 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1036 {
1037         struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1038         memcpy(haddr, &iph->saddr, 4);
1039         return 4;
1040 }
1041
1042 static const struct header_ops ipgre_header_ops = {
1043         .create = ipgre_header,
1044         .parse  = ipgre_header_parse,
1045 };
1046
1047 #ifdef CONFIG_NET_IPGRE_BROADCAST
1048 static int ipgre_open(struct net_device *dev)
1049 {
1050         struct ip_tunnel *t = netdev_priv(dev);
1051
1052         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1053                 struct flowi fl = { .oif = t->parms.link,
1054                                     .nl_u = { .ip4_u =
1055                                               { .daddr = t->parms.iph.daddr,
1056                                                 .saddr = t->parms.iph.saddr,
1057                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1058                                     .proto = IPPROTO_GRE };
1059                 struct rtable *rt;
1060                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1061                         return -EADDRNOTAVAIL;
1062                 dev = rt->u.dst.dev;
1063                 ip_rt_put(rt);
1064                 if (__in_dev_get_rtnl(dev) == NULL)
1065                         return -EADDRNOTAVAIL;
1066                 t->mlink = dev->ifindex;
1067                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1068         }
1069         return 0;
1070 }
1071
1072 static int ipgre_close(struct net_device *dev)
1073 {
1074         struct ip_tunnel *t = netdev_priv(dev);
1075         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1076                 struct in_device *in_dev;
1077                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1078                 if (in_dev) {
1079                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1080                         in_dev_put(in_dev);
1081                 }
1082         }
1083         return 0;
1084 }
1085
1086 #endif
1087
1088 static void ipgre_tunnel_setup(struct net_device *dev)
1089 {
1090         dev->uninit             = ipgre_tunnel_uninit;
1091         dev->destructor         = free_netdev;
1092         dev->hard_start_xmit    = ipgre_tunnel_xmit;
1093         dev->do_ioctl           = ipgre_tunnel_ioctl;
1094         dev->change_mtu         = ipgre_tunnel_change_mtu;
1095
1096         dev->type               = ARPHRD_IPGRE;
1097         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1098         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1099         dev->flags              = IFF_NOARP;
1100         dev->iflink             = 0;
1101         dev->addr_len           = 4;
1102         dev->features           |= NETIF_F_NETNS_LOCAL;
1103 }
1104
1105 static int ipgre_tunnel_init(struct net_device *dev)
1106 {
1107         struct ip_tunnel *tunnel;
1108         struct iphdr *iph;
1109
1110         tunnel = netdev_priv(dev);
1111         iph = &tunnel->parms.iph;
1112
1113         tunnel->dev = dev;
1114         strcpy(tunnel->parms.name, dev->name);
1115
1116         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1117         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1118
1119         if (iph->daddr) {
1120 #ifdef CONFIG_NET_IPGRE_BROADCAST
1121                 if (ipv4_is_multicast(iph->daddr)) {
1122                         if (!iph->saddr)
1123                                 return -EINVAL;
1124                         dev->flags = IFF_BROADCAST;
1125                         dev->header_ops = &ipgre_header_ops;
1126                         dev->open = ipgre_open;
1127                         dev->stop = ipgre_close;
1128                 }
1129 #endif
1130         } else
1131                 dev->header_ops = &ipgre_header_ops;
1132
1133         return 0;
1134 }
1135
1136 static int ipgre_fb_tunnel_init(struct net_device *dev)
1137 {
1138         struct ip_tunnel *tunnel = netdev_priv(dev);
1139         struct iphdr *iph = &tunnel->parms.iph;
1140         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1141
1142         tunnel->dev = dev;
1143         strcpy(tunnel->parms.name, dev->name);
1144
1145         iph->version            = 4;
1146         iph->protocol           = IPPROTO_GRE;
1147         iph->ihl                = 5;
1148         tunnel->hlen            = sizeof(struct iphdr) + 4;
1149
1150         dev_hold(dev);
1151         ign->tunnels_wc[0]      = tunnel;
1152         return 0;
1153 }
1154
1155
1156 static struct net_protocol ipgre_protocol = {
1157         .handler        =       ipgre_rcv,
1158         .err_handler    =       ipgre_err,
1159         .netns_ok       =       1,
1160 };
1161
1162 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1163 {
1164         int prio;
1165
1166         for (prio = 0; prio < 4; prio++) {
1167                 int h;
1168                 for (h = 0; h < HASH_SIZE; h++) {
1169                         struct ip_tunnel *t;
1170                         while ((t = ign->tunnels[prio][h]) != NULL)
1171                                 unregister_netdevice(t->dev);
1172                 }
1173         }
1174 }
1175
1176 static int ipgre_init_net(struct net *net)
1177 {
1178         int err;
1179         struct ipgre_net *ign;
1180
1181         err = -ENOMEM;
1182         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1183         if (ign == NULL)
1184                 goto err_alloc;
1185
1186         err = net_assign_generic(net, ipgre_net_id, ign);
1187         if (err < 0)
1188                 goto err_assign;
1189
1190         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1191                                            ipgre_tunnel_setup);
1192         if (!ign->fb_tunnel_dev) {
1193                 err = -ENOMEM;
1194                 goto err_alloc_dev;
1195         }
1196
1197         ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1198         dev_net_set(ign->fb_tunnel_dev, net);
1199
1200         if ((err = register_netdev(ign->fb_tunnel_dev)))
1201                 goto err_reg_dev;
1202
1203         return 0;
1204
1205 err_reg_dev:
1206         free_netdev(ign->fb_tunnel_dev);
1207 err_alloc_dev:
1208         /* nothing */
1209 err_assign:
1210         kfree(ign);
1211 err_alloc:
1212         return err;
1213 }
1214
1215 static void ipgre_exit_net(struct net *net)
1216 {
1217         struct ipgre_net *ign;
1218
1219         ign = net_generic(net, ipgre_net_id);
1220         rtnl_lock();
1221         ipgre_destroy_tunnels(ign);
1222         rtnl_unlock();
1223         kfree(ign);
1224 }
1225
1226 static struct pernet_operations ipgre_net_ops = {
1227         .init = ipgre_init_net,
1228         .exit = ipgre_exit_net,
1229 };
1230
1231 /*
1232  *      And now the modules code and kernel interface.
1233  */
1234
1235 static int __init ipgre_init(void)
1236 {
1237         int err;
1238
1239         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1240
1241         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1242                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1243                 return -EAGAIN;
1244         }
1245
1246         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1247         if (err < 0)
1248                 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1249
1250         return err;
1251 }
1252
1253 static void __exit ipgre_fini(void)
1254 {
1255         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1256                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1257
1258         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1259 }
1260
1261 module_init(ipgre_init);
1262 module_exit(ipgre_fini);
1263 MODULE_LICENSE("GPL");