net/ipv4/ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause a complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    by infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation;
63    it is an infeasible task. The most general solution would be
64    to keep an skb->encapsulation counter (a sort of local ttl),
65    and silently drop the packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skbs, even if no tunneling is used.
68
69    Current solution: HARD_TX_LOCK lock breaks dead loops.
70
71
72
73    2. Networking dead loops would not kill routers, but would really
74    kill the network. The IP hop limit plays the role of "t->recursion" in this
75    case, if we copy it from the packet being encapsulated to the upper header.
76    It is a very good solution, but it introduces two problems:
77
78    - Routing protocols using packets with ttl=1 (OSPF, RIP2)
79      do not work over tunnels.
80    - traceroute does not work. I planned to relay ICMP from the tunnel,
81      so that this problem would be solved and the traceroute output
82      would be even more informative. This idea turned out to be wrong:
83      only Linux complies with RFC 1812 now (yes, guys, Linux is the only
84      true router now :-)); all other routers (at least those in my
85      neighbourhood) return only 8 bytes of payload. That is the end of it.
86
87    Hence, if we want OSPF to work or traceroute to say something reasonable,
88    we should search for another solution.
89
90    One of them is to parse the packet, trying to detect inner encapsulation
91    made by our node. It is difficult or even impossible, especially
92    taking fragmentation into account. To be short, it is not a solution at all.
93
94    Current solution: The solution was UNEXPECTEDLY SIMPLE.
95    We force the DF flag on tunnels with a preconfigured hop limit,
96    that is ALL. :-) Well, it does not remove the problem completely,
97    but the exponential growth of network traffic is changed to linear
98    (branches that exceed the pmtu are pruned) and the tunnel mtu
99    quickly degrades to a value < 68, where looping stops.
100    Yes, it is not good if there is a router in the loop
101    which does not force DF, even when the encapsulated packets have DF set.
102    But it is not our problem! Nobody could accuse us; we did
103    all that we could do. Even if it was your gated that injected
104    the fatal route into the network, even if it was you who configured
105    the fatal static route: you are innocent. :-)
106
107
108
109    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110    practically identical code. It would be good to glue them
111    together, but it is not very evident how to make them modular.
112    sit is an integral part of IPv6, while ipip and gre are naturally modular.
113    We could extract the common parts (hash table, ioctl etc.)
114    into a separate module (ip_tunnel.c).
115
116    Alexey Kuznetsov.
117  */
118
119 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122 static int ipgre_tunnel_bind_dev(struct net_device *dev);
123
124 /* Fallback tunnel: no source, no destination, no key, no options */
125
126 #define HASH_SIZE  16
127
128 static int ipgre_net_id;
129 struct ipgre_net {
130         struct ip_tunnel *tunnels[4][HASH_SIZE];
131
132         struct net_device *fb_tunnel_dev;
133 };
134
135 /* Tunnel hash table */
136
137 /*
138    4 hash tables:
139
140    3: (remote,local)
141    2: (remote,*)
142    1: (*,local)
143    0: (*,*)
144
145    We require an exact key match, i.e. if a key is present in the packet
146    it will match only a tunnel with the same key; if it is not present,
147    it will match only a keyless tunnel.
148
149    All keyless packets, if not matched against configured keyless tunnels,
150    will match the fallback tunnel.
151  */
152
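/*
 * 4-bit hash: fold the two low-order nibbles of an IPv4 address (or key)
 * into one of the HASH_SIZE buckets.
 */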
153 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
154
155 #define tunnels_r_l     tunnels[3]
156 #define tunnels_r       tunnels[2]
157 #define tunnels_l       tunnels[1]
158 #define tunnels_wc      tunnels[0]
159
160 static DEFINE_RWLOCK(ipgre_lock);
161
162 /* Given src, dst and key, find the appropriate tunnel for input. */
163
164 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
165                                               __be32 remote, __be32 local,
166                                               __be32 key, __be16 gre_proto)
167 {
168         struct net *net = dev_net(dev);
169         int link = dev->ifindex;
170         unsigned h0 = HASH(remote);
171         unsigned h1 = HASH(key);
172         struct ip_tunnel *t, *cand = NULL;
173         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
174         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
175                        ARPHRD_ETHER : ARPHRD_IPGRE;
176         int score, cand_score = 4;
177
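        /*
         * Scan the hash chains from most to least specific.  Each live
         * candidate gets a score: bit 0 set if its bound link differs
         * from the input device, bit 1 set if its device type differs.
         * A score of 0 is an exact match and is returned immediately;
         * otherwise the lowest-scoring candidate across all chains wins.
         */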
178         for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
179                 if (local != t->parms.iph.saddr ||
180                     remote != t->parms.iph.daddr ||
181                     key != t->parms.i_key ||
182                     !(t->dev->flags & IFF_UP))
183                         continue;
184
185                 if (t->dev->type != ARPHRD_IPGRE &&
186                     t->dev->type != dev_type)
187                         continue;
188
189                 score = 0;
190                 if (t->parms.link != link)
191                         score |= 1;
192                 if (t->dev->type != dev_type)
193                         score |= 2;
194                 if (score == 0)
195                         return t;
196
197                 if (score < cand_score) {
198                         cand = t;
199                         cand_score = score;
200                 }
201         }
202
203         for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
204                 if (remote != t->parms.iph.daddr ||
205                     key != t->parms.i_key ||
206                     !(t->dev->flags & IFF_UP))
207                         continue;
208
209                 if (t->dev->type != ARPHRD_IPGRE &&
210                     t->dev->type != dev_type)
211                         continue;
212
213                 score = 0;
214                 if (t->parms.link != link)
215                         score |= 1;
216                 if (t->dev->type != dev_type)
217                         score |= 2;
218                 if (score == 0)
219                         return t;
220
221                 if (score < cand_score) {
222                         cand = t;
223                         cand_score = score;
224                 }
225         }
226
227         for (t = ign->tunnels_l[h1]; t; t = t->next) {
228                 if ((local != t->parms.iph.saddr &&
229                      (local != t->parms.iph.daddr ||
230                       !ipv4_is_multicast(local))) ||
231                     key != t->parms.i_key ||
232                     !(t->dev->flags & IFF_UP))
233                         continue;
234
235                 if (t->dev->type != ARPHRD_IPGRE &&
236                     t->dev->type != dev_type)
237                         continue;
238
239                 score = 0;
240                 if (t->parms.link != link)
241                         score |= 1;
242                 if (t->dev->type != dev_type)
243                         score |= 2;
244                 if (score == 0)
245                         return t;
246
247                 if (score < cand_score) {
248                         cand = t;
249                         cand_score = score;
250                 }
251         }
252
253         for (t = ign->tunnels_wc[h1]; t; t = t->next) {
254                 if (t->parms.i_key != key ||
255                     !(t->dev->flags & IFF_UP))
256                         continue;
257
258                 if (t->dev->type != ARPHRD_IPGRE &&
259                     t->dev->type != dev_type)
260                         continue;
261
262                 score = 0;
263                 if (t->parms.link != link)
264                         score |= 1;
265                 if (t->dev->type != dev_type)
266                         score |= 2;
267                 if (score == 0)
268                         return t;
269
270                 if (score < cand_score) {
271                         cand = t;
272                         cand_score = score;
273                 }
274         }
275
276         if (cand != NULL)
277                 return cand;
278
279         if (ign->fb_tunnel_dev->flags & IFF_UP)
280                 return netdev_priv(ign->fb_tunnel_dev);
281
282         return NULL;
283 }
284
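/*
 * Pick the hash chain for a tunnel's parameters: prio 3 = (remote,local),
 * 2 = (remote,*), 1 = (*,local), 0 = (*,*).  A multicast "remote" address
 * is treated as a wildcard, so broadcast-mode tunnels fall into the
 * local-only or wildcard chains.
 */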
285 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
286                 struct ip_tunnel_parm *parms)
287 {
288         __be32 remote = parms->iph.daddr;
289         __be32 local = parms->iph.saddr;
290         __be32 key = parms->i_key;
291         unsigned h = HASH(key);
292         int prio = 0;
293
294         if (local)
295                 prio |= 1;
296         if (remote && !ipv4_is_multicast(remote)) {
297                 prio |= 2;
298                 h ^= HASH(remote);
299         }
300
301         return &ign->tunnels[prio][h];
302 }
303
304 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
305                 struct ip_tunnel *t)
306 {
307         return __ipgre_bucket(ign, &t->parms);
308 }
309
310 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
311 {
312         struct ip_tunnel **tp = ipgre_bucket(ign, t);
313
314         t->next = *tp;
315         write_lock_bh(&ipgre_lock);
316         *tp = t;
317         write_unlock_bh(&ipgre_lock);
318 }
319
320 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
321 {
322         struct ip_tunnel **tp;
323
324         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
325                 if (t == *tp) {
326                         write_lock_bh(&ipgre_lock);
327                         *tp = t->next;
328                         write_unlock_bh(&ipgre_lock);
329                         break;
330                 }
331         }
332 }
333
334 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
335                                            struct ip_tunnel_parm *parms,
336                                            int type)
337 {
338         __be32 remote = parms->iph.daddr;
339         __be32 local = parms->iph.saddr;
340         __be32 key = parms->i_key;
341         int link = parms->link;
342         struct ip_tunnel *t, **tp;
343         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
344
345         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
346                 if (local == t->parms.iph.saddr &&
347                     remote == t->parms.iph.daddr &&
348                     key == t->parms.i_key &&
349                     link == t->parms.link &&
350                     type == t->dev->type)
351                         break;
352
353         return t;
354 }
355
356 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
357                 struct ip_tunnel_parm *parms, int create)
358 {
359         struct ip_tunnel *t, *nt;
360         struct net_device *dev;
361         char name[IFNAMSIZ];
362         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
363
364         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
365         if (t || !create)
366                 return t;
367
368         if (parms->name[0])
369                 strlcpy(name, parms->name, IFNAMSIZ);
370         else
371                 sprintf(name, "gre%%d");
372
373         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
374         if (!dev)
375                 return NULL;
376
377         dev_net_set(dev, net);
378
379         if (strchr(name, '%')) {
380                 if (dev_alloc_name(dev, name) < 0)
381                         goto failed_free;
382         }
383
384         nt = netdev_priv(dev);
385         nt->parms = *parms;
386         dev->rtnl_link_ops = &ipgre_link_ops;
387
388         dev->mtu = ipgre_tunnel_bind_dev(dev);
389
390         if (register_netdevice(dev) < 0)
391                 goto failed_free;
392
393         dev_hold(dev);
394         ipgre_tunnel_link(ign, nt);
395         return nt;
396
397 failed_free:
398         free_netdev(dev);
399         return NULL;
400 }
401
402 static void ipgre_tunnel_uninit(struct net_device *dev)
403 {
404         struct net *net = dev_net(dev);
405         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
406
407         ipgre_tunnel_unlink(ign, netdev_priv(dev));
408         dev_put(dev);
409 }
410
411
412 static void ipgre_err(struct sk_buff *skb, u32 info)
413 {
414
415 /* All the routers (except for Linux) return only
416    8 bytes of packet payload. It means that precise relaying of
417    ICMP in the real Internet is absolutely infeasible.
418
419    Moreover, Cisco "wise men" put the GRE key in the third word
420    of the GRE header. It makes it impossible to maintain even soft state
421    for keyed GRE tunnels with checksums enabled. Tell them "thank you".
422
423    Well, I wonder: RFC 1812 was written by a Cisco employee,
424    so what the hell, these idiots break standards established
425    by themselves???
426  */
427
428         struct iphdr *iph = (struct iphdr *)skb->data;
429         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
430         int grehlen = (iph->ihl<<2) + 4;
431         const int type = icmp_hdr(skb)->type;
432         const int code = icmp_hdr(skb)->code;
433         struct ip_tunnel *t;
434         __be16 flags;
435
436         flags = p[0];
437         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
438                 if (flags&(GRE_VERSION|GRE_ROUTING))
439                         return;
440                 if (flags&GRE_KEY) {
441                         grehlen += 4;
442                         if (flags&GRE_CSUM)
443                                 grehlen += 4;
444                 }
445         }
446
447         /* If only 8 bytes returned, keyed message will be dropped here */
448         if (skb_headlen(skb) < grehlen)
449                 return;
450
451         switch (type) {
452         default:
453         case ICMP_PARAMETERPROB:
454                 return;
455
456         case ICMP_DEST_UNREACH:
457                 switch (code) {
458                 case ICMP_SR_FAILED:
459                 case ICMP_PORT_UNREACH:
460                         /* Impossible event. */
461                         return;
462                 case ICMP_FRAG_NEEDED:
463                         /* Soft state for pmtu is maintained by IP core. */
464                         return;
465                 default:
466                         /* All others are translated to HOST_UNREACH.
467                            rfc2003 contains "deep thoughts" about NET_UNREACH,
468                            I believe they are just ether pollution. --ANK
469                          */
470                         break;
471                 }
472                 break;
473         case ICMP_TIME_EXCEEDED:
474                 if (code != ICMP_EXC_TTL)
475                         return;
476                 break;
477         }
478
479         read_lock(&ipgre_lock);
480         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
481                                 flags & GRE_KEY ?
482                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
483                                 p[1]);
484         if (t == NULL || t->parms.iph.daddr == 0 ||
485             ipv4_is_multicast(t->parms.iph.daddr))
486                 goto out;
487
488         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
489                 goto out;
490
491         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
492                 t->err_count++;
493         else
494                 t->err_count = 1;
495         t->err_time = jiffies;
496 out:
497         read_unlock(&ipgre_lock);
498         return;
499 }
500
501 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
502 {
503         if (INET_ECN_is_ce(iph->tos)) {
504                 if (skb->protocol == htons(ETH_P_IP)) {
505                         IP_ECN_set_ce(ip_hdr(skb));
506                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
507                         IP6_ECN_set_ce(ipv6_hdr(skb));
508                 }
509         }
510 }
511
512 static inline u8
513 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
514 {
515         u8 inner = 0;
516         if (skb->protocol == htons(ETH_P_IP))
517                 inner = old_iph->tos;
518         else if (skb->protocol == htons(ETH_P_IPV6))
519                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
520         return INET_ECN_encapsulate(tos, inner);
521 }
522
523 static int ipgre_rcv(struct sk_buff *skb)
524 {
525         struct iphdr *iph;
526         u8     *h;
527         __be16    flags;
528         __sum16   csum = 0;
529         __be32 key = 0;
530         u32    seqno = 0;
531         struct ip_tunnel *tunnel;
532         int    offset = 4;
533         __be16 gre_proto;
534         unsigned int len;
535
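        /*
         * 16 bytes covers the 4-byte base GRE header plus the three
         * optional 4-byte fields (checksum, key, sequence number) that
         * may follow it.
         */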
536         if (!pskb_may_pull(skb, 16))
537                 goto drop_nolock;
538
539         iph = ip_hdr(skb);
540         h = skb->data;
541         flags = *(__be16*)h;
542
543         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
544                 /* - Version must be 0.
545                    - We do not support routing headers.
546                  */
547                 if (flags&(GRE_VERSION|GRE_ROUTING))
548                         goto drop_nolock;
549
550                 if (flags&GRE_CSUM) {
551                         switch (skb->ip_summed) {
552                         case CHECKSUM_COMPLETE:
553                                 csum = csum_fold(skb->csum);
554                                 if (!csum)
555                                         break;
556                                 /* fall through */
557                         case CHECKSUM_NONE:
558                                 skb->csum = 0;
559                                 csum = __skb_checksum_complete(skb);
560                                 skb->ip_summed = CHECKSUM_COMPLETE;
561                         }
562                         offset += 4;
563                 }
564                 if (flags&GRE_KEY) {
565                         key = *(__be32*)(h + offset);
566                         offset += 4;
567                 }
568                 if (flags&GRE_SEQ) {
569                         seqno = ntohl(*(__be32*)(h + offset));
570                         offset += 4;
571                 }
572         }
573
574         gre_proto = *(__be16 *)(h + 2);
575
576         read_lock(&ipgre_lock);
577         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
578                                           iph->saddr, iph->daddr, key,
579                                           gre_proto))) {
580                 struct net_device_stats *stats = &tunnel->dev->stats;
581
582                 secpath_reset(skb);
583
584                 skb->protocol = gre_proto;
585                 /* WCCP version 1 and 2 protocol decoding.
586                  * - Change protocol to IP
587                  * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
588                  */
589                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
590                         skb->protocol = htons(ETH_P_IP);
591                         if ((*(h + offset) & 0xF0) != 0x40)
592                                 offset += 4;
593                 }
594
595                 skb->mac_header = skb->network_header;
596                 __pskb_pull(skb, offset);
597                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
598                 skb->pkt_type = PACKET_HOST;
599 #ifdef CONFIG_NET_IPGRE_BROADCAST
600                 if (ipv4_is_multicast(iph->daddr)) {
601                         /* Looped back packet, drop it! */
602                         if (skb_rtable(skb)->fl.iif == 0)
603                                 goto drop;
604                         stats->multicast++;
605                         skb->pkt_type = PACKET_BROADCAST;
606                 }
607 #endif
608
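                /*
                 * Drop the packet if it carried a checksum that failed
                 * verification, or if this tunnel expects checksums but
                 * the packet has none.
                 */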
609                 if (((flags&GRE_CSUM) && csum) ||
610                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
611                         stats->rx_crc_errors++;
612                         stats->rx_errors++;
613                         goto drop;
614                 }
615                 if (tunnel->parms.i_flags&GRE_SEQ) {
616                         if (!(flags&GRE_SEQ) ||
617                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
618                                 stats->rx_fifo_errors++;
619                                 stats->rx_errors++;
620                                 goto drop;
621                         }
622                         tunnel->i_seqno = seqno + 1;
623                 }
624
625                 len = skb->len;
626
627                 /* Warning: All skb pointers will be invalidated! */
628                 if (tunnel->dev->type == ARPHRD_ETHER) {
629                         if (!pskb_may_pull(skb, ETH_HLEN)) {
630                                 stats->rx_length_errors++;
631                                 stats->rx_errors++;
632                                 goto drop;
633                         }
634
635                         iph = ip_hdr(skb);
636                         skb->protocol = eth_type_trans(skb, tunnel->dev);
637                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
638                 }
639
640                 stats->rx_packets++;
641                 stats->rx_bytes += len;
642                 skb->dev = tunnel->dev;
643                 skb_dst_drop(skb);
644                 nf_reset(skb);
645
646                 skb_reset_network_header(skb);
647                 ipgre_ecn_decapsulate(iph, skb);
648
649                 netif_rx(skb);
650                 read_unlock(&ipgre_lock);
651                 return(0);
652         }
653         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
654
655 drop:
656         read_unlock(&ipgre_lock);
657 drop_nolock:
658         kfree_skb(skb);
659         return(0);
660 }
661
662 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
663 {
664         struct ip_tunnel *tunnel = netdev_priv(dev);
665         struct net_device_stats *stats = &tunnel->dev->stats;
666         struct iphdr  *old_iph = ip_hdr(skb);
667         struct iphdr  *tiph;
668         u8     tos;
669         __be16 df;
670         struct rtable *rt;                      /* Route to the other host */
671         struct net_device *tdev;                        /* Device to other host */
672         struct iphdr  *iph;                     /* Our new IP header */
673         unsigned int max_headroom;              /* The extra header space needed */
674         int    gre_hlen;
675         __be32 dst;
676         int    mtu;
677
678         if (dev->type == ARPHRD_ETHER)
679                 IPCB(skb)->flags = 0;
680
681         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
682                 gre_hlen = 0;
683                 tiph = (struct iphdr *)skb->data;
684         } else {
685                 gre_hlen = tunnel->hlen;
686                 tiph = &tunnel->parms.iph;
687         }
688
689         if ((dst = tiph->daddr) == 0) {
690                 /* NBMA tunnel */
691
692                 if (skb_dst(skb) == NULL) {
693                         stats->tx_fifo_errors++;
694                         goto tx_error;
695                 }
696
697                 if (skb->protocol == htons(ETH_P_IP)) {
698                         rt = skb_rtable(skb);
699                         if ((dst = rt->rt_gateway) == 0)
700                                 goto tx_error_icmp;
701                 }
702 #ifdef CONFIG_IPV6
703                 else if (skb->protocol == htons(ETH_P_IPV6)) {
704                         struct in6_addr *addr6;
705                         int addr_type;
706                         struct neighbour *neigh = skb_dst(skb)->neighbour;
707
708                         if (neigh == NULL)
709                                 goto tx_error;
710
711                         addr6 = (struct in6_addr *)&neigh->primary_key;
712                         addr_type = ipv6_addr_type(addr6);
713
714                         if (addr_type == IPV6_ADDR_ANY) {
715                                 addr6 = &ipv6_hdr(skb)->daddr;
716                                 addr_type = ipv6_addr_type(addr6);
717                         }
718
719                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
720                                 goto tx_error_icmp;
721
722                         dst = addr6->s6_addr32[3];
723                 }
724 #endif
725                 else
726                         goto tx_error;
727         }
728
729         tos = tiph->tos;
730         if (tos == 1) {
731                 tos = 0;
732                 if (skb->protocol == htons(ETH_P_IP))
733                         tos = old_iph->tos;
734         }
735
736         {
737                 struct flowi fl = { .oif = tunnel->parms.link,
738                                     .nl_u = { .ip4_u =
739                                               { .daddr = dst,
740                                                 .saddr = tiph->saddr,
741                                                 .tos = RT_TOS(tos) } },
742                                     .proto = IPPROTO_GRE };
743                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
744                         stats->tx_carrier_errors++;
745                         goto tx_error;
746                 }
747         }
748         tdev = rt->u.dst.dev;
749
750         if (tdev == dev) {
751                 ip_rt_put(rt);
752                 stats->collisions++;
753                 goto tx_error;
754         }
755
756         df = tiph->frag_off;
757         if (df)
758                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
759         else
760                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
761
762         if (skb_dst(skb))
763                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
764
765         if (skb->protocol == htons(ETH_P_IP)) {
766                 df |= (old_iph->frag_off&htons(IP_DF));
767
768                 if ((old_iph->frag_off&htons(IP_DF)) &&
769                     mtu < ntohs(old_iph->tot_len)) {
770                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
771                         ip_rt_put(rt);
772                         goto tx_error;
773                 }
774         }
775 #ifdef CONFIG_IPV6
776         else if (skb->protocol == htons(ETH_P_IPV6)) {
777                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
778
779                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
780                         if ((tunnel->parms.iph.daddr &&
781                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
782                             rt6->rt6i_dst.plen == 128) {
783                                 rt6->rt6i_flags |= RTF_MODIFIED;
784                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
785                         }
786                 }
787
788                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
789                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
790                         ip_rt_put(rt);
791                         goto tx_error;
792                 }
793         }
794 #endif
795
796         if (tunnel->err_count > 0) {
797                 if (time_before(jiffies,
798                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
799                         tunnel->err_count--;
800
801                         dst_link_failure(skb);
802                 } else
803                         tunnel->err_count = 0;
804         }
805
806         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
807
808         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
809             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
810                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
811                 if (!new_skb) {
812                         ip_rt_put(rt);
813                         stats->tx_dropped++;
814                         dev_kfree_skb(skb);
815                         return NETDEV_TX_OK;
816                 }
817                 if (skb->sk)
818                         skb_set_owner_w(new_skb, skb->sk);
819                 dev_kfree_skb(skb);
820                 skb = new_skb;
821                 old_iph = ip_hdr(skb);
822         }
823
824         skb_reset_transport_header(skb);
825         skb_push(skb, gre_hlen);
826         skb_reset_network_header(skb);
827         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
828         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
829                               IPSKB_REROUTED);
830         skb_dst_drop(skb);
831         skb_dst_set(skb, &rt->u.dst);
832
833         /*
834          *      Push down and install the IPIP header.
835          */
836
837         iph                     =       ip_hdr(skb);
838         iph->version            =       4;
839         iph->ihl                =       sizeof(struct iphdr) >> 2;
840         iph->frag_off           =       df;
841         iph->protocol           =       IPPROTO_GRE;
842         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
843         iph->daddr              =       rt->rt_dst;
844         iph->saddr              =       rt->rt_src;
845
846         if ((iph->ttl = tiph->ttl) == 0) {
847                 if (skb->protocol == htons(ETH_P_IP))
848                         iph->ttl = old_iph->ttl;
849 #ifdef CONFIG_IPV6
850                 else if (skb->protocol == htons(ETH_P_IPV6))
851                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
852 #endif
853                 else
854                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
855         }
856
857         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
858         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
859                                    htons(ETH_P_TEB) : skb->protocol;
860
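        /*
         * Optional GRE fields follow the base header in the order
         * checksum, key, sequence number; ptr starts at the last 32-bit
         * word of the GRE header and walks backwards as each present
         * field is filled in.
         */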
861         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
862                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
863
864                 if (tunnel->parms.o_flags&GRE_SEQ) {
865                         ++tunnel->o_seqno;
866                         *ptr = htonl(tunnel->o_seqno);
867                         ptr--;
868                 }
869                 if (tunnel->parms.o_flags&GRE_KEY) {
870                         *ptr = tunnel->parms.o_key;
871                         ptr--;
872                 }
873                 if (tunnel->parms.o_flags&GRE_CSUM) {
874                         *ptr = 0;
875                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
876                 }
877         }
878
879         nf_reset(skb);
880
881         IPTUNNEL_XMIT();
882         return NETDEV_TX_OK;
883
884 tx_error_icmp:
885         dst_link_failure(skb);
886
887 tx_error:
888         stats->tx_errors++;
889         dev_kfree_skb(skb);
890         return NETDEV_TX_OK;
891 }
892
893 static int ipgre_tunnel_bind_dev(struct net_device *dev)
894 {
895         struct net_device *tdev = NULL;
896         struct ip_tunnel *tunnel;
897         struct iphdr *iph;
898         int hlen = LL_MAX_HEADER;
899         int mtu = ETH_DATA_LEN;
900         int addend = sizeof(struct iphdr) + 4;
901
902         tunnel = netdev_priv(dev);
903         iph = &tunnel->parms.iph;
904
905         /* Guess output device to choose reasonable mtu and needed_headroom */
906
907         if (iph->daddr) {
908                 struct flowi fl = { .oif = tunnel->parms.link,
909                                     .nl_u = { .ip4_u =
910                                               { .daddr = iph->daddr,
911                                                 .saddr = iph->saddr,
912                                                 .tos = RT_TOS(iph->tos) } },
913                                     .proto = IPPROTO_GRE };
914                 struct rtable *rt;
915                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
916                         tdev = rt->u.dst.dev;
917                         ip_rt_put(rt);
918                 }
919
920                 if (dev->type != ARPHRD_ETHER)
921                         dev->flags |= IFF_POINTOPOINT;
922         }
923
924         if (!tdev && tunnel->parms.link)
925                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
926
927         if (tdev) {
928                 hlen = tdev->hard_header_len + tdev->needed_headroom;
929                 mtu = tdev->mtu;
930         }
931         dev->iflink = tunnel->parms.link;
932
933         /* Precalculate GRE options length */
934         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
935                 if (tunnel->parms.o_flags&GRE_CSUM)
936                         addend += 4;
937                 if (tunnel->parms.o_flags&GRE_KEY)
938                         addend += 4;
939                 if (tunnel->parms.o_flags&GRE_SEQ)
940                         addend += 4;
941         }
942         dev->needed_headroom = addend + hlen;
943         mtu -= dev->hard_header_len + addend;
944
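        /* Never report an mtu below 68, the IPv4 minimum (RFC 791). */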
945         if (mtu < 68)
946                 mtu = 68;
947
948         tunnel->hlen = addend;
949
950         return mtu;
951 }
952
953 static int
954 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
955 {
956         int err = 0;
957         struct ip_tunnel_parm p;
958         struct ip_tunnel *t;
959         struct net *net = dev_net(dev);
960         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
961
962         switch (cmd) {
963         case SIOCGETTUNNEL:
964                 t = NULL;
965                 if (dev == ign->fb_tunnel_dev) {
966                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
967                                 err = -EFAULT;
968                                 break;
969                         }
970                         t = ipgre_tunnel_locate(net, &p, 0);
971                 }
972                 if (t == NULL)
973                         t = netdev_priv(dev);
974                 memcpy(&p, &t->parms, sizeof(p));
975                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
976                         err = -EFAULT;
977                 break;
978
979         case SIOCADDTUNNEL:
980         case SIOCCHGTUNNEL:
981                 err = -EPERM;
982                 if (!capable(CAP_NET_ADMIN))
983                         goto done;
984
985                 err = -EFAULT;
986                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
987                         goto done;
988
989                 err = -EINVAL;
990                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
991                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
992                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
993                         goto done;
994                 if (p.iph.ttl)
995                         p.iph.frag_off |= htons(IP_DF);
996
997                 if (!(p.i_flags&GRE_KEY))
998                         p.i_key = 0;
999                 if (!(p.o_flags&GRE_KEY))
1000                         p.o_key = 0;
1001
1002                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1003
1004                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1005                         if (t != NULL) {
1006                                 if (t->dev != dev) {
1007                                         err = -EEXIST;
1008                                         break;
1009                                 }
1010                         } else {
1011                                 unsigned nflags = 0;
1012
1013                                 t = netdev_priv(dev);
1014
1015                                 if (ipv4_is_multicast(p.iph.daddr))
1016                                         nflags = IFF_BROADCAST;
1017                                 else if (p.iph.daddr)
1018                                         nflags = IFF_POINTOPOINT;
1019
1020                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1021                                         err = -EINVAL;
1022                                         break;
1023                                 }
1024                                 ipgre_tunnel_unlink(ign, t);
1025                                 t->parms.iph.saddr = p.iph.saddr;
1026                                 t->parms.iph.daddr = p.iph.daddr;
1027                                 t->parms.i_key = p.i_key;
1028                                 t->parms.o_key = p.o_key;
1029                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1030                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1031                                 ipgre_tunnel_link(ign, t);
1032                                 netdev_state_change(dev);
1033                         }
1034                 }
1035
1036                 if (t) {
1037                         err = 0;
1038                         if (cmd == SIOCCHGTUNNEL) {
1039                                 t->parms.iph.ttl = p.iph.ttl;
1040                                 t->parms.iph.tos = p.iph.tos;
1041                                 t->parms.iph.frag_off = p.iph.frag_off;
1042                                 if (t->parms.link != p.link) {
1043                                         t->parms.link = p.link;
1044                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1045                                         netdev_state_change(dev);
1046                                 }
1047                         }
1048                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1049                                 err = -EFAULT;
1050                 } else
1051                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1052                 break;
1053
1054         case SIOCDELTUNNEL:
1055                 err = -EPERM;
1056                 if (!capable(CAP_NET_ADMIN))
1057                         goto done;
1058
1059                 if (dev == ign->fb_tunnel_dev) {
1060                         err = -EFAULT;
1061                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1062                                 goto done;
1063                         err = -ENOENT;
1064                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1065                                 goto done;
1066                         err = -EPERM;
1067                         if (t == netdev_priv(ign->fb_tunnel_dev))
1068                                 goto done;
1069                         dev = t->dev;
1070                 }
1071                 unregister_netdevice(dev);
1072                 err = 0;
1073                 break;
1074
1075         default:
1076                 err = -EINVAL;
1077         }
1078
1079 done:
1080         return err;
1081 }
1082
1083 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1084 {
1085         struct ip_tunnel *tunnel = netdev_priv(dev);
1086         if (new_mtu < 68 ||
1087             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1088                 return -EINVAL;
1089         dev->mtu = new_mtu;
1090         return 0;
1091 }
1092
1093 /* Nice toy. Unfortunately, useless in real life :-)
1094    It allows one to construct a virtual multiprotocol broadcast "LAN"
1095    over the Internet, provided multicast routing is tuned.
1096
1097
1098    I have no idea whether this bicycle was invented before me,
1099    so I had to set ARPHRD_IPGRE to a random value.
1100    I have the impression that Cisco could make something similar,
1101    but this feature is apparently missing in IOS <= 11.2(8).
1102
1103    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1104    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1105
1106    ping -t 255 224.66.66.66
1107
1108    If nobody answers, mbone does not work.
1109
1110    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1111    ip addr add 10.66.66.<somewhat>/24 dev Universe
1112    ifconfig Universe up
1113    ifconfig Universe add fe80::<Your_real_addr>/10
1114    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1115    ftp 10.66.66.66
1116    ...
1117    ftp fec0:6666:6666::193.233.7.65
1118    ...
1119
1120  */
1121
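/*
 * header_ops->create for broadcast/NBMA mode devices: pre-build the outer
 * IP + GRE header from the tunnel parameters.  Returns the header length
 * when the destination is already known, or its negative when the outer
 * daddr still has to be resolved later.
 */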
1122 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1123                         unsigned short type,
1124                         const void *daddr, const void *saddr, unsigned len)
1125 {
1126         struct ip_tunnel *t = netdev_priv(dev);
1127         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1128         __be16 *p = (__be16*)(iph+1);
1129
1130         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1131         p[0]            = t->parms.o_flags;
1132         p[1]            = htons(type);
1133
1134         /*
1135          *      Set the source hardware address.
1136          */
1137
1138         if (saddr)
1139                 memcpy(&iph->saddr, saddr, 4);
1140
1141         if (daddr) {
1142                 memcpy(&iph->daddr, daddr, 4);
1143                 return t->hlen;
1144         }
1145         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1146                 return t->hlen;
1147
1148         return -t->hlen;
1149 }
1150
1151 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1152 {
1153         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1154         memcpy(haddr, &iph->saddr, 4);
1155         return 4;
1156 }
1157
1158 static const struct header_ops ipgre_header_ops = {
1159         .create = ipgre_header,
1160         .parse  = ipgre_header_parse,
1161 };
1162
1163 #ifdef CONFIG_NET_IPGRE_BROADCAST
1164 static int ipgre_open(struct net_device *dev)
1165 {
1166         struct ip_tunnel *t = netdev_priv(dev);
1167
1168         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1169                 struct flowi fl = { .oif = t->parms.link,
1170                                     .nl_u = { .ip4_u =
1171                                               { .daddr = t->parms.iph.daddr,
1172                                                 .saddr = t->parms.iph.saddr,
1173                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1174                                     .proto = IPPROTO_GRE };
1175                 struct rtable *rt;
1176                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1177                         return -EADDRNOTAVAIL;
1178                 dev = rt->u.dst.dev;
1179                 ip_rt_put(rt);
1180                 if (__in_dev_get_rtnl(dev) == NULL)
1181                         return -EADDRNOTAVAIL;
1182                 t->mlink = dev->ifindex;
1183                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1184         }
1185         return 0;
1186 }
1187
1188 static int ipgre_close(struct net_device *dev)
1189 {
1190         struct ip_tunnel *t = netdev_priv(dev);
1191
1192         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1193                 struct in_device *in_dev;
1194                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1195                 if (in_dev) {
1196                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1197                         in_dev_put(in_dev);
1198                 }
1199         }
1200         return 0;
1201 }
1202
1203 #endif
1204
1205 static const struct net_device_ops ipgre_netdev_ops = {
1206         .ndo_init               = ipgre_tunnel_init,
1207         .ndo_uninit             = ipgre_tunnel_uninit,
1208 #ifdef CONFIG_NET_IPGRE_BROADCAST
1209         .ndo_open               = ipgre_open,
1210         .ndo_stop               = ipgre_close,
1211 #endif
1212         .ndo_start_xmit         = ipgre_tunnel_xmit,
1213         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1214         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1215 };
1216
1217 static void ipgre_tunnel_setup(struct net_device *dev)
1218 {
1219         dev->netdev_ops         = &ipgre_netdev_ops;
1220         dev->destructor         = free_netdev;
1221
1222         dev->type               = ARPHRD_IPGRE;
1223         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1224         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1225         dev->flags              = IFF_NOARP;
1226         dev->iflink             = 0;
1227         dev->addr_len           = 4;
1228         dev->features           |= NETIF_F_NETNS_LOCAL;
1229         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1230 }
1231
1232 static int ipgre_tunnel_init(struct net_device *dev)
1233 {
1234         struct ip_tunnel *tunnel;
1235         struct iphdr *iph;
1236
1237         tunnel = netdev_priv(dev);
1238         iph = &tunnel->parms.iph;
1239
1240         tunnel->dev = dev;
1241         strcpy(tunnel->parms.name, dev->name);
1242
1243         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1244         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1245
1246         if (iph->daddr) {
1247 #ifdef CONFIG_NET_IPGRE_BROADCAST
1248                 if (ipv4_is_multicast(iph->daddr)) {
1249                         if (!iph->saddr)
1250                                 return -EINVAL;
1251                         dev->flags = IFF_BROADCAST;
1252                         dev->header_ops = &ipgre_header_ops;
1253                 }
1254 #endif
1255         } else
1256                 dev->header_ops = &ipgre_header_ops;
1257
1258         return 0;
1259 }
1260
1261 static void ipgre_fb_tunnel_init(struct net_device *dev)
1262 {
1263         struct ip_tunnel *tunnel = netdev_priv(dev);
1264         struct iphdr *iph = &tunnel->parms.iph;
1265         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1266
1267         tunnel->dev = dev;
1268         strcpy(tunnel->parms.name, dev->name);
1269
1270         iph->version            = 4;
1271         iph->protocol           = IPPROTO_GRE;
1272         iph->ihl                = 5;
1273         tunnel->hlen            = sizeof(struct iphdr) + 4;
1274
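        /* The fallback tunnel is keyless and wildcard on both addresses,
         * so it lives in the (*,*) chain, bucket 0. */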
1275         dev_hold(dev);
1276         ign->tunnels_wc[0]      = tunnel;
1277 }
1278
1279
1280 static const struct net_protocol ipgre_protocol = {
1281         .handler        =       ipgre_rcv,
1282         .err_handler    =       ipgre_err,
1283         .netns_ok       =       1,
1284 };
1285
1286 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1287 {
1288         int prio;
1289
1290         for (prio = 0; prio < 4; prio++) {
1291                 int h;
1292                 for (h = 0; h < HASH_SIZE; h++) {
1293                         struct ip_tunnel *t;
1294                         while ((t = ign->tunnels[prio][h]) != NULL)
1295                                 unregister_netdevice(t->dev);
1296                 }
1297         }
1298 }
1299
1300 static int ipgre_init_net(struct net *net)
1301 {
1302         int err;
1303         struct ipgre_net *ign;
1304
1305         err = -ENOMEM;
1306         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1307         if (ign == NULL)
1308                 goto err_alloc;
1309
1310         err = net_assign_generic(net, ipgre_net_id, ign);
1311         if (err < 0)
1312                 goto err_assign;
1313
1314         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1315                                            ipgre_tunnel_setup);
1316         if (!ign->fb_tunnel_dev) {
1317                 err = -ENOMEM;
1318                 goto err_alloc_dev;
1319         }
1320         dev_net_set(ign->fb_tunnel_dev, net);
1321
1322         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1323         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1324
1325         if ((err = register_netdev(ign->fb_tunnel_dev)))
1326                 goto err_reg_dev;
1327
1328         return 0;
1329
1330 err_reg_dev:
1331         free_netdev(ign->fb_tunnel_dev);
1332 err_alloc_dev:
1333         /* nothing */
1334 err_assign:
1335         kfree(ign);
1336 err_alloc:
1337         return err;
1338 }
1339
1340 static void ipgre_exit_net(struct net *net)
1341 {
1342         struct ipgre_net *ign;
1343
1344         ign = net_generic(net, ipgre_net_id);
1345         rtnl_lock();
1346         ipgre_destroy_tunnels(ign);
1347         rtnl_unlock();
1348         kfree(ign);
1349 }
1350
1351 static struct pernet_operations ipgre_net_ops = {
1352         .init = ipgre_init_net,
1353         .exit = ipgre_exit_net,
1354 };
1355
1356 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1357 {
1358         __be16 flags;
1359
1360         if (!data)
1361                 return 0;
1362
1363         flags = 0;
1364         if (data[IFLA_GRE_IFLAGS])
1365                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1366         if (data[IFLA_GRE_OFLAGS])
1367                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1368         if (flags & (GRE_VERSION|GRE_ROUTING))
1369                 return -EINVAL;
1370
1371         return 0;
1372 }
1373
1374 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1375 {
1376         __be32 daddr;
1377
1378         if (tb[IFLA_ADDRESS]) {
1379                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1380                         return -EINVAL;
1381                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1382                         return -EADDRNOTAVAIL;
1383         }
1384
1385         if (!data)
1386                 goto out;
1387
1388         if (data[IFLA_GRE_REMOTE]) {
1389                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1390                 if (!daddr)
1391                         return -EINVAL;
1392         }
1393
1394 out:
1395         return ipgre_tunnel_validate(tb, data);
1396 }
1397
1398 static void ipgre_netlink_parms(struct nlattr *data[],
1399                                 struct ip_tunnel_parm *parms)
1400 {
1401         memset(parms, 0, sizeof(*parms));
1402
1403         parms->iph.protocol = IPPROTO_GRE;
1404
1405         if (!data)
1406                 return;
1407
1408         if (data[IFLA_GRE_LINK])
1409                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1410
1411         if (data[IFLA_GRE_IFLAGS])
1412                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1413
1414         if (data[IFLA_GRE_OFLAGS])
1415                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1416
1417         if (data[IFLA_GRE_IKEY])
1418                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1419
1420         if (data[IFLA_GRE_OKEY])
1421                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1422
1423         if (data[IFLA_GRE_LOCAL])
1424                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1425
1426         if (data[IFLA_GRE_REMOTE])
1427                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1428
1429         if (data[IFLA_GRE_TTL])
1430                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1431
1432         if (data[IFLA_GRE_TOS])
1433                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1434
1435         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1436                 parms->iph.frag_off = htons(IP_DF);
1437 }
1438
1439 static int ipgre_tap_init(struct net_device *dev)
1440 {
1441         struct ip_tunnel *tunnel;
1442
1443         tunnel = netdev_priv(dev);
1444
1445         tunnel->dev = dev;
1446         strcpy(tunnel->parms.name, dev->name);
1447
1448         ipgre_tunnel_bind_dev(dev);
1449
1450         return 0;
1451 }
1452
1453 static const struct net_device_ops ipgre_tap_netdev_ops = {
1454         .ndo_init               = ipgre_tap_init,
1455         .ndo_uninit             = ipgre_tunnel_uninit,
1456         .ndo_start_xmit         = ipgre_tunnel_xmit,
1457         .ndo_set_mac_address    = eth_mac_addr,
1458         .ndo_validate_addr      = eth_validate_addr,
1459         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1460 };
1461
1462 static void ipgre_tap_setup(struct net_device *dev)
1463 {
1464
1465         ether_setup(dev);
1466
1467         dev->netdev_ops         = &ipgre_netdev_ops;
1468         dev->destructor         = free_netdev;
1469
1470         dev->iflink             = 0;
1471         dev->features           |= NETIF_F_NETNS_LOCAL;
1472 }
1473
1474 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1475                          struct nlattr *data[])
1476 {
1477         struct ip_tunnel *nt;
1478         struct net *net = dev_net(dev);
1479         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1480         int mtu;
1481         int err;
1482
1483         nt = netdev_priv(dev);
1484         ipgre_netlink_parms(data, &nt->parms);
1485
1486         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1487                 return -EEXIST;
1488
1489         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1490                 random_ether_addr(dev->dev_addr);
1491
1492         mtu = ipgre_tunnel_bind_dev(dev);
1493         if (!tb[IFLA_MTU])
1494                 dev->mtu = mtu;
1495
1496         err = register_netdevice(dev);
1497         if (err)
1498                 goto out;
1499
1500         dev_hold(dev);
1501         ipgre_tunnel_link(ign, nt);
1502
1503 out:
1504         return err;
1505 }
1506
1507 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1508                             struct nlattr *data[])
1509 {
1510         struct ip_tunnel *t, *nt;
1511         struct net *net = dev_net(dev);
1512         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1513         struct ip_tunnel_parm p;
1514         int mtu;
1515
1516         if (dev == ign->fb_tunnel_dev)
1517                 return -EINVAL;
1518
1519         nt = netdev_priv(dev);
1520         ipgre_netlink_parms(data, &p);
1521
1522         t = ipgre_tunnel_locate(net, &p, 0);
1523
1524         if (t) {
1525                 if (t->dev != dev)
1526                         return -EEXIST;
1527         } else {
1528                 unsigned nflags = 0;
1529
1530                 t = nt;
1531
1532                 if (ipv4_is_multicast(p.iph.daddr))
1533                         nflags = IFF_BROADCAST;
1534                 else if (p.iph.daddr)
1535                         nflags = IFF_POINTOPOINT;
1536
1537                 if ((dev->flags ^ nflags) &
1538                     (IFF_POINTOPOINT | IFF_BROADCAST))
1539                         return -EINVAL;
1540
1541                 ipgre_tunnel_unlink(ign, t);
1542                 t->parms.iph.saddr = p.iph.saddr;
1543                 t->parms.iph.daddr = p.iph.daddr;
1544                 t->parms.i_key = p.i_key;
1545                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1546                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1547                 ipgre_tunnel_link(ign, t);
1548                 netdev_state_change(dev);
1549         }
1550
1551         t->parms.o_key = p.o_key;
1552         t->parms.iph.ttl = p.iph.ttl;
1553         t->parms.iph.tos = p.iph.tos;
1554         t->parms.iph.frag_off = p.iph.frag_off;
1555
1556         if (t->parms.link != p.link) {
1557                 t->parms.link = p.link;
1558                 mtu = ipgre_tunnel_bind_dev(dev);
1559                 if (!tb[IFLA_MTU])
1560                         dev->mtu = mtu;
1561                 netdev_state_change(dev);
1562         }
1563
1564         return 0;
1565 }
1566
1567 static size_t ipgre_get_size(const struct net_device *dev)
1568 {
1569         return
1570                 /* IFLA_GRE_LINK */
1571                 nla_total_size(4) +
1572                 /* IFLA_GRE_IFLAGS */
1573                 nla_total_size(2) +
1574                 /* IFLA_GRE_OFLAGS */
1575                 nla_total_size(2) +
1576                 /* IFLA_GRE_IKEY */
1577                 nla_total_size(4) +
1578                 /* IFLA_GRE_OKEY */
1579                 nla_total_size(4) +
1580                 /* IFLA_GRE_LOCAL */
1581                 nla_total_size(4) +
1582                 /* IFLA_GRE_REMOTE */
1583                 nla_total_size(4) +
1584                 /* IFLA_GRE_TTL */
1585                 nla_total_size(1) +
1586                 /* IFLA_GRE_TOS */
1587                 nla_total_size(1) +
1588                 /* IFLA_GRE_PMTUDISC */
1589                 nla_total_size(1) +
1590                 0;
1591 }
1592
1593 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1594 {
1595         struct ip_tunnel *t = netdev_priv(dev);
1596         struct ip_tunnel_parm *p = &t->parms;
1597
1598         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1599         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1600         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1601         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1602         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1603         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1604         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1605         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1606         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1607         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1608
1609         return 0;
1610
1611 nla_put_failure:
1612         return -EMSGSIZE;
1613 }
1614
1615 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1616         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1617         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1618         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1619         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1620         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1621         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1622         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1623         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1624         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1625         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1626 };
1627
1628 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1629         .kind           = "gre",
1630         .maxtype        = IFLA_GRE_MAX,
1631         .policy         = ipgre_policy,
1632         .priv_size      = sizeof(struct ip_tunnel),
1633         .setup          = ipgre_tunnel_setup,
1634         .validate       = ipgre_tunnel_validate,
1635         .newlink        = ipgre_newlink,
1636         .changelink     = ipgre_changelink,
1637         .get_size       = ipgre_get_size,
1638         .fill_info      = ipgre_fill_info,
1639 };
1640
1641 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1642         .kind           = "gretap",
1643         .maxtype        = IFLA_GRE_MAX,
1644         .policy         = ipgre_policy,
1645         .priv_size      = sizeof(struct ip_tunnel),
1646         .setup          = ipgre_tap_setup,
1647         .validate       = ipgre_tap_validate,
1648         .newlink        = ipgre_newlink,
1649         .changelink     = ipgre_changelink,
1650         .get_size       = ipgre_get_size,
1651         .fill_info      = ipgre_fill_info,
1652 };
1653
1654 /*
1655  *      And now the module code and kernel interface.
1656  */
1657
1658 static int __init ipgre_init(void)
1659 {
1660         int err;
1661
1662         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1663
1664         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1665                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1666                 return -EAGAIN;
1667         }
1668
1669         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1670         if (err < 0)
1671                 goto gen_device_failed;
1672
1673         err = rtnl_link_register(&ipgre_link_ops);
1674         if (err < 0)
1675                 goto rtnl_link_failed;
1676
1677         err = rtnl_link_register(&ipgre_tap_ops);
1678         if (err < 0)
1679                 goto tap_ops_failed;
1680
1681 out:
1682         return err;
1683
1684 tap_ops_failed:
1685         rtnl_link_unregister(&ipgre_link_ops);
1686 rtnl_link_failed:
1687         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1688 gen_device_failed:
1689         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1690         goto out;
1691 }
1692
1693 static void __exit ipgre_fini(void)
1694 {
1695         rtnl_link_unregister(&ipgre_tap_ops);
1696         rtnl_link_unregister(&ipgre_link_ops);
1697         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1698         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1699                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1700 }
1701
1702 module_init(ipgre_init);
1703 module_exit(ipgre_fini);
1704 MODULE_LICENSE("GPL");
1705 MODULE_ALIAS_RTNL_LINK("gre");
1706 MODULE_ALIAS_RTNL_LINK("gretap");