Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[pandora-kernel.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48
49 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 /*
56    Problems & solutions
57    --------------------
58
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is a good
68    solution, but it supposes maintaining new variable in ALL
69    skb, even if no tunneling is used.
70
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, since when we enter the first ndo_xmit(), cpu migration is
73    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
74
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would be even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
94    taking into account fragmentation. To be short, it is not a solution at all.
95
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    fastly degrades to value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108
109
110
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117
118    Alexey Kuznetsov.
119  */
120
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125
126 /* Fallback tunnel: no source, no destination, no key, no options */
127
128 #define HASH_SIZE  16
129
130 static int ipgre_net_id __read_mostly;
131 struct ipgre_net {
132         struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
133
134         struct net_device *fb_tunnel_dev;
135 };
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if not matched by configured keyless tunnels,
152    will match fallback tunnel.
153  */
154
/*
 * Fold a 32-bit address or key into a bucket index in [0, HASH_SIZE).
 * The argument is fully parenthesized so that compound expressions such
 * as HASH(a | b) hash the whole expression; note it is evaluated twice.
 */
#define HASH(addr) \
	((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))

/* Shorthand for the four tables: 3 (remote,local), 2 (remote,*), 1 (*,local), 0 (*,*). */
#define tunnels_r_l     tunnels[3]
#define tunnels_r       tunnels[2]
#define tunnels_l       tunnels[1]
#define tunnels_wc      tunnels[0]
161 /*
162  * Locking : hash tables are protected by RCU and RTNL
163  */
164
/*
 * Walk one hash chain with rcu_dereference().  The cursor is a variable
 * named 't' that the surrounding function must declare itself.
 */
#define for_each_ip_tunnel_rcu(start)		\
	for (t = rcu_dereference(start);	\
	     t;					\
	     t = rcu_dereference(t->next))
167
/*
 * Per-cpu copy of the frequently updated counters; the remaining
 * statistics are shared and live in netdev->stats.
 */
struct pcpu_tstats {
	unsigned long	rx_packets, rx_bytes;
	unsigned long	tx_packets, tx_bytes;
};
175
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178         struct pcpu_tstats sum = { 0 };
179         int i;
180
181         for_each_possible_cpu(i) {
182                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184                 sum.rx_packets += tstats->rx_packets;
185                 sum.rx_bytes   += tstats->rx_bytes;
186                 sum.tx_packets += tstats->tx_packets;
187                 sum.tx_bytes   += tstats->tx_bytes;
188         }
189         dev->stats.rx_packets = sum.rx_packets;
190         dev->stats.rx_bytes   = sum.rx_bytes;
191         dev->stats.tx_packets = sum.tx_packets;
192         dev->stats.tx_bytes   = sum.tx_bytes;
193         return &dev->stats;
194 }
195
196 /* Given src, dst and key, find appropriate for input tunnel. */
197
198 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
199                                               __be32 remote, __be32 local,
200                                               __be32 key, __be16 gre_proto)
201 {
202         struct net *net = dev_net(dev);
203         int link = dev->ifindex;
204         unsigned int h0 = HASH(remote);
205         unsigned int h1 = HASH(key);
206         struct ip_tunnel *t, *cand = NULL;
207         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
208         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
209                        ARPHRD_ETHER : ARPHRD_IPGRE;
210         int score, cand_score = 4;
211
212         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
213                 if (local != t->parms.iph.saddr ||
214                     remote != t->parms.iph.daddr ||
215                     key != t->parms.i_key ||
216                     !(t->dev->flags & IFF_UP))
217                         continue;
218
219                 if (t->dev->type != ARPHRD_IPGRE &&
220                     t->dev->type != dev_type)
221                         continue;
222
223                 score = 0;
224                 if (t->parms.link != link)
225                         score |= 1;
226                 if (t->dev->type != dev_type)
227                         score |= 2;
228                 if (score == 0)
229                         return t;
230
231                 if (score < cand_score) {
232                         cand = t;
233                         cand_score = score;
234                 }
235         }
236
237         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
238                 if (remote != t->parms.iph.daddr ||
239                     key != t->parms.i_key ||
240                     !(t->dev->flags & IFF_UP))
241                         continue;
242
243                 if (t->dev->type != ARPHRD_IPGRE &&
244                     t->dev->type != dev_type)
245                         continue;
246
247                 score = 0;
248                 if (t->parms.link != link)
249                         score |= 1;
250                 if (t->dev->type != dev_type)
251                         score |= 2;
252                 if (score == 0)
253                         return t;
254
255                 if (score < cand_score) {
256                         cand = t;
257                         cand_score = score;
258                 }
259         }
260
261         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
262                 if ((local != t->parms.iph.saddr &&
263                      (local != t->parms.iph.daddr ||
264                       !ipv4_is_multicast(local))) ||
265                     key != t->parms.i_key ||
266                     !(t->dev->flags & IFF_UP))
267                         continue;
268
269                 if (t->dev->type != ARPHRD_IPGRE &&
270                     t->dev->type != dev_type)
271                         continue;
272
273                 score = 0;
274                 if (t->parms.link != link)
275                         score |= 1;
276                 if (t->dev->type != dev_type)
277                         score |= 2;
278                 if (score == 0)
279                         return t;
280
281                 if (score < cand_score) {
282                         cand = t;
283                         cand_score = score;
284                 }
285         }
286
287         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
288                 if (t->parms.i_key != key ||
289                     !(t->dev->flags & IFF_UP))
290                         continue;
291
292                 if (t->dev->type != ARPHRD_IPGRE &&
293                     t->dev->type != dev_type)
294                         continue;
295
296                 score = 0;
297                 if (t->parms.link != link)
298                         score |= 1;
299                 if (t->dev->type != dev_type)
300                         score |= 2;
301                 if (score == 0)
302                         return t;
303
304                 if (score < cand_score) {
305                         cand = t;
306                         cand_score = score;
307                 }
308         }
309
310         if (cand != NULL)
311                 return cand;
312
313         dev = ign->fb_tunnel_dev;
314         if (dev->flags & IFF_UP)
315                 return netdev_priv(dev);
316
317         return NULL;
318 }
319
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321                 struct ip_tunnel_parm *parms)
322 {
323         __be32 remote = parms->iph.daddr;
324         __be32 local = parms->iph.saddr;
325         __be32 key = parms->i_key;
326         unsigned int h = HASH(key);
327         int prio = 0;
328
329         if (local)
330                 prio |= 1;
331         if (remote && !ipv4_is_multicast(remote)) {
332                 prio |= 2;
333                 h ^= HASH(remote);
334         }
335
336         return &ign->tunnels[prio][h];
337 }
338
339 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
340                 struct ip_tunnel *t)
341 {
342         return __ipgre_bucket(ign, &t->parms);
343 }
344
345 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
346 {
347         struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
348
349         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
350         rcu_assign_pointer(*tp, t);
351 }
352
353 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
354 {
355         struct ip_tunnel __rcu **tp;
356         struct ip_tunnel *iter;
357
358         for (tp = ipgre_bucket(ign, t);
359              (iter = rtnl_dereference(*tp)) != NULL;
360              tp = &iter->next) {
361                 if (t == iter) {
362                         rcu_assign_pointer(*tp, t->next);
363                         break;
364                 }
365         }
366 }
367
368 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
369                                            struct ip_tunnel_parm *parms,
370                                            int type)
371 {
372         __be32 remote = parms->iph.daddr;
373         __be32 local = parms->iph.saddr;
374         __be32 key = parms->i_key;
375         int link = parms->link;
376         struct ip_tunnel *t;
377         struct ip_tunnel __rcu **tp;
378         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
379
380         for (tp = __ipgre_bucket(ign, parms);
381              (t = rtnl_dereference(*tp)) != NULL;
382              tp = &t->next)
383                 if (local == t->parms.iph.saddr &&
384                     remote == t->parms.iph.daddr &&
385                     key == t->parms.i_key &&
386                     link == t->parms.link &&
387                     type == t->dev->type)
388                         break;
389
390         return t;
391 }
392
393 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
394                 struct ip_tunnel_parm *parms, int create)
395 {
396         struct ip_tunnel *t, *nt;
397         struct net_device *dev;
398         char name[IFNAMSIZ];
399         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400
401         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
402         if (t || !create)
403                 return t;
404
405         if (parms->name[0])
406                 strlcpy(name, parms->name, IFNAMSIZ);
407         else
408                 strcpy(name, "gre%d");
409
410         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
411         if (!dev)
412                 return NULL;
413
414         dev_net_set(dev, net);
415
416         nt = netdev_priv(dev);
417         nt->parms = *parms;
418         dev->rtnl_link_ops = &ipgre_link_ops;
419
420         dev->mtu = ipgre_tunnel_bind_dev(dev);
421
422         if (register_netdevice(dev) < 0)
423                 goto failed_free;
424
425         dev_hold(dev);
426         ipgre_tunnel_link(ign, nt);
427         return nt;
428
429 failed_free:
430         free_netdev(dev);
431         return NULL;
432 }
433
434 static void ipgre_tunnel_uninit(struct net_device *dev)
435 {
436         struct net *net = dev_net(dev);
437         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
438
439         ipgre_tunnel_unlink(ign, netdev_priv(dev));
440         dev_put(dev);
441 }
442
443
444 static void ipgre_err(struct sk_buff *skb, u32 info)
445 {
446
447 /* All the routers (except for Linux) return only
448    8 bytes of packet payload. It means, that precise relaying of
449    ICMP in the real Internet is absolutely infeasible.
450
451    Moreover, Cisco "wise men" put GRE key to the third word
452    in GRE header. It makes impossible maintaining even soft state for keyed
453    GRE tunnels with enabled checksum. Tell them "thank you".
454
455    Well, I wonder, rfc1812 was written by Cisco employee,
456    what the hell these idiots break standrads established
457    by themself???
458  */
459
460         const struct iphdr *iph = (const struct iphdr *)skb->data;
461         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
462         int grehlen = (iph->ihl<<2) + 4;
463         const int type = icmp_hdr(skb)->type;
464         const int code = icmp_hdr(skb)->code;
465         struct ip_tunnel *t;
466         __be16 flags;
467
468         flags = p[0];
469         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
470                 if (flags&(GRE_VERSION|GRE_ROUTING))
471                         return;
472                 if (flags&GRE_KEY) {
473                         grehlen += 4;
474                         if (flags&GRE_CSUM)
475                                 grehlen += 4;
476                 }
477         }
478
479         /* If only 8 bytes returned, keyed message will be dropped here */
480         if (skb_headlen(skb) < grehlen)
481                 return;
482
483         switch (type) {
484         default:
485         case ICMP_PARAMETERPROB:
486                 return;
487
488         case ICMP_DEST_UNREACH:
489                 switch (code) {
490                 case ICMP_SR_FAILED:
491                 case ICMP_PORT_UNREACH:
492                         /* Impossible event. */
493                         return;
494                 case ICMP_FRAG_NEEDED:
495                         /* Soft state for pmtu is maintained by IP core. */
496                         return;
497                 default:
498                         /* All others are translated to HOST_UNREACH.
499                            rfc2003 contains "deep thoughts" about NET_UNREACH,
500                            I believe they are just ether pollution. --ANK
501                          */
502                         break;
503                 }
504                 break;
505         case ICMP_TIME_EXCEEDED:
506                 if (code != ICMP_EXC_TTL)
507                         return;
508                 break;
509         }
510
511         rcu_read_lock();
512         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
513                                 flags & GRE_KEY ?
514                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
515                                 p[1]);
516         if (t == NULL || t->parms.iph.daddr == 0 ||
517             ipv4_is_multicast(t->parms.iph.daddr))
518                 goto out;
519
520         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
521                 goto out;
522
523         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
524                 t->err_count++;
525         else
526                 t->err_count = 1;
527         t->err_time = jiffies;
528 out:
529         rcu_read_unlock();
530 }
531
532 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
533 {
534         if (INET_ECN_is_ce(iph->tos)) {
535                 if (skb->protocol == htons(ETH_P_IP)) {
536                         IP_ECN_set_ce(ip_hdr(skb));
537                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
538                         IP6_ECN_set_ce(ipv6_hdr(skb));
539                 }
540         }
541 }
542
543 static inline u8
544 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
545 {
546         u8 inner = 0;
547         if (skb->protocol == htons(ETH_P_IP))
548                 inner = old_iph->tos;
549         else if (skb->protocol == htons(ETH_P_IPV6))
550                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
551         return INET_ECN_encapsulate(tos, inner);
552 }
553
554 static int ipgre_rcv(struct sk_buff *skb)
555 {
556         const struct iphdr *iph;
557         u8     *h;
558         __be16    flags;
559         __sum16   csum = 0;
560         __be32 key = 0;
561         u32    seqno = 0;
562         struct ip_tunnel *tunnel;
563         int    offset = 4;
564         __be16 gre_proto;
565
566         if (!pskb_may_pull(skb, 16))
567                 goto drop_nolock;
568
569         iph = ip_hdr(skb);
570         h = skb->data;
571         flags = *(__be16*)h;
572
573         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
574                 /* - Version must be 0.
575                    - We do not support routing headers.
576                  */
577                 if (flags&(GRE_VERSION|GRE_ROUTING))
578                         goto drop_nolock;
579
580                 if (flags&GRE_CSUM) {
581                         switch (skb->ip_summed) {
582                         case CHECKSUM_COMPLETE:
583                                 csum = csum_fold(skb->csum);
584                                 if (!csum)
585                                         break;
586                                 /* fall through */
587                         case CHECKSUM_NONE:
588                                 skb->csum = 0;
589                                 csum = __skb_checksum_complete(skb);
590                                 skb->ip_summed = CHECKSUM_COMPLETE;
591                         }
592                         offset += 4;
593                 }
594                 if (flags&GRE_KEY) {
595                         key = *(__be32*)(h + offset);
596                         offset += 4;
597                 }
598                 if (flags&GRE_SEQ) {
599                         seqno = ntohl(*(__be32*)(h + offset));
600                         offset += 4;
601                 }
602         }
603
604         gre_proto = *(__be16 *)(h + 2);
605
606         rcu_read_lock();
607         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
608                                           iph->saddr, iph->daddr, key,
609                                           gre_proto))) {
610                 struct pcpu_tstats *tstats;
611
612                 secpath_reset(skb);
613
614                 skb->protocol = gre_proto;
615                 /* WCCP version 1 and 2 protocol decoding.
616                  * - Change protocol to IP
617                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
618                  */
619                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
620                         skb->protocol = htons(ETH_P_IP);
621                         if ((*(h + offset) & 0xF0) != 0x40)
622                                 offset += 4;
623                 }
624
625                 skb->mac_header = skb->network_header;
626                 __pskb_pull(skb, offset);
627                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
628                 skb->pkt_type = PACKET_HOST;
629 #ifdef CONFIG_NET_IPGRE_BROADCAST
630                 if (ipv4_is_multicast(iph->daddr)) {
631                         /* Looped back packet, drop it! */
632                         if (rt_is_output_route(skb_rtable(skb)))
633                                 goto drop;
634                         tunnel->dev->stats.multicast++;
635                         skb->pkt_type = PACKET_BROADCAST;
636                 }
637 #endif
638
639                 if (((flags&GRE_CSUM) && csum) ||
640                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
641                         tunnel->dev->stats.rx_crc_errors++;
642                         tunnel->dev->stats.rx_errors++;
643                         goto drop;
644                 }
645                 if (tunnel->parms.i_flags&GRE_SEQ) {
646                         if (!(flags&GRE_SEQ) ||
647                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
648                                 tunnel->dev->stats.rx_fifo_errors++;
649                                 tunnel->dev->stats.rx_errors++;
650                                 goto drop;
651                         }
652                         tunnel->i_seqno = seqno + 1;
653                 }
654
655                 /* Warning: All skb pointers will be invalidated! */
656                 if (tunnel->dev->type == ARPHRD_ETHER) {
657                         if (!pskb_may_pull(skb, ETH_HLEN)) {
658                                 tunnel->dev->stats.rx_length_errors++;
659                                 tunnel->dev->stats.rx_errors++;
660                                 goto drop;
661                         }
662
663                         iph = ip_hdr(skb);
664                         skb->protocol = eth_type_trans(skb, tunnel->dev);
665                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
666                 }
667
668                 tstats = this_cpu_ptr(tunnel->dev->tstats);
669                 tstats->rx_packets++;
670                 tstats->rx_bytes += skb->len;
671
672                 __skb_tunnel_rx(skb, tunnel->dev);
673
674                 skb_reset_network_header(skb);
675                 ipgre_ecn_decapsulate(iph, skb);
676
677                 netif_rx(skb);
678
679                 rcu_read_unlock();
680                 return 0;
681         }
682         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
683
684 drop:
685         rcu_read_unlock();
686 drop_nolock:
687         kfree_skb(skb);
688         return 0;
689 }
690
691 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
692 {
693         struct ip_tunnel *tunnel = netdev_priv(dev);
694         struct pcpu_tstats *tstats;
695         const struct iphdr  *old_iph = ip_hdr(skb);
696         const struct iphdr  *tiph;
697         struct flowi4 fl4;
698         u8     tos;
699         __be16 df;
700         struct rtable *rt;                      /* Route to the other host */
701         struct net_device *tdev;                /* Device to other host */
702         struct iphdr  *iph;                     /* Our new IP header */
703         unsigned int max_headroom;              /* The extra header space needed */
704         int    gre_hlen;
705         __be32 dst;
706         int    mtu;
707
708         if (dev->type == ARPHRD_ETHER)
709                 IPCB(skb)->flags = 0;
710
711         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
712                 gre_hlen = 0;
713                 tiph = (const struct iphdr *)skb->data;
714         } else {
715                 gre_hlen = tunnel->hlen;
716                 tiph = &tunnel->parms.iph;
717         }
718
719         if ((dst = tiph->daddr) == 0) {
720                 /* NBMA tunnel */
721
722                 if (skb_dst(skb) == NULL) {
723                         dev->stats.tx_fifo_errors++;
724                         goto tx_error;
725                 }
726
727                 if (skb->protocol == htons(ETH_P_IP)) {
728                         rt = skb_rtable(skb);
729                         if ((dst = rt->rt_gateway) == 0)
730                                 goto tx_error_icmp;
731                 }
732 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
733                 else if (skb->protocol == htons(ETH_P_IPV6)) {
734                         struct neighbour *neigh = dst_get_neighbour(skb_dst(skb));
735                         const struct in6_addr *addr6;
736                         int addr_type;
737
738                         if (neigh == NULL)
739                                 goto tx_error;
740
741                         addr6 = (const struct in6_addr *)&neigh->primary_key;
742                         addr_type = ipv6_addr_type(addr6);
743
744                         if (addr_type == IPV6_ADDR_ANY) {
745                                 addr6 = &ipv6_hdr(skb)->daddr;
746                                 addr_type = ipv6_addr_type(addr6);
747                         }
748
749                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
750                                 goto tx_error_icmp;
751
752                         dst = addr6->s6_addr32[3];
753                 }
754 #endif
755                 else
756                         goto tx_error;
757         }
758
759         tos = tiph->tos;
760         if (tos == 1) {
761                 tos = 0;
762                 if (skb->protocol == htons(ETH_P_IP))
763                         tos = old_iph->tos;
764                 else if (skb->protocol == htons(ETH_P_IPV6))
765                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
766         }
767
768         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
769                                  tunnel->parms.o_key, RT_TOS(tos),
770                                  tunnel->parms.link);
771         if (IS_ERR(rt)) {
772                 dev->stats.tx_carrier_errors++;
773                 goto tx_error;
774         }
775         tdev = rt->dst.dev;
776
777         if (tdev == dev) {
778                 ip_rt_put(rt);
779                 dev->stats.collisions++;
780                 goto tx_error;
781         }
782
783         df = tiph->frag_off;
784         if (df)
785                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
786         else
787                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
788
789         if (skb_dst(skb))
790                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
791
792         if (skb->protocol == htons(ETH_P_IP)) {
793                 df |= (old_iph->frag_off&htons(IP_DF));
794
795                 if ((old_iph->frag_off&htons(IP_DF)) &&
796                     mtu < ntohs(old_iph->tot_len)) {
797                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
798                         ip_rt_put(rt);
799                         goto tx_error;
800                 }
801         }
802 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
803         else if (skb->protocol == htons(ETH_P_IPV6)) {
804                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
805
806                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
807                         if ((tunnel->parms.iph.daddr &&
808                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
809                             rt6->rt6i_dst.plen == 128) {
810                                 rt6->rt6i_flags |= RTF_MODIFIED;
811                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
812                         }
813                 }
814
815                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
816                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
817                         ip_rt_put(rt);
818                         goto tx_error;
819                 }
820         }
821 #endif
822
823         if (tunnel->err_count > 0) {
824                 if (time_before(jiffies,
825                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
826                         tunnel->err_count--;
827
828                         dst_link_failure(skb);
829                 } else
830                         tunnel->err_count = 0;
831         }
832
833         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
834
835         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
836             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
837                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
838                 if (!new_skb) {
839                         ip_rt_put(rt);
840                         dev->stats.tx_dropped++;
841                         dev_kfree_skb(skb);
842                         return NETDEV_TX_OK;
843                 }
844                 if (skb->sk)
845                         skb_set_owner_w(new_skb, skb->sk);
846                 dev_kfree_skb(skb);
847                 skb = new_skb;
848                 old_iph = ip_hdr(skb);
849         }
850
851         skb_reset_transport_header(skb);
852         skb_push(skb, gre_hlen);
853         skb_reset_network_header(skb);
854         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
855         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
856                               IPSKB_REROUTED);
857         skb_dst_drop(skb);
858         skb_dst_set(skb, &rt->dst);
859
860         /*
861          *      Push down and install the IPIP header.
862          */
863
864         iph                     =       ip_hdr(skb);
865         iph->version            =       4;
866         iph->ihl                =       sizeof(struct iphdr) >> 2;
867         iph->frag_off           =       df;
868         iph->protocol           =       IPPROTO_GRE;
869         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
870         iph->daddr              =       fl4.daddr;
871         iph->saddr              =       fl4.saddr;
872
873         if ((iph->ttl = tiph->ttl) == 0) {
874                 if (skb->protocol == htons(ETH_P_IP))
875                         iph->ttl = old_iph->ttl;
876 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
877                 else if (skb->protocol == htons(ETH_P_IPV6))
878                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
879 #endif
880                 else
881                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
882         }
883
884         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
885         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
886                                    htons(ETH_P_TEB) : skb->protocol;
887
888         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
889                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
890
891                 if (tunnel->parms.o_flags&GRE_SEQ) {
892                         ++tunnel->o_seqno;
893                         *ptr = htonl(tunnel->o_seqno);
894                         ptr--;
895                 }
896                 if (tunnel->parms.o_flags&GRE_KEY) {
897                         *ptr = tunnel->parms.o_key;
898                         ptr--;
899                 }
900                 if (tunnel->parms.o_flags&GRE_CSUM) {
901                         *ptr = 0;
902                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
903                 }
904         }
905
906         nf_reset(skb);
907         tstats = this_cpu_ptr(dev->tstats);
908         __IPTUNNEL_XMIT(tstats, &dev->stats);
909         return NETDEV_TX_OK;
910
911 tx_error_icmp:
912         dst_link_failure(skb);
913
914 tx_error:
915         dev->stats.tx_errors++;
916         dev_kfree_skb(skb);
917         return NETDEV_TX_OK;
918 }
919
920 static int ipgre_tunnel_bind_dev(struct net_device *dev)
921 {
922         struct net_device *tdev = NULL;
923         struct ip_tunnel *tunnel;
924         const struct iphdr *iph;
925         int hlen = LL_MAX_HEADER;
926         int mtu = ETH_DATA_LEN;
927         int addend = sizeof(struct iphdr) + 4;
928
929         tunnel = netdev_priv(dev);
930         iph = &tunnel->parms.iph;
931
932         /* Guess output device to choose reasonable mtu and needed_headroom */
933
934         if (iph->daddr) {
935                 struct flowi4 fl4;
936                 struct rtable *rt;
937
938                 rt = ip_route_output_gre(dev_net(dev), &fl4,
939                                          iph->daddr, iph->saddr,
940                                          tunnel->parms.o_key,
941                                          RT_TOS(iph->tos),
942                                          tunnel->parms.link);
943                 if (!IS_ERR(rt)) {
944                         tdev = rt->dst.dev;
945                         ip_rt_put(rt);
946                 }
947
948                 if (dev->type != ARPHRD_ETHER)
949                         dev->flags |= IFF_POINTOPOINT;
950         }
951
952         if (!tdev && tunnel->parms.link)
953                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
954
955         if (tdev) {
956                 hlen = tdev->hard_header_len + tdev->needed_headroom;
957                 mtu = tdev->mtu;
958         }
959         dev->iflink = tunnel->parms.link;
960
961         /* Precalculate GRE options length */
962         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
963                 if (tunnel->parms.o_flags&GRE_CSUM)
964                         addend += 4;
965                 if (tunnel->parms.o_flags&GRE_KEY)
966                         addend += 4;
967                 if (tunnel->parms.o_flags&GRE_SEQ)
968                         addend += 4;
969         }
970         dev->needed_headroom = addend + hlen;
971         mtu -= dev->hard_header_len + addend;
972
973         if (mtu < 68)
974                 mtu = 68;
975
976         tunnel->hlen = addend;
977
978         return mtu;
979 }
980
981 static int
982 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
983 {
984         int err = 0;
985         struct ip_tunnel_parm p;
986         struct ip_tunnel *t;
987         struct net *net = dev_net(dev);
988         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
989
990         switch (cmd) {
991         case SIOCGETTUNNEL:
992                 t = NULL;
993                 if (dev == ign->fb_tunnel_dev) {
994                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
995                                 err = -EFAULT;
996                                 break;
997                         }
998                         t = ipgre_tunnel_locate(net, &p, 0);
999                 }
1000                 if (t == NULL)
1001                         t = netdev_priv(dev);
1002                 memcpy(&p, &t->parms, sizeof(p));
1003                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1004                         err = -EFAULT;
1005                 break;
1006
1007         case SIOCADDTUNNEL:
1008         case SIOCCHGTUNNEL:
1009                 err = -EPERM;
1010                 if (!capable(CAP_NET_ADMIN))
1011                         goto done;
1012
1013                 err = -EFAULT;
1014                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1015                         goto done;
1016
1017                 err = -EINVAL;
1018                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1019                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1020                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1021                         goto done;
1022                 if (p.iph.ttl)
1023                         p.iph.frag_off |= htons(IP_DF);
1024
1025                 if (!(p.i_flags&GRE_KEY))
1026                         p.i_key = 0;
1027                 if (!(p.o_flags&GRE_KEY))
1028                         p.o_key = 0;
1029
1030                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1031
1032                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1033                         if (t != NULL) {
1034                                 if (t->dev != dev) {
1035                                         err = -EEXIST;
1036                                         break;
1037                                 }
1038                         } else {
1039                                 unsigned int nflags = 0;
1040
1041                                 t = netdev_priv(dev);
1042
1043                                 if (ipv4_is_multicast(p.iph.daddr))
1044                                         nflags = IFF_BROADCAST;
1045                                 else if (p.iph.daddr)
1046                                         nflags = IFF_POINTOPOINT;
1047
1048                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1049                                         err = -EINVAL;
1050                                         break;
1051                                 }
1052                                 ipgre_tunnel_unlink(ign, t);
1053                                 synchronize_net();
1054                                 t->parms.iph.saddr = p.iph.saddr;
1055                                 t->parms.iph.daddr = p.iph.daddr;
1056                                 t->parms.i_key = p.i_key;
1057                                 t->parms.o_key = p.o_key;
1058                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1059                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1060                                 ipgre_tunnel_link(ign, t);
1061                                 netdev_state_change(dev);
1062                         }
1063                 }
1064
1065                 if (t) {
1066                         err = 0;
1067                         if (cmd == SIOCCHGTUNNEL) {
1068                                 t->parms.iph.ttl = p.iph.ttl;
1069                                 t->parms.iph.tos = p.iph.tos;
1070                                 t->parms.iph.frag_off = p.iph.frag_off;
1071                                 if (t->parms.link != p.link) {
1072                                         t->parms.link = p.link;
1073                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1074                                         netdev_state_change(dev);
1075                                 }
1076                         }
1077                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1078                                 err = -EFAULT;
1079                 } else
1080                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1081                 break;
1082
1083         case SIOCDELTUNNEL:
1084                 err = -EPERM;
1085                 if (!capable(CAP_NET_ADMIN))
1086                         goto done;
1087
1088                 if (dev == ign->fb_tunnel_dev) {
1089                         err = -EFAULT;
1090                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1091                                 goto done;
1092                         err = -ENOENT;
1093                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1094                                 goto done;
1095                         err = -EPERM;
1096                         if (t == netdev_priv(ign->fb_tunnel_dev))
1097                                 goto done;
1098                         dev = t->dev;
1099                 }
1100                 unregister_netdevice(dev);
1101                 err = 0;
1102                 break;
1103
1104         default:
1105                 err = -EINVAL;
1106         }
1107
1108 done:
1109         return err;
1110 }
1111
1112 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1113 {
1114         struct ip_tunnel *tunnel = netdev_priv(dev);
1115         if (new_mtu < 68 ||
1116             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1117                 return -EINVAL;
1118         dev->mtu = new_mtu;
1119         return 0;
1120 }
1121
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could do something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */
1150
1151 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1152                         unsigned short type,
1153                         const void *daddr, const void *saddr, unsigned int len)
1154 {
1155         struct ip_tunnel *t = netdev_priv(dev);
1156         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1157         __be16 *p = (__be16*)(iph+1);
1158
1159         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1160         p[0]            = t->parms.o_flags;
1161         p[1]            = htons(type);
1162
1163         /*
1164          *      Set the source hardware address.
1165          */
1166
1167         if (saddr)
1168                 memcpy(&iph->saddr, saddr, 4);
1169         if (daddr)
1170                 memcpy(&iph->daddr, daddr, 4);
1171         if (iph->daddr)
1172                 return t->hlen;
1173
1174         return -t->hlen;
1175 }
1176
1177 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1178 {
1179         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1180         memcpy(haddr, &iph->saddr, 4);
1181         return 4;
1182 }
1183
1184 static const struct header_ops ipgre_header_ops = {
1185         .create = ipgre_header,
1186         .parse  = ipgre_header_parse,
1187 };
1188
1189 #ifdef CONFIG_NET_IPGRE_BROADCAST
1190 static int ipgre_open(struct net_device *dev)
1191 {
1192         struct ip_tunnel *t = netdev_priv(dev);
1193
1194         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1195                 struct flowi4 fl4;
1196                 struct rtable *rt;
1197
1198                 rt = ip_route_output_gre(dev_net(dev), &fl4,
1199                                          t->parms.iph.daddr,
1200                                          t->parms.iph.saddr,
1201                                          t->parms.o_key,
1202                                          RT_TOS(t->parms.iph.tos),
1203                                          t->parms.link);
1204                 if (IS_ERR(rt))
1205                         return -EADDRNOTAVAIL;
1206                 dev = rt->dst.dev;
1207                 ip_rt_put(rt);
1208                 if (__in_dev_get_rtnl(dev) == NULL)
1209                         return -EADDRNOTAVAIL;
1210                 t->mlink = dev->ifindex;
1211                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1212         }
1213         return 0;
1214 }
1215
1216 static int ipgre_close(struct net_device *dev)
1217 {
1218         struct ip_tunnel *t = netdev_priv(dev);
1219
1220         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1221                 struct in_device *in_dev;
1222                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1223                 if (in_dev)
1224                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1225         }
1226         return 0;
1227 }
1228
1229 #endif
1230
1231 static const struct net_device_ops ipgre_netdev_ops = {
1232         .ndo_init               = ipgre_tunnel_init,
1233         .ndo_uninit             = ipgre_tunnel_uninit,
1234 #ifdef CONFIG_NET_IPGRE_BROADCAST
1235         .ndo_open               = ipgre_open,
1236         .ndo_stop               = ipgre_close,
1237 #endif
1238         .ndo_start_xmit         = ipgre_tunnel_xmit,
1239         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1240         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1241         .ndo_get_stats          = ipgre_get_stats,
1242 };
1243
1244 static void ipgre_dev_free(struct net_device *dev)
1245 {
1246         free_percpu(dev->tstats);
1247         free_netdev(dev);
1248 }
1249
1250 static void ipgre_tunnel_setup(struct net_device *dev)
1251 {
1252         dev->netdev_ops         = &ipgre_netdev_ops;
1253         dev->destructor         = ipgre_dev_free;
1254
1255         dev->type               = ARPHRD_IPGRE;
1256         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1257         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1258         dev->flags              = IFF_NOARP;
1259         dev->iflink             = 0;
1260         dev->addr_len           = 4;
1261         dev->features           |= NETIF_F_NETNS_LOCAL;
1262         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1263 }
1264
1265 static int ipgre_tunnel_init(struct net_device *dev)
1266 {
1267         struct ip_tunnel *tunnel;
1268         struct iphdr *iph;
1269
1270         tunnel = netdev_priv(dev);
1271         iph = &tunnel->parms.iph;
1272
1273         tunnel->dev = dev;
1274         strcpy(tunnel->parms.name, dev->name);
1275
1276         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1277         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1278
1279         if (iph->daddr) {
1280 #ifdef CONFIG_NET_IPGRE_BROADCAST
1281                 if (ipv4_is_multicast(iph->daddr)) {
1282                         if (!iph->saddr)
1283                                 return -EINVAL;
1284                         dev->flags = IFF_BROADCAST;
1285                         dev->header_ops = &ipgre_header_ops;
1286                 }
1287 #endif
1288         } else
1289                 dev->header_ops = &ipgre_header_ops;
1290
1291         dev->tstats = alloc_percpu(struct pcpu_tstats);
1292         if (!dev->tstats)
1293                 return -ENOMEM;
1294
1295         return 0;
1296 }
1297
1298 static void ipgre_fb_tunnel_init(struct net_device *dev)
1299 {
1300         struct ip_tunnel *tunnel = netdev_priv(dev);
1301         struct iphdr *iph = &tunnel->parms.iph;
1302
1303         tunnel->dev = dev;
1304         strcpy(tunnel->parms.name, dev->name);
1305
1306         iph->version            = 4;
1307         iph->protocol           = IPPROTO_GRE;
1308         iph->ihl                = 5;
1309         tunnel->hlen            = sizeof(struct iphdr) + 4;
1310
1311         dev_hold(dev);
1312 }
1313
1314
1315 static const struct gre_protocol ipgre_protocol = {
1316         .handler     = ipgre_rcv,
1317         .err_handler = ipgre_err,
1318 };
1319
1320 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1321 {
1322         int prio;
1323
1324         for (prio = 0; prio < 4; prio++) {
1325                 int h;
1326                 for (h = 0; h < HASH_SIZE; h++) {
1327                         struct ip_tunnel *t;
1328
1329                         t = rtnl_dereference(ign->tunnels[prio][h]);
1330
1331                         while (t != NULL) {
1332                                 unregister_netdevice_queue(t->dev, head);
1333                                 t = rtnl_dereference(t->next);
1334                         }
1335                 }
1336         }
1337 }
1338
1339 static int __net_init ipgre_init_net(struct net *net)
1340 {
1341         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1342         int err;
1343
1344         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1345                                            ipgre_tunnel_setup);
1346         if (!ign->fb_tunnel_dev) {
1347                 err = -ENOMEM;
1348                 goto err_alloc_dev;
1349         }
1350         dev_net_set(ign->fb_tunnel_dev, net);
1351
1352         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1353         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1354
1355         if ((err = register_netdev(ign->fb_tunnel_dev)))
1356                 goto err_reg_dev;
1357
1358         rcu_assign_pointer(ign->tunnels_wc[0],
1359                            netdev_priv(ign->fb_tunnel_dev));
1360         return 0;
1361
1362 err_reg_dev:
1363         ipgre_dev_free(ign->fb_tunnel_dev);
1364 err_alloc_dev:
1365         return err;
1366 }
1367
1368 static void __net_exit ipgre_exit_net(struct net *net)
1369 {
1370         struct ipgre_net *ign;
1371         LIST_HEAD(list);
1372
1373         ign = net_generic(net, ipgre_net_id);
1374         rtnl_lock();
1375         ipgre_destroy_tunnels(ign, &list);
1376         unregister_netdevice_many(&list);
1377         rtnl_unlock();
1378 }
1379
1380 static struct pernet_operations ipgre_net_ops = {
1381         .init = ipgre_init_net,
1382         .exit = ipgre_exit_net,
1383         .id   = &ipgre_net_id,
1384         .size = sizeof(struct ipgre_net),
1385 };
1386
1387 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1388 {
1389         __be16 flags;
1390
1391         if (!data)
1392                 return 0;
1393
1394         flags = 0;
1395         if (data[IFLA_GRE_IFLAGS])
1396                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1397         if (data[IFLA_GRE_OFLAGS])
1398                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1399         if (flags & (GRE_VERSION|GRE_ROUTING))
1400                 return -EINVAL;
1401
1402         return 0;
1403 }
1404
1405 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1406 {
1407         __be32 daddr;
1408
1409         if (tb[IFLA_ADDRESS]) {
1410                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1411                         return -EINVAL;
1412                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1413                         return -EADDRNOTAVAIL;
1414         }
1415
1416         if (!data)
1417                 goto out;
1418
1419         if (data[IFLA_GRE_REMOTE]) {
1420                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1421                 if (!daddr)
1422                         return -EINVAL;
1423         }
1424
1425 out:
1426         return ipgre_tunnel_validate(tb, data);
1427 }
1428
1429 static void ipgre_netlink_parms(struct nlattr *data[],
1430                                 struct ip_tunnel_parm *parms)
1431 {
1432         memset(parms, 0, sizeof(*parms));
1433
1434         parms->iph.protocol = IPPROTO_GRE;
1435
1436         if (!data)
1437                 return;
1438
1439         if (data[IFLA_GRE_LINK])
1440                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1441
1442         if (data[IFLA_GRE_IFLAGS])
1443                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1444
1445         if (data[IFLA_GRE_OFLAGS])
1446                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1447
1448         if (data[IFLA_GRE_IKEY])
1449                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1450
1451         if (data[IFLA_GRE_OKEY])
1452                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1453
1454         if (data[IFLA_GRE_LOCAL])
1455                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1456
1457         if (data[IFLA_GRE_REMOTE])
1458                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1459
1460         if (data[IFLA_GRE_TTL])
1461                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1462
1463         if (data[IFLA_GRE_TOS])
1464                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1465
1466         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1467                 parms->iph.frag_off = htons(IP_DF);
1468 }
1469
1470 static int ipgre_tap_init(struct net_device *dev)
1471 {
1472         struct ip_tunnel *tunnel;
1473
1474         tunnel = netdev_priv(dev);
1475
1476         tunnel->dev = dev;
1477         strcpy(tunnel->parms.name, dev->name);
1478
1479         ipgre_tunnel_bind_dev(dev);
1480
1481         dev->tstats = alloc_percpu(struct pcpu_tstats);
1482         if (!dev->tstats)
1483                 return -ENOMEM;
1484
1485         return 0;
1486 }
1487
1488 static const struct net_device_ops ipgre_tap_netdev_ops = {
1489         .ndo_init               = ipgre_tap_init,
1490         .ndo_uninit             = ipgre_tunnel_uninit,
1491         .ndo_start_xmit         = ipgre_tunnel_xmit,
1492         .ndo_set_mac_address    = eth_mac_addr,
1493         .ndo_validate_addr      = eth_validate_addr,
1494         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1495         .ndo_get_stats          = ipgre_get_stats,
1496 };
1497
1498 static void ipgre_tap_setup(struct net_device *dev)
1499 {
1500
1501         ether_setup(dev);
1502
1503         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1504         dev->destructor         = ipgre_dev_free;
1505
1506         dev->iflink             = 0;
1507         dev->features           |= NETIF_F_NETNS_LOCAL;
1508 }
1509
1510 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1511                          struct nlattr *data[])
1512 {
1513         struct ip_tunnel *nt;
1514         struct net *net = dev_net(dev);
1515         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1516         int mtu;
1517         int err;
1518
1519         nt = netdev_priv(dev);
1520         ipgre_netlink_parms(data, &nt->parms);
1521
1522         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1523                 return -EEXIST;
1524
1525         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1526                 random_ether_addr(dev->dev_addr);
1527
1528         mtu = ipgre_tunnel_bind_dev(dev);
1529         if (!tb[IFLA_MTU])
1530                 dev->mtu = mtu;
1531
1532         /* Can use a lockless transmit, unless we generate output sequences */
1533         if (!(nt->parms.o_flags & GRE_SEQ))
1534                 dev->features |= NETIF_F_LLTX;
1535
1536         err = register_netdevice(dev);
1537         if (err)
1538                 goto out;
1539
1540         dev_hold(dev);
1541         ipgre_tunnel_link(ign, nt);
1542
1543 out:
1544         return err;
1545 }
1546
1547 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1548                             struct nlattr *data[])
1549 {
1550         struct ip_tunnel *t, *nt;
1551         struct net *net = dev_net(dev);
1552         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1553         struct ip_tunnel_parm p;
1554         int mtu;
1555
1556         if (dev == ign->fb_tunnel_dev)
1557                 return -EINVAL;
1558
1559         nt = netdev_priv(dev);
1560         ipgre_netlink_parms(data, &p);
1561
1562         t = ipgre_tunnel_locate(net, &p, 0);
1563
1564         if (t) {
1565                 if (t->dev != dev)
1566                         return -EEXIST;
1567         } else {
1568                 t = nt;
1569
1570                 if (dev->type != ARPHRD_ETHER) {
1571                         unsigned int nflags = 0;
1572
1573                         if (ipv4_is_multicast(p.iph.daddr))
1574                                 nflags = IFF_BROADCAST;
1575                         else if (p.iph.daddr)
1576                                 nflags = IFF_POINTOPOINT;
1577
1578                         if ((dev->flags ^ nflags) &
1579                             (IFF_POINTOPOINT | IFF_BROADCAST))
1580                                 return -EINVAL;
1581                 }
1582
1583                 ipgre_tunnel_unlink(ign, t);
1584                 t->parms.iph.saddr = p.iph.saddr;
1585                 t->parms.iph.daddr = p.iph.daddr;
1586                 t->parms.i_key = p.i_key;
1587                 if (dev->type != ARPHRD_ETHER) {
1588                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1589                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1590                 }
1591                 ipgre_tunnel_link(ign, t);
1592                 netdev_state_change(dev);
1593         }
1594
1595         t->parms.o_key = p.o_key;
1596         t->parms.iph.ttl = p.iph.ttl;
1597         t->parms.iph.tos = p.iph.tos;
1598         t->parms.iph.frag_off = p.iph.frag_off;
1599
1600         if (t->parms.link != p.link) {
1601                 t->parms.link = p.link;
1602                 mtu = ipgre_tunnel_bind_dev(dev);
1603                 if (!tb[IFLA_MTU])
1604                         dev->mtu = mtu;
1605                 netdev_state_change(dev);
1606         }
1607
1608         return 0;
1609 }
1610
1611 static size_t ipgre_get_size(const struct net_device *dev)
1612 {
1613         return
1614                 /* IFLA_GRE_LINK */
1615                 nla_total_size(4) +
1616                 /* IFLA_GRE_IFLAGS */
1617                 nla_total_size(2) +
1618                 /* IFLA_GRE_OFLAGS */
1619                 nla_total_size(2) +
1620                 /* IFLA_GRE_IKEY */
1621                 nla_total_size(4) +
1622                 /* IFLA_GRE_OKEY */
1623                 nla_total_size(4) +
1624                 /* IFLA_GRE_LOCAL */
1625                 nla_total_size(4) +
1626                 /* IFLA_GRE_REMOTE */
1627                 nla_total_size(4) +
1628                 /* IFLA_GRE_TTL */
1629                 nla_total_size(1) +
1630                 /* IFLA_GRE_TOS */
1631                 nla_total_size(1) +
1632                 /* IFLA_GRE_PMTUDISC */
1633                 nla_total_size(1) +
1634                 0;
1635 }
1636
1637 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1638 {
1639         struct ip_tunnel *t = netdev_priv(dev);
1640         struct ip_tunnel_parm *p = &t->parms;
1641
1642         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1643         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1644         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1645         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1646         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1647         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1648         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1649         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1650         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1651         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1652
1653         return 0;
1654
1655 nla_put_failure:
1656         return -EMSGSIZE;
1657 }
1658
1659 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1660         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1661         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1662         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1663         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1664         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1665         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1666         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1667         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1668         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1669         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1670 };
1671
1672 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1673         .kind           = "gre",
1674         .maxtype        = IFLA_GRE_MAX,
1675         .policy         = ipgre_policy,
1676         .priv_size      = sizeof(struct ip_tunnel),
1677         .setup          = ipgre_tunnel_setup,
1678         .validate       = ipgre_tunnel_validate,
1679         .newlink        = ipgre_newlink,
1680         .changelink     = ipgre_changelink,
1681         .get_size       = ipgre_get_size,
1682         .fill_info      = ipgre_fill_info,
1683 };
1684
1685 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1686         .kind           = "gretap",
1687         .maxtype        = IFLA_GRE_MAX,
1688         .policy         = ipgre_policy,
1689         .priv_size      = sizeof(struct ip_tunnel),
1690         .setup          = ipgre_tap_setup,
1691         .validate       = ipgre_tap_validate,
1692         .newlink        = ipgre_newlink,
1693         .changelink     = ipgre_changelink,
1694         .get_size       = ipgre_get_size,
1695         .fill_info      = ipgre_fill_info,
1696 };
1697
/*
 *      And now the module code and kernel interface.
 */
1701
1702 static int __init ipgre_init(void)
1703 {
1704         int err;
1705
1706         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1707
1708         err = register_pernet_device(&ipgre_net_ops);
1709         if (err < 0)
1710                 return err;
1711
1712         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1713         if (err < 0) {
1714                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1715                 goto add_proto_failed;
1716         }
1717
1718         err = rtnl_link_register(&ipgre_link_ops);
1719         if (err < 0)
1720                 goto rtnl_link_failed;
1721
1722         err = rtnl_link_register(&ipgre_tap_ops);
1723         if (err < 0)
1724                 goto tap_ops_failed;
1725
1726 out:
1727         return err;
1728
1729 tap_ops_failed:
1730         rtnl_link_unregister(&ipgre_link_ops);
1731 rtnl_link_failed:
1732         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1733 add_proto_failed:
1734         unregister_pernet_device(&ipgre_net_ops);
1735         goto out;
1736 }
1737
1738 static void __exit ipgre_fini(void)
1739 {
1740         rtnl_link_unregister(&ipgre_tap_ops);
1741         rtnl_link_unregister(&ipgre_link_ops);
1742         if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1743                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1744         unregister_pernet_device(&ipgre_net_ops);
1745 }
1746
/* Module entry and exit points. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Aliases matching the two rtnl link kinds defined above ("gre", "gretap"). */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
/* NOTE(review): presumably allows autoloading when netdev "gre0" is requested -- confirm. */
MODULE_ALIAS_NETDEV("gre0");