Merge branches 'stable/ia64', 'stable/blkfront-cleanup' and 'stable/cleanup' of git...
[pandora-kernel.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48
49 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 /*
56    Problems & solutions
57    --------------------
58
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is a good
68    solution, but it supposes maintaining a new variable in ALL
69    skb, even if no tunneling is used.
70
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, since when we enter the first ndo_xmit(), cpu migration is
73    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
74
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
94    taking into account fragmentation. To be short, it is not a solution at all.
95
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    quickly degrades to a value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108
109
110
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117
118    Alexey Kuznetsov.
119  */
120
/*
 * Forward declarations. ipgre_link_ops, ipgre_tunnel_setup() and
 * ipgre_tunnel_bind_dev() are referenced by ipgre_tunnel_locate() below,
 * ahead of their definitions later in the file.
 */
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125
126 /* Fallback tunnel: no source, no destination, no key, no options */
127
/* Number of buckets in each of the four tunnel hash tables. */
128 #define HASH_SIZE  16
129
/* Id handed to net_generic() to fetch this module's per-namespace state. */
130 static int ipgre_net_id __read_mostly;
/*
 * Per-network-namespace GRE state: four RCU-annotated hash tables of tunnels
 * (first index selects the table, second the bucket) and the fallback
 * tunnel device used when no configured tunnel matches.
 */
131 struct ipgre_net {
132         struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
133
134         struct net_device *fb_tunnel_dev;
135 };
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if not matched by configured keyless tunnels,
152    will match fallback tunnel.
153  */
154
/*
 * Fold a 32-bit value (an address or a GRE key) into a 4-bit bucket index by
 * XOR-ing it with itself shifted right by four bits and keeping the low nibble.
 *
 * The argument is parenthesized so that an expression argument is cast and
 * shifted as a whole. Note that it is still evaluated twice, so do not pass
 * an expression with side effects.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
156
/*
 * Shorthand for the four hash tables inside struct ipgre_net, named after the
 * endpoints a tunnel has configured: r = remote, l = local, wc = wildcard
 * (neither). The numeric index matches the table list in the comment above.
 */
157 #define tunnels_r_l     tunnels[3]
158 #define tunnels_r       tunnels[2]
159 #define tunnels_l       tunnels[1]
160 #define tunnels_wc      tunnels[0]
161 /*
162  * Locking : hash tables are protected by RCU and RTNL
163  */
164
/*
 * Walk a tunnel hash chain starting at 'start', following ->next with
 * rcu_dereference().
 *
 * The macro does not declare its cursor: it assigns to a variable named 't'
 * that must already exist in the enclosing scope. 't' is NULL when the loop
 * runs to completion.
 */
165 #define for_each_ip_tunnel_rcu(start) \
166         for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167
168 /* often modified stats are per cpu, other are shared (netdev->stats) */
/*
 * One instance exists per CPU (reached through dev->tstats);
 * ipgre_get_stats() adds all instances together into dev->stats.
 */
169 struct pcpu_tstats {
170         unsigned long   rx_packets;
171         unsigned long   rx_bytes;
172         unsigned long   tx_packets;
173         unsigned long   tx_bytes;
174 };
175
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178         struct pcpu_tstats sum = { 0 };
179         int i;
180
181         for_each_possible_cpu(i) {
182                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184                 sum.rx_packets += tstats->rx_packets;
185                 sum.rx_bytes   += tstats->rx_bytes;
186                 sum.tx_packets += tstats->tx_packets;
187                 sum.tx_bytes   += tstats->tx_bytes;
188         }
189         dev->stats.rx_packets = sum.rx_packets;
190         dev->stats.rx_bytes   = sum.rx_bytes;
191         dev->stats.tx_packets = sum.tx_packets;
192         dev->stats.tx_bytes   = sum.tx_bytes;
193         return &dev->stats;
194 }
195
196 /* Given src, dst and key, find appropriate for input tunnel. */
197
/*
 * Find the tunnel that should handle a packet.
 *
 * @dev:       device the packet was seen on; supplies the network namespace
 *             and the ifindex compared with each tunnel's parms.link
 * @remote:    address compared with each tunnel's parms.iph.daddr
 * @local:     address compared with each tunnel's parms.iph.saddr
 * @key:       GRE key (callers in this file pass 0 for a keyless packet)
 * @gre_proto: GRE protocol field; ETH_P_TEB asks for an ARPHRD_ETHER device,
 *             anything else for ARPHRD_IPGRE
 *
 * The four hash tables are searched from most to least specific. A tunnel is
 * only considered when its device is IFF_UP, its i_key equals @key, and its
 * device type is either ARPHRD_IPGRE or the wanted type. Every such tunnel is
 * scored: bit 0 is set when parms.link differs from the ingress ifindex,
 * bit 1 when the device type is not the wanted one. A score of 0 is returned
 * at once; otherwise the lowest-scoring tunnel over all tables wins (the
 * first one found on a tie). If nothing matched, the fallback tunnel is
 * returned when its device is up, else NULL.
 *
 * Chains are walked with rcu_dereference(); the callers in this file wrap the
 * call in rcu_read_lock()/rcu_read_unlock().
 */
198 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
199                                               __be32 remote, __be32 local,
200                                               __be32 key, __be16 gre_proto)
201 {
202         struct net *net = dev_net(dev);
203         int link = dev->ifindex;
204         unsigned int h0 = HASH(remote);
205         unsigned int h1 = HASH(key);
206         struct ip_tunnel *t, *cand = NULL;
207         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
208         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
209                        ARPHRD_ETHER : ARPHRD_IPGRE;
        /* Possible scores are 0..3, so 4 means "no candidate yet". */
210         int score, cand_score = 4;
211
        /* Table 3 (remote,local): both addresses and the key must match. */
212         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
213                 if (local != t->parms.iph.saddr ||
214                     remote != t->parms.iph.daddr ||
215                     key != t->parms.i_key ||
216                     !(t->dev->flags & IFF_UP))
217                         continue;
218
                /* Skip devices that are neither plain GRE nor the wanted type. */
219                 if (t->dev->type != ARPHRD_IPGRE &&
220                     t->dev->type != dev_type)
221                         continue;
222
223                 score = 0;
224                 if (t->parms.link != link)
225                         score |= 1;
226                 if (t->dev->type != dev_type)
227                         score |= 2;
228                 if (score == 0)
229                         return t;
230
231                 if (score < cand_score) {
232                         cand = t;
233                         cand_score = score;
234                 }
235         }
236
        /* Table 2 (remote,*): remote address and key must match. */
237         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
238                 if (remote != t->parms.iph.daddr ||
239                     key != t->parms.i_key ||
240                     !(t->dev->flags & IFF_UP))
241                         continue;
242
243                 if (t->dev->type != ARPHRD_IPGRE &&
244                     t->dev->type != dev_type)
245                         continue;
246
247                 score = 0;
248                 if (t->parms.link != link)
249                         score |= 1;
250                 if (t->dev->type != dev_type)
251                         score |= 2;
252                 if (score == 0)
253                         return t;
254
255                 if (score < cand_score) {
256                         cand = t;
257                         cand_score = score;
258                 }
259         }
260
        /*
         * Table 1 (*,local): @local must equal the tunnel's saddr, or be a
         * multicast address equal to the tunnel's daddr; the key must match.
         */
261         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
262                 if ((local != t->parms.iph.saddr &&
263                      (local != t->parms.iph.daddr ||
264                       !ipv4_is_multicast(local))) ||
265                     key != t->parms.i_key ||
266                     !(t->dev->flags & IFF_UP))
267                         continue;
268
269                 if (t->dev->type != ARPHRD_IPGRE &&
270                     t->dev->type != dev_type)
271                         continue;
272
273                 score = 0;
274                 if (t->parms.link != link)
275                         score |= 1;
276                 if (t->dev->type != dev_type)
277                         score |= 2;
278                 if (score == 0)
279                         return t;
280
281                 if (score < cand_score) {
282                         cand = t;
283                         cand_score = score;
284                 }
285         }
286
        /* Table 0 (*,*): only the key has to match. */
287         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
288                 if (t->parms.i_key != key ||
289                     !(t->dev->flags & IFF_UP))
290                         continue;
291
292                 if (t->dev->type != ARPHRD_IPGRE &&
293                     t->dev->type != dev_type)
294                         continue;
295
296                 score = 0;
297                 if (t->parms.link != link)
298                         score |= 1;
299                 if (t->dev->type != dev_type)
300                         score |= 2;
301                 if (score == 0)
302                         return t;
303
304                 if (score < cand_score) {
305                         cand = t;
306                         cand_score = score;
307                 }
308         }
309
        /* No perfect match: take the best imperfect one, if any. */
310         if (cand != NULL)
311                 return cand;
312
        /* Last resort: the fallback tunnel device, provided it is up. */
313         dev = ign->fb_tunnel_dev;
314         if (dev->flags & IFF_UP)
315                 return netdev_priv(dev);
316
317         return NULL;
318 }
319
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321                 struct ip_tunnel_parm *parms)
322 {
323         __be32 remote = parms->iph.daddr;
324         __be32 local = parms->iph.saddr;
325         __be32 key = parms->i_key;
326         unsigned int h = HASH(key);
327         int prio = 0;
328
329         if (local)
330                 prio |= 1;
331         if (remote && !ipv4_is_multicast(remote)) {
332                 prio |= 2;
333                 h ^= HASH(remote);
334         }
335
336         return &ign->tunnels[prio][h];
337 }
338
339 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
340                 struct ip_tunnel *t)
341 {
342         return __ipgre_bucket(ign, &t->parms);
343 }
344
/*
 * Insert tunnel @t at the head of its hash chain.
 *
 * The current head is read with rtnl_dereference(). t->next is set before
 * the chain head is switched to @t, so the new entry already points at the
 * rest of the chain by the time it becomes reachable (lookups walk these
 * chains under RCU).
 */
345 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
346 {
347         struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
348
349         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
350         rcu_assign_pointer(*tp, t);
351 }
352
353 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
354 {
355         struct ip_tunnel __rcu **tp;
356         struct ip_tunnel *iter;
357
358         for (tp = ipgre_bucket(ign, t);
359              (iter = rtnl_dereference(*tp)) != NULL;
360              tp = &iter->next) {
361                 if (t == iter) {
362                         rcu_assign_pointer(*tp, t->next);
363                         break;
364                 }
365         }
366 }
367
368 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
369                                            struct ip_tunnel_parm *parms,
370                                            int type)
371 {
372         __be32 remote = parms->iph.daddr;
373         __be32 local = parms->iph.saddr;
374         __be32 key = parms->i_key;
375         int link = parms->link;
376         struct ip_tunnel *t;
377         struct ip_tunnel __rcu **tp;
378         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
379
380         for (tp = __ipgre_bucket(ign, parms);
381              (t = rtnl_dereference(*tp)) != NULL;
382              tp = &t->next)
383                 if (local == t->parms.iph.saddr &&
384                     remote == t->parms.iph.daddr &&
385                     key == t->parms.i_key &&
386                     link == t->parms.link &&
387                     type == t->dev->type)
388                         break;
389
390         return t;
391 }
392
/*
 * Look up the ARPHRD_IPGRE tunnel described by @parms and optionally create it.
 *
 * @net:    network namespace to search / create in
 * @parms:  tunnel parameters; parms->name, when non-empty, is the device name
 * @create: non-zero to create the tunnel when no match exists
 *
 * Returns the existing or newly created tunnel. Returns NULL when nothing
 * matched and @create is zero, or when allocation, name assignment or
 * device registration fails.
 */
393 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
394                 struct ip_tunnel_parm *parms, int create)
395 {
396         struct ip_tunnel *t, *nt;
397         struct net_device *dev;
398         char name[IFNAMSIZ];
399         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400
401         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
402         if (t || !create)
403                 return t;
404
        /* Use the caller-supplied name, or the "gre%d" template when none is given. */
405         if (parms->name[0])
406                 strlcpy(name, parms->name, IFNAMSIZ);
407         else
408                 strcpy(name, "gre%d");
409
        /* The device's private area holds the struct ip_tunnel. */
410         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
411         if (!dev)
412                 return NULL;
413
414         dev_net_set(dev, net);
415
        /* A '%' in the name marks a template that dev_alloc_name() must expand. */
416         if (strchr(name, '%')) {
417                 if (dev_alloc_name(dev, name) < 0)
418                         goto failed_free;
419         }
420
421         nt = netdev_priv(dev);
422         nt->parms = *parms;
423         dev->rtnl_link_ops = &ipgre_link_ops;
424
        /* ipgre_tunnel_bind_dev() returns the value used as the device MTU. */
425         dev->mtu = ipgre_tunnel_bind_dev(dev);
426
427         if (register_netdevice(dev) < 0)
428                 goto failed_free;
429
        /* Reference for the hash-table entry; ipgre_tunnel_uninit() calls dev_put(). */
430         dev_hold(dev);
431         ipgre_tunnel_link(ign, nt);
432         return nt;
433
434 failed_free:
435         free_netdev(dev);
436         return NULL;
437 }
438
439 static void ipgre_tunnel_uninit(struct net_device *dev)
440 {
441         struct net *net = dev_net(dev);
442         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
443
444         ipgre_tunnel_unlink(ign, netdev_priv(dev));
445         dev_put(dev);
446 }
447
448
449 static void ipgre_err(struct sk_buff *skb, u32 info)
450 {
451
452 /* All the routers (except for Linux) return only
453    8 bytes of packet payload. It means, that precise relaying of
454    ICMP in the real Internet is absolutely infeasible.
455
456    Moreover, Cisco "wise men" put GRE key to the third word
457    in GRE header. It makes impossible maintaining even soft state for keyed
458    GRE tunnels with enabled checksum. Tell them "thank you".
459
460    Well, I wonder, rfc1812 was written by Cisco employee,
461    what the hell these idiots break standrads established
462    by themself???
463  */
464
465         struct iphdr *iph = (struct iphdr *)skb->data;
466         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
467         int grehlen = (iph->ihl<<2) + 4;
468         const int type = icmp_hdr(skb)->type;
469         const int code = icmp_hdr(skb)->code;
470         struct ip_tunnel *t;
471         __be16 flags;
472
473         flags = p[0];
474         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
475                 if (flags&(GRE_VERSION|GRE_ROUTING))
476                         return;
477                 if (flags&GRE_KEY) {
478                         grehlen += 4;
479                         if (flags&GRE_CSUM)
480                                 grehlen += 4;
481                 }
482         }
483
484         /* If only 8 bytes returned, keyed message will be dropped here */
485         if (skb_headlen(skb) < grehlen)
486                 return;
487
488         switch (type) {
489         default:
490         case ICMP_PARAMETERPROB:
491                 return;
492
493         case ICMP_DEST_UNREACH:
494                 switch (code) {
495                 case ICMP_SR_FAILED:
496                 case ICMP_PORT_UNREACH:
497                         /* Impossible event. */
498                         return;
499                 case ICMP_FRAG_NEEDED:
500                         /* Soft state for pmtu is maintained by IP core. */
501                         return;
502                 default:
503                         /* All others are translated to HOST_UNREACH.
504                            rfc2003 contains "deep thoughts" about NET_UNREACH,
505                            I believe they are just ether pollution. --ANK
506                          */
507                         break;
508                 }
509                 break;
510         case ICMP_TIME_EXCEEDED:
511                 if (code != ICMP_EXC_TTL)
512                         return;
513                 break;
514         }
515
516         rcu_read_lock();
517         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
518                                 flags & GRE_KEY ?
519                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
520                                 p[1]);
521         if (t == NULL || t->parms.iph.daddr == 0 ||
522             ipv4_is_multicast(t->parms.iph.daddr))
523                 goto out;
524
525         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
526                 goto out;
527
528         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
529                 t->err_count++;
530         else
531                 t->err_count = 1;
532         t->err_time = jiffies;
533 out:
534         rcu_read_unlock();
535 }
536
537 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
538 {
539         if (INET_ECN_is_ce(iph->tos)) {
540                 if (skb->protocol == htons(ETH_P_IP)) {
541                         IP_ECN_set_ce(ip_hdr(skb));
542                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
543                         IP6_ECN_set_ce(ipv6_hdr(skb));
544                 }
545         }
546 }
547
548 static inline u8
549 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
550 {
551         u8 inner = 0;
552         if (skb->protocol == htons(ETH_P_IP))
553                 inner = old_iph->tos;
554         else if (skb->protocol == htons(ETH_P_IPV6))
555                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
556         return INET_ECN_encapsulate(tos, inner);
557 }
558
/*
 * Receive handler for GRE packets.
 *
 * Parses the GRE flags and the optional checksum, key and sequence words,
 * looks up the matching tunnel, strips the GRE header and passes the inner
 * packet to netif_rx() on the tunnel device. Packets that fail validation are
 * freed; when no tunnel matches, an ICMP port-unreachable is sent before the
 * packet is freed. Always returns 0.
 */
559 static int ipgre_rcv(struct sk_buff *skb)
560 {
561         struct iphdr *iph;
562         u8     *h;
563         __be16    flags;
564         __sum16   csum = 0;
565         __be32 key = 0;
566         u32    seqno = 0;
567         struct ip_tunnel *tunnel;
        /* Bytes of GRE header to strip; starts at the 4-byte flags/protocol word. */
568         int    offset = 4;
569         __be16 gre_proto;
570
        /* 16 = 4 base bytes + checksum + key + sequence words, the most parsed below. */
571         if (!pskb_may_pull(skb, 16))
572                 goto drop_nolock;
573
574         iph = ip_hdr(skb);
575         h = skb->data;
576         flags = *(__be16*)h;
577
578         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
579                 /* - Version must be 0.
580                    - We do not support routing headers.
581                  */
582                 if (flags&(GRE_VERSION|GRE_ROUTING))
583                         goto drop_nolock;
584
585                 if (flags&GRE_CSUM) {
                        /*
                         * csum ends up non-zero when verification fails; it is
                         * acted on later, once the tunnel is known.
                         */
586                         switch (skb->ip_summed) {
587                         case CHECKSUM_COMPLETE:
588                                 csum = csum_fold(skb->csum);
589                                 if (!csum)
590                                         break;
591                                 /* fall through */
592                         case CHECKSUM_NONE:
593                                 skb->csum = 0;
594                                 csum = __skb_checksum_complete(skb);
595                                 skb->ip_summed = CHECKSUM_COMPLETE;
596                         }
597                         offset += 4;
598                 }
599                 if (flags&GRE_KEY) {
600                         key = *(__be32*)(h + offset);
601                         offset += 4;
602                 }
603                 if (flags&GRE_SEQ) {
604                         seqno = ntohl(*(__be32*)(h + offset));
605                         offset += 4;
606                 }
607         }
608
        /* The protocol type is the second 16-bit word of the GRE header. */
609         gre_proto = *(__be16 *)(h + 2);
610
611         rcu_read_lock();
612         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
613                                           iph->saddr, iph->daddr, key,
614                                           gre_proto))) {
615                 struct pcpu_tstats *tstats;
616
617                 secpath_reset(skb);
618
619                 skb->protocol = gre_proto;
620                 /* WCCP version 1 and 2 protocol decoding.
621                  * - Change protocol to IP
622                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
623                  */
624                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
625                         skb->protocol = htons(ETH_P_IP);
                        /* A payload not starting with an IPv4 version nibble has 4 extra bytes. */
626                         if ((*(h + offset) & 0xF0) != 0x40)
627                                 offset += 4;
628                 }
629
                /* Strip the GRE header and adjust a CHECKSUM_COMPLETE sum for it. */
630                 skb->mac_header = skb->network_header;
631                 __pskb_pull(skb, offset);
632                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
633                 skb->pkt_type = PACKET_HOST;
634 #ifdef CONFIG_NET_IPGRE_BROADCAST
635                 if (ipv4_is_multicast(iph->daddr)) {
636                         /* Looped back packet, drop it! */
637                         if (rt_is_output_route(skb_rtable(skb)))
638                                 goto drop;
639                         tunnel->dev->stats.multicast++;
640                         skb->pkt_type = PACKET_BROADCAST;
641                 }
642 #endif
643
                /*
                 * Drop when the checksum was present but wrong, or when the
                 * tunnel requires one and the packet carried none.
                 */
644                 if (((flags&GRE_CSUM) && csum) ||
645                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
646                         tunnel->dev->stats.rx_crc_errors++;
647                         tunnel->dev->stats.rx_errors++;
648                         goto drop;
649                 }
                /*
                 * Sequence checking: the packet must carry a sequence number
                 * and, once i_seqno is non-zero, must not be older than it
                 * (signed difference, so wrap-around is tolerated).
                 */
650                 if (tunnel->parms.i_flags&GRE_SEQ) {
651                         if (!(flags&GRE_SEQ) ||
652                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
653                                 tunnel->dev->stats.rx_fifo_errors++;
654                                 tunnel->dev->stats.rx_errors++;
655                                 goto drop;
656                         }
657                         tunnel->i_seqno = seqno + 1;
658                 }
659
660                 /* Warning: All skb pointers will be invalidated! */
661                 if (tunnel->dev->type == ARPHRD_ETHER) {
662                         if (!pskb_may_pull(skb, ETH_HLEN)) {
663                                 tunnel->dev->stats.rx_length_errors++;
664                                 tunnel->dev->stats.rx_errors++;
665                                 goto drop;
666                         }
667
                        /* Re-read the outer header pointer after the pull above. */
668                         iph = ip_hdr(skb);
669                         skb->protocol = eth_type_trans(skb, tunnel->dev);
670                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
671                 }
672
                /* rx counters are kept per CPU; see ipgre_get_stats(). */
673                 tstats = this_cpu_ptr(tunnel->dev->tstats);
674                 tstats->rx_packets++;
675                 tstats->rx_bytes += skb->len;
676
677                 __skb_tunnel_rx(skb, tunnel->dev);
678
                /* iph still refers to the outer header; propagate its CE mark inward. */
679                 skb_reset_network_header(skb);
680                 ipgre_ecn_decapsulate(iph, skb);
681
682                 netif_rx(skb);
683
684                 rcu_read_unlock();
685                 return 0;
686         }
        /* No tunnel accepted the packet: report port unreachable, then drop it. */
687         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
688
689 drop:
690         rcu_read_unlock();
691 drop_nolock:
692         kfree_skb(skb);
693         return 0;
694 }
695
696 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
697 {
698         struct ip_tunnel *tunnel = netdev_priv(dev);
699         struct pcpu_tstats *tstats;
700         struct iphdr  *old_iph = ip_hdr(skb);
701         struct iphdr  *tiph;
702         u8     tos;
703         __be16 df;
704         struct rtable *rt;                      /* Route to the other host */
705         struct net_device *tdev;                /* Device to other host */
706         struct iphdr  *iph;                     /* Our new IP header */
707         unsigned int max_headroom;              /* The extra header space needed */
708         int    gre_hlen;
709         __be32 dst;
710         int    mtu;
711
712         if (dev->type == ARPHRD_ETHER)
713                 IPCB(skb)->flags = 0;
714
715         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
716                 gre_hlen = 0;
717                 tiph = (struct iphdr *)skb->data;
718         } else {
719                 gre_hlen = tunnel->hlen;
720                 tiph = &tunnel->parms.iph;
721         }
722
723         if ((dst = tiph->daddr) == 0) {
724                 /* NBMA tunnel */
725
726                 if (skb_dst(skb) == NULL) {
727                         dev->stats.tx_fifo_errors++;
728                         goto tx_error;
729                 }
730
731                 if (skb->protocol == htons(ETH_P_IP)) {
732                         rt = skb_rtable(skb);
733                         if ((dst = rt->rt_gateway) == 0)
734                                 goto tx_error_icmp;
735                 }
736 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
737                 else if (skb->protocol == htons(ETH_P_IPV6)) {
738                         struct in6_addr *addr6;
739                         int addr_type;
740                         struct neighbour *neigh = skb_dst(skb)->neighbour;
741
742                         if (neigh == NULL)
743                                 goto tx_error;
744
745                         addr6 = (struct in6_addr *)&neigh->primary_key;
746                         addr_type = ipv6_addr_type(addr6);
747
748                         if (addr_type == IPV6_ADDR_ANY) {
749                                 addr6 = &ipv6_hdr(skb)->daddr;
750                                 addr_type = ipv6_addr_type(addr6);
751                         }
752
753                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
754                                 goto tx_error_icmp;
755
756                         dst = addr6->s6_addr32[3];
757                 }
758 #endif
759                 else
760                         goto tx_error;
761         }
762
763         tos = tiph->tos;
764         if (tos == 1) {
765                 tos = 0;
766                 if (skb->protocol == htons(ETH_P_IP))
767                         tos = old_iph->tos;
768                 else if (skb->protocol == htons(ETH_P_IPV6))
769                         tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
770         }
771
772         {
773                 struct flowi fl = {
774                         .oif = tunnel->parms.link,
775                         .fl4_dst = dst,
776                         .fl4_src = tiph->saddr,
777                         .fl4_tos = RT_TOS(tos),
778                         .proto = IPPROTO_GRE,
779                         .fl_gre_key = tunnel->parms.o_key
780                 };
781                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
782                         dev->stats.tx_carrier_errors++;
783                         goto tx_error;
784                 }
785         }
786         tdev = rt->dst.dev;
787
788         if (tdev == dev) {
789                 ip_rt_put(rt);
790                 dev->stats.collisions++;
791                 goto tx_error;
792         }
793
794         df = tiph->frag_off;
795         if (df)
796                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
797         else
798                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
799
800         if (skb_dst(skb))
801                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
802
803         if (skb->protocol == htons(ETH_P_IP)) {
804                 df |= (old_iph->frag_off&htons(IP_DF));
805
806                 if ((old_iph->frag_off&htons(IP_DF)) &&
807                     mtu < ntohs(old_iph->tot_len)) {
808                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
809                         ip_rt_put(rt);
810                         goto tx_error;
811                 }
812         }
813 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
814         else if (skb->protocol == htons(ETH_P_IPV6)) {
815                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
816
817                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
818                         if ((tunnel->parms.iph.daddr &&
819                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
820                             rt6->rt6i_dst.plen == 128) {
821                                 rt6->rt6i_flags |= RTF_MODIFIED;
822                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
823                         }
824                 }
825
826                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
827                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
828                         ip_rt_put(rt);
829                         goto tx_error;
830                 }
831         }
832 #endif
833
834         if (tunnel->err_count > 0) {
835                 if (time_before(jiffies,
836                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
837                         tunnel->err_count--;
838
839                         dst_link_failure(skb);
840                 } else
841                         tunnel->err_count = 0;
842         }
843
844         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
845
846         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
847             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
848                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
849                 if (max_headroom > dev->needed_headroom)
850                         dev->needed_headroom = max_headroom;
851                 if (!new_skb) {
852                         ip_rt_put(rt);
853                         dev->stats.tx_dropped++;
854                         dev_kfree_skb(skb);
855                         return NETDEV_TX_OK;
856                 }
857                 if (skb->sk)
858                         skb_set_owner_w(new_skb, skb->sk);
859                 dev_kfree_skb(skb);
860                 skb = new_skb;
861                 old_iph = ip_hdr(skb);
862         }
863
864         skb_reset_transport_header(skb);
865         skb_push(skb, gre_hlen);
866         skb_reset_network_header(skb);
867         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
868         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
869                               IPSKB_REROUTED);
870         skb_dst_drop(skb);
871         skb_dst_set(skb, &rt->dst);
872
873         /*
874          *      Push down and install the IPIP header.
875          */
876
877         iph                     =       ip_hdr(skb);
878         iph->version            =       4;
879         iph->ihl                =       sizeof(struct iphdr) >> 2;
880         iph->frag_off           =       df;
881         iph->protocol           =       IPPROTO_GRE;
882         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
883         iph->daddr              =       rt->rt_dst;
884         iph->saddr              =       rt->rt_src;
885
886         if ((iph->ttl = tiph->ttl) == 0) {
887                 if (skb->protocol == htons(ETH_P_IP))
888                         iph->ttl = old_iph->ttl;
889 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
890                 else if (skb->protocol == htons(ETH_P_IPV6))
891                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
892 #endif
893                 else
894                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
895         }
896
897         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
898         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
899                                    htons(ETH_P_TEB) : skb->protocol;
900
901         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
902                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
903
904                 if (tunnel->parms.o_flags&GRE_SEQ) {
905                         ++tunnel->o_seqno;
906                         *ptr = htonl(tunnel->o_seqno);
907                         ptr--;
908                 }
909                 if (tunnel->parms.o_flags&GRE_KEY) {
910                         *ptr = tunnel->parms.o_key;
911                         ptr--;
912                 }
913                 if (tunnel->parms.o_flags&GRE_CSUM) {
914                         *ptr = 0;
915                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
916                 }
917         }
918
919         nf_reset(skb);
920         tstats = this_cpu_ptr(dev->tstats);
921         __IPTUNNEL_XMIT(tstats, &dev->stats);
922         return NETDEV_TX_OK;
923
924 tx_error_icmp:
925         dst_link_failure(skb);
926
927 tx_error:
928         dev->stats.tx_errors++;
929         dev_kfree_skb(skb);
930         return NETDEV_TX_OK;
931 }
932
933 static int ipgre_tunnel_bind_dev(struct net_device *dev)
934 {
935         struct net_device *tdev = NULL;
936         struct ip_tunnel *tunnel;
937         struct iphdr *iph;
938         int hlen = LL_MAX_HEADER;
939         int mtu = ETH_DATA_LEN;
940         int addend = sizeof(struct iphdr) + 4;
941
942         tunnel = netdev_priv(dev);
943         iph = &tunnel->parms.iph;
944
945         /* Guess output device to choose reasonable mtu and needed_headroom */
946
947         if (iph->daddr) {
948                 struct flowi fl = {
949                         .oif = tunnel->parms.link,
950                         .fl4_dst = iph->daddr,
951                         .fl4_src = iph->saddr,
952                         .fl4_tos = RT_TOS(iph->tos),
953                         .proto = IPPROTO_GRE,
954                         .fl_gre_key = tunnel->parms.o_key
955                 };
956                 struct rtable *rt;
957
958                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
959                         tdev = rt->dst.dev;
960                         ip_rt_put(rt);
961                 }
962
963                 if (dev->type != ARPHRD_ETHER)
964                         dev->flags |= IFF_POINTOPOINT;
965         }
966
967         if (!tdev && tunnel->parms.link)
968                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
969
970         if (tdev) {
971                 hlen = tdev->hard_header_len + tdev->needed_headroom;
972                 mtu = tdev->mtu;
973         }
974         dev->iflink = tunnel->parms.link;
975
976         /* Precalculate GRE options length */
977         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
978                 if (tunnel->parms.o_flags&GRE_CSUM)
979                         addend += 4;
980                 if (tunnel->parms.o_flags&GRE_KEY)
981                         addend += 4;
982                 if (tunnel->parms.o_flags&GRE_SEQ)
983                         addend += 4;
984         }
985         dev->needed_headroom = addend + hlen;
986         mtu -= dev->hard_header_len + addend;
987
988         if (mtu < 68)
989                 mtu = 68;
990
991         tunnel->hlen = addend;
992
993         return mtu;
994 }
995
996 static int
997 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
998 {
999         int err = 0;
1000         struct ip_tunnel_parm p;
1001         struct ip_tunnel *t;
1002         struct net *net = dev_net(dev);
1003         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1004
1005         switch (cmd) {
1006         case SIOCGETTUNNEL:
1007                 t = NULL;
1008                 if (dev == ign->fb_tunnel_dev) {
1009                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1010                                 err = -EFAULT;
1011                                 break;
1012                         }
1013                         t = ipgre_tunnel_locate(net, &p, 0);
1014                 }
1015                 if (t == NULL)
1016                         t = netdev_priv(dev);
1017                 memcpy(&p, &t->parms, sizeof(p));
1018                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1019                         err = -EFAULT;
1020                 break;
1021
1022         case SIOCADDTUNNEL:
1023         case SIOCCHGTUNNEL:
1024                 err = -EPERM;
1025                 if (!capable(CAP_NET_ADMIN))
1026                         goto done;
1027
1028                 err = -EFAULT;
1029                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1030                         goto done;
1031
1032                 err = -EINVAL;
1033                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1034                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1035                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1036                         goto done;
1037                 if (p.iph.ttl)
1038                         p.iph.frag_off |= htons(IP_DF);
1039
1040                 if (!(p.i_flags&GRE_KEY))
1041                         p.i_key = 0;
1042                 if (!(p.o_flags&GRE_KEY))
1043                         p.o_key = 0;
1044
1045                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1046
1047                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1048                         if (t != NULL) {
1049                                 if (t->dev != dev) {
1050                                         err = -EEXIST;
1051                                         break;
1052                                 }
1053                         } else {
1054                                 unsigned int nflags = 0;
1055
1056                                 t = netdev_priv(dev);
1057
1058                                 if (ipv4_is_multicast(p.iph.daddr))
1059                                         nflags = IFF_BROADCAST;
1060                                 else if (p.iph.daddr)
1061                                         nflags = IFF_POINTOPOINT;
1062
1063                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1064                                         err = -EINVAL;
1065                                         break;
1066                                 }
1067                                 ipgre_tunnel_unlink(ign, t);
1068                                 synchronize_net();
1069                                 t->parms.iph.saddr = p.iph.saddr;
1070                                 t->parms.iph.daddr = p.iph.daddr;
1071                                 t->parms.i_key = p.i_key;
1072                                 t->parms.o_key = p.o_key;
1073                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1074                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1075                                 ipgre_tunnel_link(ign, t);
1076                                 netdev_state_change(dev);
1077                         }
1078                 }
1079
1080                 if (t) {
1081                         err = 0;
1082                         if (cmd == SIOCCHGTUNNEL) {
1083                                 t->parms.iph.ttl = p.iph.ttl;
1084                                 t->parms.iph.tos = p.iph.tos;
1085                                 t->parms.iph.frag_off = p.iph.frag_off;
1086                                 if (t->parms.link != p.link) {
1087                                         t->parms.link = p.link;
1088                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1089                                         netdev_state_change(dev);
1090                                 }
1091                         }
1092                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1093                                 err = -EFAULT;
1094                 } else
1095                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1096                 break;
1097
1098         case SIOCDELTUNNEL:
1099                 err = -EPERM;
1100                 if (!capable(CAP_NET_ADMIN))
1101                         goto done;
1102
1103                 if (dev == ign->fb_tunnel_dev) {
1104                         err = -EFAULT;
1105                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1106                                 goto done;
1107                         err = -ENOENT;
1108                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1109                                 goto done;
1110                         err = -EPERM;
1111                         if (t == netdev_priv(ign->fb_tunnel_dev))
1112                                 goto done;
1113                         dev = t->dev;
1114                 }
1115                 unregister_netdevice(dev);
1116                 err = 0;
1117                 break;
1118
1119         default:
1120                 err = -EINVAL;
1121         }
1122
1123 done:
1124         return err;
1125 }
1126
1127 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1128 {
1129         struct ip_tunnel *tunnel = netdev_priv(dev);
1130         if (new_mtu < 68 ||
1131             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1132                 return -EINVAL;
1133         dev->mtu = new_mtu;
1134         return 0;
1135 }
1136
1137 /* Nice toy. Unfortunately, useless in real life :-)
1138    It allows to construct virtual multiprotocol broadcast "LAN"
1139    over the Internet, provided multicast routing is tuned.
1140
1141
1142    I have no idea was this bicycle invented before me,
1143    so that I had to set ARPHRD_IPGRE to a random value.
1144    I have an impression, that Cisco could make something similar,
1145    but this feature is apparently missing in IOS<=11.2(8).
1146
1147    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1148    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1149
1150    ping -t 255 224.66.66.66
1151
1152    If nobody answers, mbone does not work.
1153
1154    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1155    ip addr add 10.66.66.<somewhat>/24 dev Universe
1156    ifconfig Universe up
1157    ifconfig Universe add fe80::<Your_real_addr>/10
1158    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1159    ftp 10.66.66.66
1160    ...
1161    ftp fec0:6666:6666::193.233.7.65
1162    ...
1163
1164  */
1165
1166 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1167                         unsigned short type,
1168                         const void *daddr, const void *saddr, unsigned int len)
1169 {
1170         struct ip_tunnel *t = netdev_priv(dev);
1171         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1172         __be16 *p = (__be16*)(iph+1);
1173
1174         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1175         p[0]            = t->parms.o_flags;
1176         p[1]            = htons(type);
1177
1178         /*
1179          *      Set the source hardware address.
1180          */
1181
1182         if (saddr)
1183                 memcpy(&iph->saddr, saddr, 4);
1184         if (daddr)
1185                 memcpy(&iph->daddr, daddr, 4);
1186         if (iph->daddr)
1187                 return t->hlen;
1188
1189         return -t->hlen;
1190 }
1191
1192 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1193 {
1194         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1195         memcpy(haddr, &iph->saddr, 4);
1196         return 4;
1197 }
1198
1199 static const struct header_ops ipgre_header_ops = {
1200         .create = ipgre_header,
1201         .parse  = ipgre_header_parse,
1202 };
1203
1204 #ifdef CONFIG_NET_IPGRE_BROADCAST
1205 static int ipgre_open(struct net_device *dev)
1206 {
1207         struct ip_tunnel *t = netdev_priv(dev);
1208
1209         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1210                 struct flowi fl = {
1211                         .oif = t->parms.link,
1212                         .fl4_dst = t->parms.iph.daddr,
1213                         .fl4_src = t->parms.iph.saddr,
1214                         .fl4_tos = RT_TOS(t->parms.iph.tos),
1215                         .proto = IPPROTO_GRE,
1216                         .fl_gre_key = t->parms.o_key
1217                 };
1218                 struct rtable *rt;
1219
1220                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1221                         return -EADDRNOTAVAIL;
1222                 dev = rt->dst.dev;
1223                 ip_rt_put(rt);
1224                 if (__in_dev_get_rtnl(dev) == NULL)
1225                         return -EADDRNOTAVAIL;
1226                 t->mlink = dev->ifindex;
1227                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1228         }
1229         return 0;
1230 }
1231
1232 static int ipgre_close(struct net_device *dev)
1233 {
1234         struct ip_tunnel *t = netdev_priv(dev);
1235
1236         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1237                 struct in_device *in_dev;
1238                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1239                 if (in_dev)
1240                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1241         }
1242         return 0;
1243 }
1244
1245 #endif
1246
1247 static const struct net_device_ops ipgre_netdev_ops = {
1248         .ndo_init               = ipgre_tunnel_init,
1249         .ndo_uninit             = ipgre_tunnel_uninit,
1250 #ifdef CONFIG_NET_IPGRE_BROADCAST
1251         .ndo_open               = ipgre_open,
1252         .ndo_stop               = ipgre_close,
1253 #endif
1254         .ndo_start_xmit         = ipgre_tunnel_xmit,
1255         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1256         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1257         .ndo_get_stats          = ipgre_get_stats,
1258 };
1259
1260 static void ipgre_dev_free(struct net_device *dev)
1261 {
1262         free_percpu(dev->tstats);
1263         free_netdev(dev);
1264 }
1265
1266 static void ipgre_tunnel_setup(struct net_device *dev)
1267 {
1268         dev->netdev_ops         = &ipgre_netdev_ops;
1269         dev->destructor         = ipgre_dev_free;
1270
1271         dev->type               = ARPHRD_IPGRE;
1272         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1273         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1274         dev->flags              = IFF_NOARP;
1275         dev->iflink             = 0;
1276         dev->addr_len           = 4;
1277         dev->features           |= NETIF_F_NETNS_LOCAL;
1278         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1279 }
1280
1281 static int ipgre_tunnel_init(struct net_device *dev)
1282 {
1283         struct ip_tunnel *tunnel;
1284         struct iphdr *iph;
1285
1286         tunnel = netdev_priv(dev);
1287         iph = &tunnel->parms.iph;
1288
1289         tunnel->dev = dev;
1290         strcpy(tunnel->parms.name, dev->name);
1291
1292         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1293         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1294
1295         if (iph->daddr) {
1296 #ifdef CONFIG_NET_IPGRE_BROADCAST
1297                 if (ipv4_is_multicast(iph->daddr)) {
1298                         if (!iph->saddr)
1299                                 return -EINVAL;
1300                         dev->flags = IFF_BROADCAST;
1301                         dev->header_ops = &ipgre_header_ops;
1302                 }
1303 #endif
1304         } else
1305                 dev->header_ops = &ipgre_header_ops;
1306
1307         dev->tstats = alloc_percpu(struct pcpu_tstats);
1308         if (!dev->tstats)
1309                 return -ENOMEM;
1310
1311         return 0;
1312 }
1313
1314 static void ipgre_fb_tunnel_init(struct net_device *dev)
1315 {
1316         struct ip_tunnel *tunnel = netdev_priv(dev);
1317         struct iphdr *iph = &tunnel->parms.iph;
1318
1319         tunnel->dev = dev;
1320         strcpy(tunnel->parms.name, dev->name);
1321
1322         iph->version            = 4;
1323         iph->protocol           = IPPROTO_GRE;
1324         iph->ihl                = 5;
1325         tunnel->hlen            = sizeof(struct iphdr) + 4;
1326
1327         dev_hold(dev);
1328 }
1329
1330
1331 static const struct gre_protocol ipgre_protocol = {
1332         .handler     = ipgre_rcv,
1333         .err_handler = ipgre_err,
1334 };
1335
1336 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1337 {
1338         int prio;
1339
1340         for (prio = 0; prio < 4; prio++) {
1341                 int h;
1342                 for (h = 0; h < HASH_SIZE; h++) {
1343                         struct ip_tunnel *t;
1344
1345                         t = rtnl_dereference(ign->tunnels[prio][h]);
1346
1347                         while (t != NULL) {
1348                                 unregister_netdevice_queue(t->dev, head);
1349                                 t = rtnl_dereference(t->next);
1350                         }
1351                 }
1352         }
1353 }
1354
1355 static int __net_init ipgre_init_net(struct net *net)
1356 {
1357         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1358         int err;
1359
1360         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1361                                            ipgre_tunnel_setup);
1362         if (!ign->fb_tunnel_dev) {
1363                 err = -ENOMEM;
1364                 goto err_alloc_dev;
1365         }
1366         dev_net_set(ign->fb_tunnel_dev, net);
1367
1368         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1369         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1370
1371         if ((err = register_netdev(ign->fb_tunnel_dev)))
1372                 goto err_reg_dev;
1373
1374         rcu_assign_pointer(ign->tunnels_wc[0],
1375                            netdev_priv(ign->fb_tunnel_dev));
1376         return 0;
1377
1378 err_reg_dev:
1379         ipgre_dev_free(ign->fb_tunnel_dev);
1380 err_alloc_dev:
1381         return err;
1382 }
1383
1384 static void __net_exit ipgre_exit_net(struct net *net)
1385 {
1386         struct ipgre_net *ign;
1387         LIST_HEAD(list);
1388
1389         ign = net_generic(net, ipgre_net_id);
1390         rtnl_lock();
1391         ipgre_destroy_tunnels(ign, &list);
1392         unregister_netdevice_many(&list);
1393         rtnl_unlock();
1394 }
1395
1396 static struct pernet_operations ipgre_net_ops = {
1397         .init = ipgre_init_net,
1398         .exit = ipgre_exit_net,
1399         .id   = &ipgre_net_id,
1400         .size = sizeof(struct ipgre_net),
1401 };
1402
1403 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1404 {
1405         __be16 flags;
1406
1407         if (!data)
1408                 return 0;
1409
1410         flags = 0;
1411         if (data[IFLA_GRE_IFLAGS])
1412                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1413         if (data[IFLA_GRE_OFLAGS])
1414                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1415         if (flags & (GRE_VERSION|GRE_ROUTING))
1416                 return -EINVAL;
1417
1418         return 0;
1419 }
1420
1421 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1422 {
1423         __be32 daddr;
1424
1425         if (tb[IFLA_ADDRESS]) {
1426                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1427                         return -EINVAL;
1428                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1429                         return -EADDRNOTAVAIL;
1430         }
1431
1432         if (!data)
1433                 goto out;
1434
1435         if (data[IFLA_GRE_REMOTE]) {
1436                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1437                 if (!daddr)
1438                         return -EINVAL;
1439         }
1440
1441 out:
1442         return ipgre_tunnel_validate(tb, data);
1443 }
1444
1445 static void ipgre_netlink_parms(struct nlattr *data[],
1446                                 struct ip_tunnel_parm *parms)
1447 {
1448         memset(parms, 0, sizeof(*parms));
1449
1450         parms->iph.protocol = IPPROTO_GRE;
1451
1452         if (!data)
1453                 return;
1454
1455         if (data[IFLA_GRE_LINK])
1456                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1457
1458         if (data[IFLA_GRE_IFLAGS])
1459                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1460
1461         if (data[IFLA_GRE_OFLAGS])
1462                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1463
1464         if (data[IFLA_GRE_IKEY])
1465                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1466
1467         if (data[IFLA_GRE_OKEY])
1468                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1469
1470         if (data[IFLA_GRE_LOCAL])
1471                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1472
1473         if (data[IFLA_GRE_REMOTE])
1474                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1475
1476         if (data[IFLA_GRE_TTL])
1477                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1478
1479         if (data[IFLA_GRE_TOS])
1480                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1481
1482         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1483                 parms->iph.frag_off = htons(IP_DF);
1484 }
1485
1486 static int ipgre_tap_init(struct net_device *dev)
1487 {
1488         struct ip_tunnel *tunnel;
1489
1490         tunnel = netdev_priv(dev);
1491
1492         tunnel->dev = dev;
1493         strcpy(tunnel->parms.name, dev->name);
1494
1495         ipgre_tunnel_bind_dev(dev);
1496
1497         dev->tstats = alloc_percpu(struct pcpu_tstats);
1498         if (!dev->tstats)
1499                 return -ENOMEM;
1500
1501         return 0;
1502 }
1503
1504 static const struct net_device_ops ipgre_tap_netdev_ops = {
1505         .ndo_init               = ipgre_tap_init,
1506         .ndo_uninit             = ipgre_tunnel_uninit,
1507         .ndo_start_xmit         = ipgre_tunnel_xmit,
1508         .ndo_set_mac_address    = eth_mac_addr,
1509         .ndo_validate_addr      = eth_validate_addr,
1510         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1511         .ndo_get_stats          = ipgre_get_stats,
1512 };
1513
1514 static void ipgre_tap_setup(struct net_device *dev)
1515 {
1516
1517         ether_setup(dev);
1518
1519         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1520         dev->destructor         = ipgre_dev_free;
1521
1522         dev->iflink             = 0;
1523         dev->features           |= NETIF_F_NETNS_LOCAL;
1524 }
1525
1526 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1527                          struct nlattr *data[])
1528 {
1529         struct ip_tunnel *nt;
1530         struct net *net = dev_net(dev);
1531         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1532         int mtu;
1533         int err;
1534
1535         nt = netdev_priv(dev);
1536         ipgre_netlink_parms(data, &nt->parms);
1537
1538         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1539                 return -EEXIST;
1540
1541         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1542                 random_ether_addr(dev->dev_addr);
1543
1544         mtu = ipgre_tunnel_bind_dev(dev);
1545         if (!tb[IFLA_MTU])
1546                 dev->mtu = mtu;
1547
1548         /* Can use a lockless transmit, unless we generate output sequences */
1549         if (!(nt->parms.o_flags & GRE_SEQ))
1550                 dev->features |= NETIF_F_LLTX;
1551
1552         err = register_netdevice(dev);
1553         if (err)
1554                 goto out;
1555
1556         dev_hold(dev);
1557         ipgre_tunnel_link(ign, nt);
1558
1559 out:
1560         return err;
1561 }
1562
1563 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1564                             struct nlattr *data[])
1565 {
1566         struct ip_tunnel *t, *nt;
1567         struct net *net = dev_net(dev);
1568         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1569         struct ip_tunnel_parm p;
1570         int mtu;
1571
1572         if (dev == ign->fb_tunnel_dev)
1573                 return -EINVAL;
1574
1575         nt = netdev_priv(dev);
1576         ipgre_netlink_parms(data, &p);
1577
1578         t = ipgre_tunnel_locate(net, &p, 0);
1579
1580         if (t) {
1581                 if (t->dev != dev)
1582                         return -EEXIST;
1583         } else {
1584                 t = nt;
1585
1586                 if (dev->type != ARPHRD_ETHER) {
1587                         unsigned int nflags = 0;
1588
1589                         if (ipv4_is_multicast(p.iph.daddr))
1590                                 nflags = IFF_BROADCAST;
1591                         else if (p.iph.daddr)
1592                                 nflags = IFF_POINTOPOINT;
1593
1594                         if ((dev->flags ^ nflags) &
1595                             (IFF_POINTOPOINT | IFF_BROADCAST))
1596                                 return -EINVAL;
1597                 }
1598
1599                 ipgre_tunnel_unlink(ign, t);
1600                 t->parms.iph.saddr = p.iph.saddr;
1601                 t->parms.iph.daddr = p.iph.daddr;
1602                 t->parms.i_key = p.i_key;
1603                 if (dev->type != ARPHRD_ETHER) {
1604                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1605                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1606                 }
1607                 ipgre_tunnel_link(ign, t);
1608                 netdev_state_change(dev);
1609         }
1610
1611         t->parms.o_key = p.o_key;
1612         t->parms.iph.ttl = p.iph.ttl;
1613         t->parms.iph.tos = p.iph.tos;
1614         t->parms.iph.frag_off = p.iph.frag_off;
1615
1616         if (t->parms.link != p.link) {
1617                 t->parms.link = p.link;
1618                 mtu = ipgre_tunnel_bind_dev(dev);
1619                 if (!tb[IFLA_MTU])
1620                         dev->mtu = mtu;
1621                 netdev_state_change(dev);
1622         }
1623
1624         return 0;
1625 }
1626
1627 static size_t ipgre_get_size(const struct net_device *dev)
1628 {
1629         return
1630                 /* IFLA_GRE_LINK */
1631                 nla_total_size(4) +
1632                 /* IFLA_GRE_IFLAGS */
1633                 nla_total_size(2) +
1634                 /* IFLA_GRE_OFLAGS */
1635                 nla_total_size(2) +
1636                 /* IFLA_GRE_IKEY */
1637                 nla_total_size(4) +
1638                 /* IFLA_GRE_OKEY */
1639                 nla_total_size(4) +
1640                 /* IFLA_GRE_LOCAL */
1641                 nla_total_size(4) +
1642                 /* IFLA_GRE_REMOTE */
1643                 nla_total_size(4) +
1644                 /* IFLA_GRE_TTL */
1645                 nla_total_size(1) +
1646                 /* IFLA_GRE_TOS */
1647                 nla_total_size(1) +
1648                 /* IFLA_GRE_PMTUDISC */
1649                 nla_total_size(1) +
1650                 0;
1651 }
1652
1653 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1654 {
1655         struct ip_tunnel *t = netdev_priv(dev);
1656         struct ip_tunnel_parm *p = &t->parms;
1657
1658         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1659         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1660         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1661         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1662         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1663         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1664         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1665         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1666         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1667         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1668
1669         return 0;
1670
1671 nla_put_failure:
1672         return -EMSGSIZE;
1673 }
1674
1675 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1676         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1677         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1678         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1679         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1680         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1681         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1682         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1683         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1684         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1685         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1686 };
1687
1688 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1689         .kind           = "gre",
1690         .maxtype        = IFLA_GRE_MAX,
1691         .policy         = ipgre_policy,
1692         .priv_size      = sizeof(struct ip_tunnel),
1693         .setup          = ipgre_tunnel_setup,
1694         .validate       = ipgre_tunnel_validate,
1695         .newlink        = ipgre_newlink,
1696         .changelink     = ipgre_changelink,
1697         .get_size       = ipgre_get_size,
1698         .fill_info      = ipgre_fill_info,
1699 };
1700
1701 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1702         .kind           = "gretap",
1703         .maxtype        = IFLA_GRE_MAX,
1704         .policy         = ipgre_policy,
1705         .priv_size      = sizeof(struct ip_tunnel),
1706         .setup          = ipgre_tap_setup,
1707         .validate       = ipgre_tap_validate,
1708         .newlink        = ipgre_newlink,
1709         .changelink     = ipgre_changelink,
1710         .get_size       = ipgre_get_size,
1711         .fill_info      = ipgre_fill_info,
1712 };
1713
/*
 *      And now the module code and kernel interface.
 */
1717
1718 static int __init ipgre_init(void)
1719 {
1720         int err;
1721
1722         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1723
1724         err = register_pernet_device(&ipgre_net_ops);
1725         if (err < 0)
1726                 return err;
1727
1728         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1729         if (err < 0) {
1730                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1731                 goto add_proto_failed;
1732         }
1733
1734         err = rtnl_link_register(&ipgre_link_ops);
1735         if (err < 0)
1736                 goto rtnl_link_failed;
1737
1738         err = rtnl_link_register(&ipgre_tap_ops);
1739         if (err < 0)
1740                 goto tap_ops_failed;
1741
1742 out:
1743         return err;
1744
1745 tap_ops_failed:
1746         rtnl_link_unregister(&ipgre_link_ops);
1747 rtnl_link_failed:
1748         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1749 add_proto_failed:
1750         unregister_pernet_device(&ipgre_net_ops);
1751         goto out;
1752 }
1753
1754 static void __exit ipgre_fini(void)
1755 {
1756         rtnl_link_unregister(&ipgre_tap_ops);
1757         rtnl_link_unregister(&ipgre_link_ops);
1758         if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1759                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1760         unregister_pernet_device(&ipgre_net_ops);
1761 }
1762
/* Module entry and exit points. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/*
 * Aliases for the two rtnl link kinds registered above and for the "gre0"
 * device name -- presumably used for on-demand module loading; confirm
 * against the macro definitions.
 */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");