Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw
[pandora-kernel.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47
48 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #endif
53
54 /*
55    Problems & solutions
56    --------------------
57
58    1. The most important issue is detecting local dead loops.
59    They would cause complete host lockup in transmit, which
60    would be "resolved" by stack overflow or, if queueing is enabled,
61    with infinite looping in net_bh.
62
63    We cannot track such dead loops during route installation,
64    it is infeasible task. The most general solutions would be
65    to keep skb->encapsulation counter (sort of local ttl),
66    and silently drop packet when it expires. It is the best
67    solution, but it supposes maintaining a new variable in ALL
68    skb, even if no tunneling is used.
69
70    Current solution: HARD_TX_LOCK lock breaks dead loops.
71
72
73
74    2. Networking dead loops would not kill routers, but would really
75    kill network. IP hop limit plays role of "t->recursion" in this case,
76    if we copy it from packet being encapsulated to upper header.
77    It is very good solution, but it introduces two problems:
78
79    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80      do not work over tunnels.
81    - traceroute does not work. I planned to relay ICMP from tunnel,
82      so that this problem would be solved and traceroute output
83      would even more informative. This idea appeared to be wrong:
84      only Linux complies to rfc1812 now (yes, guys, Linux is the only
85      true router now :-)), all routers (at least, in neighbourhood of mine)
86      return only 8 bytes of payload. It is the end.
87
88    Hence, if we want that OSPF worked or traceroute said something reasonable,
89    we should search for another solution.
90
91    One of them is to parse packet trying to detect inner encapsulation
92    made by our node. It is difficult or even impossible, especially,
93    taking into account fragmentation. To be short, it is not a solution at all.
94
95    Current solution: The solution was UNEXPECTEDLY SIMPLE.
96    We force DF flag on tunnels with preconfigured hop limit,
97    that is ALL. :-) Well, it does not remove the problem completely,
98    but exponential growth of network traffic is changed to linear
99    (branches, that exceed pmtu are pruned) and tunnel mtu
100    fastly degrades to value <68, where looping stops.
101    Yes, it is not good if there exists a router in the loop,
102    which does not force DF, even when encapsulating packets have DF set.
103    But it is not our problem! Nobody could accuse us, we made
104    all that we could make. Even if it is your gated who injected
105    fatal route to network, even if it were you who configured
106    fatal static route: you are innocent. :-)
107
108
109
110    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111    practically identical code. It would be good to glue them
112    together, but it is not very evident, how to make them modular.
113    sit is integral part of IPv6, ipip and gre are naturally modular.
114    We could extract common parts (hash table, ioctl etc)
115    to a separate module (ip_tunnel.c).
116
117    Alexey Kuznetsov.
118  */
119
120 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
121 static int ipgre_tunnel_init(struct net_device *dev);
122 static void ipgre_tunnel_setup(struct net_device *dev);
123 static int ipgre_tunnel_bind_dev(struct net_device *dev);
124
125 /* Fallback tunnel: no source, no destination, no key, no options */
126
127 #define HASH_SIZE  16
128
129 static int ipgre_net_id __read_mostly;
130 struct ipgre_net {
131         struct ip_tunnel *tunnels[4][HASH_SIZE];
132
133         struct net_device *fb_tunnel_dev;
134 };
135
136 /* Tunnel hash table */
137
138 /*
139    4 hash tables:
140
141    3: (remote,local)
142    2: (remote,*)
143    1: (*,local)
144    0: (*,*)
145
146    We require exact key match i.e. if a key is present in packet
147    it will match only tunnel with the same key; if it is not present,
148    it will match only keyless tunnel.
149
150    All keyless packets that do not match a configured keyless tunnel
151    will match the fallback tunnel.
152  */
153
/*
 * Fold a 32-bit address or key into a 4-bit bucket index (0..15).
 * Every use of the argument is parenthesized so that a compound
 * expression (e.g. "a ^ b") is cast and shifted as a whole.
 * Note: the argument is still evaluated twice.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
155
156 #define tunnels_r_l     tunnels[3]
157 #define tunnels_r       tunnels[2]
158 #define tunnels_l       tunnels[1]
159 #define tunnels_wc      tunnels[0]
160 /*
161  * Locking : hash tables are protected by RCU and a spinlock
162  */
163 static DEFINE_SPINLOCK(ipgre_lock);
164
165 #define for_each_ip_tunnel_rcu(start) \
166         for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167
168 /* Given src, dst and key, find appropriate for input tunnel. */
169
170 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
171                                               __be32 remote, __be32 local,
172                                               __be32 key, __be16 gre_proto)
173 {
174         struct net *net = dev_net(dev);
175         int link = dev->ifindex;
176         unsigned h0 = HASH(remote);
177         unsigned h1 = HASH(key);
178         struct ip_tunnel *t, *cand = NULL;
179         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
180         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
181                        ARPHRD_ETHER : ARPHRD_IPGRE;
182         int score, cand_score = 4;
183
184         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
185                 if (local != t->parms.iph.saddr ||
186                     remote != t->parms.iph.daddr ||
187                     key != t->parms.i_key ||
188                     !(t->dev->flags & IFF_UP))
189                         continue;
190
191                 if (t->dev->type != ARPHRD_IPGRE &&
192                     t->dev->type != dev_type)
193                         continue;
194
195                 score = 0;
196                 if (t->parms.link != link)
197                         score |= 1;
198                 if (t->dev->type != dev_type)
199                         score |= 2;
200                 if (score == 0)
201                         return t;
202
203                 if (score < cand_score) {
204                         cand = t;
205                         cand_score = score;
206                 }
207         }
208
209         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
210                 if (remote != t->parms.iph.daddr ||
211                     key != t->parms.i_key ||
212                     !(t->dev->flags & IFF_UP))
213                         continue;
214
215                 if (t->dev->type != ARPHRD_IPGRE &&
216                     t->dev->type != dev_type)
217                         continue;
218
219                 score = 0;
220                 if (t->parms.link != link)
221                         score |= 1;
222                 if (t->dev->type != dev_type)
223                         score |= 2;
224                 if (score == 0)
225                         return t;
226
227                 if (score < cand_score) {
228                         cand = t;
229                         cand_score = score;
230                 }
231         }
232
233         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
234                 if ((local != t->parms.iph.saddr &&
235                      (local != t->parms.iph.daddr ||
236                       !ipv4_is_multicast(local))) ||
237                     key != t->parms.i_key ||
238                     !(t->dev->flags & IFF_UP))
239                         continue;
240
241                 if (t->dev->type != ARPHRD_IPGRE &&
242                     t->dev->type != dev_type)
243                         continue;
244
245                 score = 0;
246                 if (t->parms.link != link)
247                         score |= 1;
248                 if (t->dev->type != dev_type)
249                         score |= 2;
250                 if (score == 0)
251                         return t;
252
253                 if (score < cand_score) {
254                         cand = t;
255                         cand_score = score;
256                 }
257         }
258
259         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
260                 if (t->parms.i_key != key ||
261                     !(t->dev->flags & IFF_UP))
262                         continue;
263
264                 if (t->dev->type != ARPHRD_IPGRE &&
265                     t->dev->type != dev_type)
266                         continue;
267
268                 score = 0;
269                 if (t->parms.link != link)
270                         score |= 1;
271                 if (t->dev->type != dev_type)
272                         score |= 2;
273                 if (score == 0)
274                         return t;
275
276                 if (score < cand_score) {
277                         cand = t;
278                         cand_score = score;
279                 }
280         }
281
282         if (cand != NULL)
283                 return cand;
284
285         dev = ign->fb_tunnel_dev;
286         if (dev->flags & IFF_UP)
287                 return netdev_priv(dev);
288
289         return NULL;
290 }
291
292 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
293                 struct ip_tunnel_parm *parms)
294 {
295         __be32 remote = parms->iph.daddr;
296         __be32 local = parms->iph.saddr;
297         __be32 key = parms->i_key;
298         unsigned h = HASH(key);
299         int prio = 0;
300
301         if (local)
302                 prio |= 1;
303         if (remote && !ipv4_is_multicast(remote)) {
304                 prio |= 2;
305                 h ^= HASH(remote);
306         }
307
308         return &ign->tunnels[prio][h];
309 }
310
311 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
312                 struct ip_tunnel *t)
313 {
314         return __ipgre_bucket(ign, &t->parms);
315 }
316
317 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
318 {
319         struct ip_tunnel **tp = ipgre_bucket(ign, t);
320
321         spin_lock_bh(&ipgre_lock);
322         t->next = *tp;
323         rcu_assign_pointer(*tp, t);
324         spin_unlock_bh(&ipgre_lock);
325 }
326
327 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
328 {
329         struct ip_tunnel **tp;
330
331         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
332                 if (t == *tp) {
333                         spin_lock_bh(&ipgre_lock);
334                         *tp = t->next;
335                         spin_unlock_bh(&ipgre_lock);
336                         break;
337                 }
338         }
339 }
340
341 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
342                                            struct ip_tunnel_parm *parms,
343                                            int type)
344 {
345         __be32 remote = parms->iph.daddr;
346         __be32 local = parms->iph.saddr;
347         __be32 key = parms->i_key;
348         int link = parms->link;
349         struct ip_tunnel *t, **tp;
350         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351
352         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
353                 if (local == t->parms.iph.saddr &&
354                     remote == t->parms.iph.daddr &&
355                     key == t->parms.i_key &&
356                     link == t->parms.link &&
357                     type == t->dev->type)
358                         break;
359
360         return t;
361 }
362
363 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
364                 struct ip_tunnel_parm *parms, int create)
365 {
366         struct ip_tunnel *t, *nt;
367         struct net_device *dev;
368         char name[IFNAMSIZ];
369         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
370
371         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
372         if (t || !create)
373                 return t;
374
375         if (parms->name[0])
376                 strlcpy(name, parms->name, IFNAMSIZ);
377         else
378                 sprintf(name, "gre%%d");
379
380         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
381         if (!dev)
382           return NULL;
383
384         dev_net_set(dev, net);
385
386         if (strchr(name, '%')) {
387                 if (dev_alloc_name(dev, name) < 0)
388                         goto failed_free;
389         }
390
391         nt = netdev_priv(dev);
392         nt->parms = *parms;
393         dev->rtnl_link_ops = &ipgre_link_ops;
394
395         dev->mtu = ipgre_tunnel_bind_dev(dev);
396
397         if (register_netdevice(dev) < 0)
398                 goto failed_free;
399
400         dev_hold(dev);
401         ipgre_tunnel_link(ign, nt);
402         return nt;
403
404 failed_free:
405         free_netdev(dev);
406         return NULL;
407 }
408
409 static void ipgre_tunnel_uninit(struct net_device *dev)
410 {
411         struct net *net = dev_net(dev);
412         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
413
414         ipgre_tunnel_unlink(ign, netdev_priv(dev));
415         dev_put(dev);
416 }
417
418
419 static void ipgre_err(struct sk_buff *skb, u32 info)
420 {
421
422 /* All the routers (except for Linux) return only
423    8 bytes of packet payload. It means, that precise relaying of
424    ICMP in the real Internet is absolutely infeasible.
425
426    Moreover, Cisco "wise men" put GRE key to the third word
427    in GRE header. It makes impossible maintaining even soft state for keyed
428    GRE tunnels with enabled checksum. Tell them "thank you".
429
430    Well, I wonder, rfc1812 was written by Cisco employee,
431    what the hell these idiots break standrads established
432    by themself???
433  */
434
435         struct iphdr *iph = (struct iphdr *)skb->data;
436         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
437         int grehlen = (iph->ihl<<2) + 4;
438         const int type = icmp_hdr(skb)->type;
439         const int code = icmp_hdr(skb)->code;
440         struct ip_tunnel *t;
441         __be16 flags;
442
443         flags = p[0];
444         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
445                 if (flags&(GRE_VERSION|GRE_ROUTING))
446                         return;
447                 if (flags&GRE_KEY) {
448                         grehlen += 4;
449                         if (flags&GRE_CSUM)
450                                 grehlen += 4;
451                 }
452         }
453
454         /* If only 8 bytes returned, keyed message will be dropped here */
455         if (skb_headlen(skb) < grehlen)
456                 return;
457
458         switch (type) {
459         default:
460         case ICMP_PARAMETERPROB:
461                 return;
462
463         case ICMP_DEST_UNREACH:
464                 switch (code) {
465                 case ICMP_SR_FAILED:
466                 case ICMP_PORT_UNREACH:
467                         /* Impossible event. */
468                         return;
469                 case ICMP_FRAG_NEEDED:
470                         /* Soft state for pmtu is maintained by IP core. */
471                         return;
472                 default:
473                         /* All others are translated to HOST_UNREACH.
474                            rfc2003 contains "deep thoughts" about NET_UNREACH,
475                            I believe they are just ether pollution. --ANK
476                          */
477                         break;
478                 }
479                 break;
480         case ICMP_TIME_EXCEEDED:
481                 if (code != ICMP_EXC_TTL)
482                         return;
483                 break;
484         }
485
486         rcu_read_lock();
487         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
488                                 flags & GRE_KEY ?
489                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
490                                 p[1]);
491         if (t == NULL || t->parms.iph.daddr == 0 ||
492             ipv4_is_multicast(t->parms.iph.daddr))
493                 goto out;
494
495         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
496                 goto out;
497
498         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
499                 t->err_count++;
500         else
501                 t->err_count = 1;
502         t->err_time = jiffies;
503 out:
504         rcu_read_unlock();
505 }
506
507 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
508 {
509         if (INET_ECN_is_ce(iph->tos)) {
510                 if (skb->protocol == htons(ETH_P_IP)) {
511                         IP_ECN_set_ce(ip_hdr(skb));
512                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
513                         IP6_ECN_set_ce(ipv6_hdr(skb));
514                 }
515         }
516 }
517
518 static inline u8
519 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
520 {
521         u8 inner = 0;
522         if (skb->protocol == htons(ETH_P_IP))
523                 inner = old_iph->tos;
524         else if (skb->protocol == htons(ETH_P_IPV6))
525                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
526         return INET_ECN_encapsulate(tos, inner);
527 }
528
529 static int ipgre_rcv(struct sk_buff *skb)
530 {
531         struct iphdr *iph;
532         u8     *h;
533         __be16    flags;
534         __sum16   csum = 0;
535         __be32 key = 0;
536         u32    seqno = 0;
537         struct ip_tunnel *tunnel;
538         int    offset = 4;
539         __be16 gre_proto;
540
541         if (!pskb_may_pull(skb, 16))
542                 goto drop_nolock;
543
544         iph = ip_hdr(skb);
545         h = skb->data;
546         flags = *(__be16*)h;
547
548         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
549                 /* - Version must be 0.
550                    - We do not support routing headers.
551                  */
552                 if (flags&(GRE_VERSION|GRE_ROUTING))
553                         goto drop_nolock;
554
555                 if (flags&GRE_CSUM) {
556                         switch (skb->ip_summed) {
557                         case CHECKSUM_COMPLETE:
558                                 csum = csum_fold(skb->csum);
559                                 if (!csum)
560                                         break;
561                                 /* fall through */
562                         case CHECKSUM_NONE:
563                                 skb->csum = 0;
564                                 csum = __skb_checksum_complete(skb);
565                                 skb->ip_summed = CHECKSUM_COMPLETE;
566                         }
567                         offset += 4;
568                 }
569                 if (flags&GRE_KEY) {
570                         key = *(__be32*)(h + offset);
571                         offset += 4;
572                 }
573                 if (flags&GRE_SEQ) {
574                         seqno = ntohl(*(__be32*)(h + offset));
575                         offset += 4;
576                 }
577         }
578
579         gre_proto = *(__be16 *)(h + 2);
580
581         rcu_read_lock();
582         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
583                                           iph->saddr, iph->daddr, key,
584                                           gre_proto))) {
585                 struct net_device_stats *stats = &tunnel->dev->stats;
586
587                 secpath_reset(skb);
588
589                 skb->protocol = gre_proto;
590                 /* WCCP version 1 and 2 protocol decoding.
591                  * - Change protocol to IP
592                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
593                  */
594                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
595                         skb->protocol = htons(ETH_P_IP);
596                         if ((*(h + offset) & 0xF0) != 0x40)
597                                 offset += 4;
598                 }
599
600                 skb->mac_header = skb->network_header;
601                 __pskb_pull(skb, offset);
602                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
603                 skb->pkt_type = PACKET_HOST;
604 #ifdef CONFIG_NET_IPGRE_BROADCAST
605                 if (ipv4_is_multicast(iph->daddr)) {
606                         /* Looped back packet, drop it! */
607                         if (skb_rtable(skb)->fl.iif == 0)
608                                 goto drop;
609                         stats->multicast++;
610                         skb->pkt_type = PACKET_BROADCAST;
611                 }
612 #endif
613
614                 if (((flags&GRE_CSUM) && csum) ||
615                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
616                         stats->rx_crc_errors++;
617                         stats->rx_errors++;
618                         goto drop;
619                 }
620                 if (tunnel->parms.i_flags&GRE_SEQ) {
621                         if (!(flags&GRE_SEQ) ||
622                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
623                                 stats->rx_fifo_errors++;
624                                 stats->rx_errors++;
625                                 goto drop;
626                         }
627                         tunnel->i_seqno = seqno + 1;
628                 }
629
630                 /* Warning: All skb pointers will be invalidated! */
631                 if (tunnel->dev->type == ARPHRD_ETHER) {
632                         if (!pskb_may_pull(skb, ETH_HLEN)) {
633                                 stats->rx_length_errors++;
634                                 stats->rx_errors++;
635                                 goto drop;
636                         }
637
638                         iph = ip_hdr(skb);
639                         skb->protocol = eth_type_trans(skb, tunnel->dev);
640                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
641                 }
642
643                 skb_tunnel_rx(skb, tunnel->dev);
644
645                 skb_reset_network_header(skb);
646                 ipgre_ecn_decapsulate(iph, skb);
647
648                 netif_rx(skb);
649                 rcu_read_unlock();
650                 return(0);
651         }
652         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
653
654 drop:
655         rcu_read_unlock();
656 drop_nolock:
657         kfree_skb(skb);
658         return(0);
659 }
660
661 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
662 {
663         struct ip_tunnel *tunnel = netdev_priv(dev);
664         struct net_device_stats *stats = &dev->stats;
665         struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
666         struct iphdr  *old_iph = ip_hdr(skb);
667         struct iphdr  *tiph;
668         u8     tos;
669         __be16 df;
670         struct rtable *rt;                      /* Route to the other host */
671         struct net_device *tdev;                        /* Device to other host */
672         struct iphdr  *iph;                     /* Our new IP header */
673         unsigned int max_headroom;              /* The extra header space needed */
674         int    gre_hlen;
675         __be32 dst;
676         int    mtu;
677
678         if (dev->type == ARPHRD_ETHER)
679                 IPCB(skb)->flags = 0;
680
681         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
682                 gre_hlen = 0;
683                 tiph = (struct iphdr *)skb->data;
684         } else {
685                 gre_hlen = tunnel->hlen;
686                 tiph = &tunnel->parms.iph;
687         }
688
689         if ((dst = tiph->daddr) == 0) {
690                 /* NBMA tunnel */
691
692                 if (skb_dst(skb) == NULL) {
693                         stats->tx_fifo_errors++;
694                         goto tx_error;
695                 }
696
697                 if (skb->protocol == htons(ETH_P_IP)) {
698                         rt = skb_rtable(skb);
699                         if ((dst = rt->rt_gateway) == 0)
700                                 goto tx_error_icmp;
701                 }
702 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
703                 else if (skb->protocol == htons(ETH_P_IPV6)) {
704                         struct in6_addr *addr6;
705                         int addr_type;
706                         struct neighbour *neigh = skb_dst(skb)->neighbour;
707
708                         if (neigh == NULL)
709                                 goto tx_error;
710
711                         addr6 = (struct in6_addr *)&neigh->primary_key;
712                         addr_type = ipv6_addr_type(addr6);
713
714                         if (addr_type == IPV6_ADDR_ANY) {
715                                 addr6 = &ipv6_hdr(skb)->daddr;
716                                 addr_type = ipv6_addr_type(addr6);
717                         }
718
719                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
720                                 goto tx_error_icmp;
721
722                         dst = addr6->s6_addr32[3];
723                 }
724 #endif
725                 else
726                         goto tx_error;
727         }
728
729         tos = tiph->tos;
730         if (tos == 1) {
731                 tos = 0;
732                 if (skb->protocol == htons(ETH_P_IP))
733                         tos = old_iph->tos;
734                 else if (skb->protocol == htons(ETH_P_IPV6))
735                         tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
736         }
737
738         {
739                 struct flowi fl = { .oif = tunnel->parms.link,
740                                     .nl_u = { .ip4_u =
741                                               { .daddr = dst,
742                                                 .saddr = tiph->saddr,
743                                                 .tos = RT_TOS(tos) } },
744                                     .proto = IPPROTO_GRE };
745                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
746                         stats->tx_carrier_errors++;
747                         goto tx_error;
748                 }
749         }
750         tdev = rt->dst.dev;
751
752         if (tdev == dev) {
753                 ip_rt_put(rt);
754                 stats->collisions++;
755                 goto tx_error;
756         }
757
758         df = tiph->frag_off;
759         if (df)
760                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
761         else
762                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
763
764         if (skb_dst(skb))
765                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
766
767         if (skb->protocol == htons(ETH_P_IP)) {
768                 df |= (old_iph->frag_off&htons(IP_DF));
769
770                 if ((old_iph->frag_off&htons(IP_DF)) &&
771                     mtu < ntohs(old_iph->tot_len)) {
772                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
773                         ip_rt_put(rt);
774                         goto tx_error;
775                 }
776         }
777 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
778         else if (skb->protocol == htons(ETH_P_IPV6)) {
779                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
780
781                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
782                         if ((tunnel->parms.iph.daddr &&
783                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
784                             rt6->rt6i_dst.plen == 128) {
785                                 rt6->rt6i_flags |= RTF_MODIFIED;
786                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
787                         }
788                 }
789
790                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
791                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
792                         ip_rt_put(rt);
793                         goto tx_error;
794                 }
795         }
796 #endif
797
798         if (tunnel->err_count > 0) {
799                 if (time_before(jiffies,
800                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
801                         tunnel->err_count--;
802
803                         dst_link_failure(skb);
804                 } else
805                         tunnel->err_count = 0;
806         }
807
808         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
809
810         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
811             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
812                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
813                 if (max_headroom > dev->needed_headroom)
814                         dev->needed_headroom = max_headroom;
815                 if (!new_skb) {
816                         ip_rt_put(rt);
817                         txq->tx_dropped++;
818                         dev_kfree_skb(skb);
819                         return NETDEV_TX_OK;
820                 }
821                 if (skb->sk)
822                         skb_set_owner_w(new_skb, skb->sk);
823                 dev_kfree_skb(skb);
824                 skb = new_skb;
825                 old_iph = ip_hdr(skb);
826         }
827
828         skb_reset_transport_header(skb);
829         skb_push(skb, gre_hlen);
830         skb_reset_network_header(skb);
831         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
832         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
833                               IPSKB_REROUTED);
834         skb_dst_drop(skb);
835         skb_dst_set(skb, &rt->dst);
836
837         /*
838          *      Push down and install the IPIP header.
839          */
840
841         iph                     =       ip_hdr(skb);
842         iph->version            =       4;
843         iph->ihl                =       sizeof(struct iphdr) >> 2;
844         iph->frag_off           =       df;
845         iph->protocol           =       IPPROTO_GRE;
846         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
847         iph->daddr              =       rt->rt_dst;
848         iph->saddr              =       rt->rt_src;
849
850         if ((iph->ttl = tiph->ttl) == 0) {
851                 if (skb->protocol == htons(ETH_P_IP))
852                         iph->ttl = old_iph->ttl;
853 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
854                 else if (skb->protocol == htons(ETH_P_IPV6))
855                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
856 #endif
857                 else
858                         iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
859         }
860
861         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
862         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
863                                    htons(ETH_P_TEB) : skb->protocol;
864
865         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
866                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
867
868                 if (tunnel->parms.o_flags&GRE_SEQ) {
869                         ++tunnel->o_seqno;
870                         *ptr = htonl(tunnel->o_seqno);
871                         ptr--;
872                 }
873                 if (tunnel->parms.o_flags&GRE_KEY) {
874                         *ptr = tunnel->parms.o_key;
875                         ptr--;
876                 }
877                 if (tunnel->parms.o_flags&GRE_CSUM) {
878                         *ptr = 0;
879                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
880                 }
881         }
882
883         nf_reset(skb);
884
885         IPTUNNEL_XMIT();
886         return NETDEV_TX_OK;
887
888 tx_error_icmp:
889         dst_link_failure(skb);
890
891 tx_error:
892         stats->tx_errors++;
893         dev_kfree_skb(skb);
894         return NETDEV_TX_OK;
895 }
896
897 static int ipgre_tunnel_bind_dev(struct net_device *dev)
898 {
899         struct net_device *tdev = NULL;
900         struct ip_tunnel *tunnel;
901         struct iphdr *iph;
902         int hlen = LL_MAX_HEADER;
903         int mtu = ETH_DATA_LEN;
904         int addend = sizeof(struct iphdr) + 4;
905
906         tunnel = netdev_priv(dev);
907         iph = &tunnel->parms.iph;
908
909         /* Guess output device to choose reasonable mtu and needed_headroom */
910
911         if (iph->daddr) {
912                 struct flowi fl = { .oif = tunnel->parms.link,
913                                     .nl_u = { .ip4_u =
914                                               { .daddr = iph->daddr,
915                                                 .saddr = iph->saddr,
916                                                 .tos = RT_TOS(iph->tos) } },
917                                     .proto = IPPROTO_GRE };
918                 struct rtable *rt;
919                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
920                         tdev = rt->dst.dev;
921                         ip_rt_put(rt);
922                 }
923
924                 if (dev->type != ARPHRD_ETHER)
925                         dev->flags |= IFF_POINTOPOINT;
926         }
927
928         if (!tdev && tunnel->parms.link)
929                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
930
931         if (tdev) {
932                 hlen = tdev->hard_header_len + tdev->needed_headroom;
933                 mtu = tdev->mtu;
934         }
935         dev->iflink = tunnel->parms.link;
936
937         /* Precalculate GRE options length */
938         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
939                 if (tunnel->parms.o_flags&GRE_CSUM)
940                         addend += 4;
941                 if (tunnel->parms.o_flags&GRE_KEY)
942                         addend += 4;
943                 if (tunnel->parms.o_flags&GRE_SEQ)
944                         addend += 4;
945         }
946         dev->needed_headroom = addend + hlen;
947         mtu -= dev->hard_header_len + addend;
948
949         if (mtu < 68)
950                 mtu = 68;
951
952         tunnel->hlen = addend;
953
954         return mtu;
955 }
956
957 static int
958 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
959 {
960         int err = 0;
961         struct ip_tunnel_parm p;
962         struct ip_tunnel *t;
963         struct net *net = dev_net(dev);
964         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
965
966         switch (cmd) {
967         case SIOCGETTUNNEL:
968                 t = NULL;
969                 if (dev == ign->fb_tunnel_dev) {
970                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
971                                 err = -EFAULT;
972                                 break;
973                         }
974                         t = ipgre_tunnel_locate(net, &p, 0);
975                 }
976                 if (t == NULL)
977                         t = netdev_priv(dev);
978                 memcpy(&p, &t->parms, sizeof(p));
979                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
980                         err = -EFAULT;
981                 break;
982
983         case SIOCADDTUNNEL:
984         case SIOCCHGTUNNEL:
985                 err = -EPERM;
986                 if (!capable(CAP_NET_ADMIN))
987                         goto done;
988
989                 err = -EFAULT;
990                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
991                         goto done;
992
993                 err = -EINVAL;
994                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
995                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
996                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
997                         goto done;
998                 if (p.iph.ttl)
999                         p.iph.frag_off |= htons(IP_DF);
1000
1001                 if (!(p.i_flags&GRE_KEY))
1002                         p.i_key = 0;
1003                 if (!(p.o_flags&GRE_KEY))
1004                         p.o_key = 0;
1005
1006                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1007
1008                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1009                         if (t != NULL) {
1010                                 if (t->dev != dev) {
1011                                         err = -EEXIST;
1012                                         break;
1013                                 }
1014                         } else {
1015                                 unsigned nflags = 0;
1016
1017                                 t = netdev_priv(dev);
1018
1019                                 if (ipv4_is_multicast(p.iph.daddr))
1020                                         nflags = IFF_BROADCAST;
1021                                 else if (p.iph.daddr)
1022                                         nflags = IFF_POINTOPOINT;
1023
1024                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1025                                         err = -EINVAL;
1026                                         break;
1027                                 }
1028                                 ipgre_tunnel_unlink(ign, t);
1029                                 t->parms.iph.saddr = p.iph.saddr;
1030                                 t->parms.iph.daddr = p.iph.daddr;
1031                                 t->parms.i_key = p.i_key;
1032                                 t->parms.o_key = p.o_key;
1033                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1034                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1035                                 ipgre_tunnel_link(ign, t);
1036                                 netdev_state_change(dev);
1037                         }
1038                 }
1039
1040                 if (t) {
1041                         err = 0;
1042                         if (cmd == SIOCCHGTUNNEL) {
1043                                 t->parms.iph.ttl = p.iph.ttl;
1044                                 t->parms.iph.tos = p.iph.tos;
1045                                 t->parms.iph.frag_off = p.iph.frag_off;
1046                                 if (t->parms.link != p.link) {
1047                                         t->parms.link = p.link;
1048                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1049                                         netdev_state_change(dev);
1050                                 }
1051                         }
1052                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1053                                 err = -EFAULT;
1054                 } else
1055                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1056                 break;
1057
1058         case SIOCDELTUNNEL:
1059                 err = -EPERM;
1060                 if (!capable(CAP_NET_ADMIN))
1061                         goto done;
1062
1063                 if (dev == ign->fb_tunnel_dev) {
1064                         err = -EFAULT;
1065                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1066                                 goto done;
1067                         err = -ENOENT;
1068                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1069                                 goto done;
1070                         err = -EPERM;
1071                         if (t == netdev_priv(ign->fb_tunnel_dev))
1072                                 goto done;
1073                         dev = t->dev;
1074                 }
1075                 unregister_netdevice(dev);
1076                 err = 0;
1077                 break;
1078
1079         default:
1080                 err = -EINVAL;
1081         }
1082
1083 done:
1084         return err;
1085 }
1086
1087 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1088 {
1089         struct ip_tunnel *tunnel = netdev_priv(dev);
1090         if (new_mtu < 68 ||
1091             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1092                 return -EINVAL;
1093         dev->mtu = new_mtu;
1094         return 0;
1095 }
1096
/* Nice toy. Unfortunately, useless in real life :-)
   It allows constructing a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */
1125
1126 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1127                         unsigned short type,
1128                         const void *daddr, const void *saddr, unsigned len)
1129 {
1130         struct ip_tunnel *t = netdev_priv(dev);
1131         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1132         __be16 *p = (__be16*)(iph+1);
1133
1134         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1135         p[0]            = t->parms.o_flags;
1136         p[1]            = htons(type);
1137
1138         /*
1139          *      Set the source hardware address.
1140          */
1141
1142         if (saddr)
1143                 memcpy(&iph->saddr, saddr, 4);
1144         if (daddr)
1145                 memcpy(&iph->daddr, daddr, 4);
1146         if (iph->daddr)
1147                 return t->hlen;
1148
1149         return -t->hlen;
1150 }
1151
1152 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1153 {
1154         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1155         memcpy(haddr, &iph->saddr, 4);
1156         return 4;
1157 }
1158
1159 static const struct header_ops ipgre_header_ops = {
1160         .create = ipgre_header,
1161         .parse  = ipgre_header_parse,
1162 };
1163
1164 #ifdef CONFIG_NET_IPGRE_BROADCAST
1165 static int ipgre_open(struct net_device *dev)
1166 {
1167         struct ip_tunnel *t = netdev_priv(dev);
1168
1169         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1170                 struct flowi fl = { .oif = t->parms.link,
1171                                     .nl_u = { .ip4_u =
1172                                               { .daddr = t->parms.iph.daddr,
1173                                                 .saddr = t->parms.iph.saddr,
1174                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1175                                     .proto = IPPROTO_GRE };
1176                 struct rtable *rt;
1177                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1178                         return -EADDRNOTAVAIL;
1179                 dev = rt->dst.dev;
1180                 ip_rt_put(rt);
1181                 if (__in_dev_get_rtnl(dev) == NULL)
1182                         return -EADDRNOTAVAIL;
1183                 t->mlink = dev->ifindex;
1184                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1185         }
1186         return 0;
1187 }
1188
1189 static int ipgre_close(struct net_device *dev)
1190 {
1191         struct ip_tunnel *t = netdev_priv(dev);
1192
1193         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1194                 struct in_device *in_dev;
1195                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1196                 if (in_dev) {
1197                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1198                         in_dev_put(in_dev);
1199                 }
1200         }
1201         return 0;
1202 }
1203
1204 #endif
1205
1206 static const struct net_device_ops ipgre_netdev_ops = {
1207         .ndo_init               = ipgre_tunnel_init,
1208         .ndo_uninit             = ipgre_tunnel_uninit,
1209 #ifdef CONFIG_NET_IPGRE_BROADCAST
1210         .ndo_open               = ipgre_open,
1211         .ndo_stop               = ipgre_close,
1212 #endif
1213         .ndo_start_xmit         = ipgre_tunnel_xmit,
1214         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1215         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1216 };
1217
1218 static void ipgre_tunnel_setup(struct net_device *dev)
1219 {
1220         dev->netdev_ops         = &ipgre_netdev_ops;
1221         dev->destructor         = free_netdev;
1222
1223         dev->type               = ARPHRD_IPGRE;
1224         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1225         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1226         dev->flags              = IFF_NOARP;
1227         dev->iflink             = 0;
1228         dev->addr_len           = 4;
1229         dev->features           |= NETIF_F_NETNS_LOCAL;
1230         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1231 }
1232
1233 static int ipgre_tunnel_init(struct net_device *dev)
1234 {
1235         struct ip_tunnel *tunnel;
1236         struct iphdr *iph;
1237
1238         tunnel = netdev_priv(dev);
1239         iph = &tunnel->parms.iph;
1240
1241         tunnel->dev = dev;
1242         strcpy(tunnel->parms.name, dev->name);
1243
1244         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1245         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1246
1247         if (iph->daddr) {
1248 #ifdef CONFIG_NET_IPGRE_BROADCAST
1249                 if (ipv4_is_multicast(iph->daddr)) {
1250                         if (!iph->saddr)
1251                                 return -EINVAL;
1252                         dev->flags = IFF_BROADCAST;
1253                         dev->header_ops = &ipgre_header_ops;
1254                 }
1255 #endif
1256         } else
1257                 dev->header_ops = &ipgre_header_ops;
1258
1259         return 0;
1260 }
1261
1262 static void ipgre_fb_tunnel_init(struct net_device *dev)
1263 {
1264         struct ip_tunnel *tunnel = netdev_priv(dev);
1265         struct iphdr *iph = &tunnel->parms.iph;
1266         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1267
1268         tunnel->dev = dev;
1269         strcpy(tunnel->parms.name, dev->name);
1270
1271         iph->version            = 4;
1272         iph->protocol           = IPPROTO_GRE;
1273         iph->ihl                = 5;
1274         tunnel->hlen            = sizeof(struct iphdr) + 4;
1275
1276         dev_hold(dev);
1277         ign->tunnels_wc[0]      = tunnel;
1278 }
1279
1280
1281 static const struct net_protocol ipgre_protocol = {
1282         .handler        =       ipgre_rcv,
1283         .err_handler    =       ipgre_err,
1284         .netns_ok       =       1,
1285 };
1286
1287 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1288 {
1289         int prio;
1290
1291         for (prio = 0; prio < 4; prio++) {
1292                 int h;
1293                 for (h = 0; h < HASH_SIZE; h++) {
1294                         struct ip_tunnel *t = ign->tunnels[prio][h];
1295
1296                         while (t != NULL) {
1297                                 unregister_netdevice_queue(t->dev, head);
1298                                 t = t->next;
1299                         }
1300                 }
1301         }
1302 }
1303
1304 static int __net_init ipgre_init_net(struct net *net)
1305 {
1306         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1307         int err;
1308
1309         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1310                                            ipgre_tunnel_setup);
1311         if (!ign->fb_tunnel_dev) {
1312                 err = -ENOMEM;
1313                 goto err_alloc_dev;
1314         }
1315         dev_net_set(ign->fb_tunnel_dev, net);
1316
1317         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1318         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1319
1320         if ((err = register_netdev(ign->fb_tunnel_dev)))
1321                 goto err_reg_dev;
1322
1323         return 0;
1324
1325 err_reg_dev:
1326         free_netdev(ign->fb_tunnel_dev);
1327 err_alloc_dev:
1328         return err;
1329 }
1330
1331 static void __net_exit ipgre_exit_net(struct net *net)
1332 {
1333         struct ipgre_net *ign;
1334         LIST_HEAD(list);
1335
1336         ign = net_generic(net, ipgre_net_id);
1337         rtnl_lock();
1338         ipgre_destroy_tunnels(ign, &list);
1339         unregister_netdevice_many(&list);
1340         rtnl_unlock();
1341 }
1342
1343 static struct pernet_operations ipgre_net_ops = {
1344         .init = ipgre_init_net,
1345         .exit = ipgre_exit_net,
1346         .id   = &ipgre_net_id,
1347         .size = sizeof(struct ipgre_net),
1348 };
1349
1350 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1351 {
1352         __be16 flags;
1353
1354         if (!data)
1355                 return 0;
1356
1357         flags = 0;
1358         if (data[IFLA_GRE_IFLAGS])
1359                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1360         if (data[IFLA_GRE_OFLAGS])
1361                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1362         if (flags & (GRE_VERSION|GRE_ROUTING))
1363                 return -EINVAL;
1364
1365         return 0;
1366 }
1367
1368 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1369 {
1370         __be32 daddr;
1371
1372         if (tb[IFLA_ADDRESS]) {
1373                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1374                         return -EINVAL;
1375                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1376                         return -EADDRNOTAVAIL;
1377         }
1378
1379         if (!data)
1380                 goto out;
1381
1382         if (data[IFLA_GRE_REMOTE]) {
1383                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1384                 if (!daddr)
1385                         return -EINVAL;
1386         }
1387
1388 out:
1389         return ipgre_tunnel_validate(tb, data);
1390 }
1391
1392 static void ipgre_netlink_parms(struct nlattr *data[],
1393                                 struct ip_tunnel_parm *parms)
1394 {
1395         memset(parms, 0, sizeof(*parms));
1396
1397         parms->iph.protocol = IPPROTO_GRE;
1398
1399         if (!data)
1400                 return;
1401
1402         if (data[IFLA_GRE_LINK])
1403                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1404
1405         if (data[IFLA_GRE_IFLAGS])
1406                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1407
1408         if (data[IFLA_GRE_OFLAGS])
1409                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1410
1411         if (data[IFLA_GRE_IKEY])
1412                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1413
1414         if (data[IFLA_GRE_OKEY])
1415                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1416
1417         if (data[IFLA_GRE_LOCAL])
1418                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1419
1420         if (data[IFLA_GRE_REMOTE])
1421                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1422
1423         if (data[IFLA_GRE_TTL])
1424                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1425
1426         if (data[IFLA_GRE_TOS])
1427                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1428
1429         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1430                 parms->iph.frag_off = htons(IP_DF);
1431 }
1432
1433 static int ipgre_tap_init(struct net_device *dev)
1434 {
1435         struct ip_tunnel *tunnel;
1436
1437         tunnel = netdev_priv(dev);
1438
1439         tunnel->dev = dev;
1440         strcpy(tunnel->parms.name, dev->name);
1441
1442         ipgre_tunnel_bind_dev(dev);
1443
1444         return 0;
1445 }
1446
1447 static const struct net_device_ops ipgre_tap_netdev_ops = {
1448         .ndo_init               = ipgre_tap_init,
1449         .ndo_uninit             = ipgre_tunnel_uninit,
1450         .ndo_start_xmit         = ipgre_tunnel_xmit,
1451         .ndo_set_mac_address    = eth_mac_addr,
1452         .ndo_validate_addr      = eth_validate_addr,
1453         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1454 };
1455
1456 static void ipgre_tap_setup(struct net_device *dev)
1457 {
1458
1459         ether_setup(dev);
1460
1461         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1462         dev->destructor         = free_netdev;
1463
1464         dev->iflink             = 0;
1465         dev->features           |= NETIF_F_NETNS_LOCAL;
1466 }
1467
1468 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1469                          struct nlattr *data[])
1470 {
1471         struct ip_tunnel *nt;
1472         struct net *net = dev_net(dev);
1473         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1474         int mtu;
1475         int err;
1476
1477         nt = netdev_priv(dev);
1478         ipgre_netlink_parms(data, &nt->parms);
1479
1480         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1481                 return -EEXIST;
1482
1483         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1484                 random_ether_addr(dev->dev_addr);
1485
1486         mtu = ipgre_tunnel_bind_dev(dev);
1487         if (!tb[IFLA_MTU])
1488                 dev->mtu = mtu;
1489
1490         err = register_netdevice(dev);
1491         if (err)
1492                 goto out;
1493
1494         dev_hold(dev);
1495         ipgre_tunnel_link(ign, nt);
1496
1497 out:
1498         return err;
1499 }
1500
1501 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1502                             struct nlattr *data[])
1503 {
1504         struct ip_tunnel *t, *nt;
1505         struct net *net = dev_net(dev);
1506         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1507         struct ip_tunnel_parm p;
1508         int mtu;
1509
1510         if (dev == ign->fb_tunnel_dev)
1511                 return -EINVAL;
1512
1513         nt = netdev_priv(dev);
1514         ipgre_netlink_parms(data, &p);
1515
1516         t = ipgre_tunnel_locate(net, &p, 0);
1517
1518         if (t) {
1519                 if (t->dev != dev)
1520                         return -EEXIST;
1521         } else {
1522                 t = nt;
1523
1524                 if (dev->type != ARPHRD_ETHER) {
1525                         unsigned nflags = 0;
1526
1527                         if (ipv4_is_multicast(p.iph.daddr))
1528                                 nflags = IFF_BROADCAST;
1529                         else if (p.iph.daddr)
1530                                 nflags = IFF_POINTOPOINT;
1531
1532                         if ((dev->flags ^ nflags) &
1533                             (IFF_POINTOPOINT | IFF_BROADCAST))
1534                                 return -EINVAL;
1535                 }
1536
1537                 ipgre_tunnel_unlink(ign, t);
1538                 t->parms.iph.saddr = p.iph.saddr;
1539                 t->parms.iph.daddr = p.iph.daddr;
1540                 t->parms.i_key = p.i_key;
1541                 if (dev->type != ARPHRD_ETHER) {
1542                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1543                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1544                 }
1545                 ipgre_tunnel_link(ign, t);
1546                 netdev_state_change(dev);
1547         }
1548
1549         t->parms.o_key = p.o_key;
1550         t->parms.iph.ttl = p.iph.ttl;
1551         t->parms.iph.tos = p.iph.tos;
1552         t->parms.iph.frag_off = p.iph.frag_off;
1553
1554         if (t->parms.link != p.link) {
1555                 t->parms.link = p.link;
1556                 mtu = ipgre_tunnel_bind_dev(dev);
1557                 if (!tb[IFLA_MTU])
1558                         dev->mtu = mtu;
1559                 netdev_state_change(dev);
1560         }
1561
1562         return 0;
1563 }
1564
1565 static size_t ipgre_get_size(const struct net_device *dev)
1566 {
1567         return
1568                 /* IFLA_GRE_LINK */
1569                 nla_total_size(4) +
1570                 /* IFLA_GRE_IFLAGS */
1571                 nla_total_size(2) +
1572                 /* IFLA_GRE_OFLAGS */
1573                 nla_total_size(2) +
1574                 /* IFLA_GRE_IKEY */
1575                 nla_total_size(4) +
1576                 /* IFLA_GRE_OKEY */
1577                 nla_total_size(4) +
1578                 /* IFLA_GRE_LOCAL */
1579                 nla_total_size(4) +
1580                 /* IFLA_GRE_REMOTE */
1581                 nla_total_size(4) +
1582                 /* IFLA_GRE_TTL */
1583                 nla_total_size(1) +
1584                 /* IFLA_GRE_TOS */
1585                 nla_total_size(1) +
1586                 /* IFLA_GRE_PMTUDISC */
1587                 nla_total_size(1) +
1588                 0;
1589 }
1590
1591 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1592 {
1593         struct ip_tunnel *t = netdev_priv(dev);
1594         struct ip_tunnel_parm *p = &t->parms;
1595
1596         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1597         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1598         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1599         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1600         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1601         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1602         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1603         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1604         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1605         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1606
1607         return 0;
1608
1609 nla_put_failure:
1610         return -EMSGSIZE;
1611 }
1612
1613 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1614         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1615         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1616         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1617         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1618         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1619         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1620         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1621         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1622         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1623         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1624 };
1625
1626 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1627         .kind           = "gre",
1628         .maxtype        = IFLA_GRE_MAX,
1629         .policy         = ipgre_policy,
1630         .priv_size      = sizeof(struct ip_tunnel),
1631         .setup          = ipgre_tunnel_setup,
1632         .validate       = ipgre_tunnel_validate,
1633         .newlink        = ipgre_newlink,
1634         .changelink     = ipgre_changelink,
1635         .get_size       = ipgre_get_size,
1636         .fill_info      = ipgre_fill_info,
1637 };
1638
1639 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1640         .kind           = "gretap",
1641         .maxtype        = IFLA_GRE_MAX,
1642         .policy         = ipgre_policy,
1643         .priv_size      = sizeof(struct ip_tunnel),
1644         .setup          = ipgre_tap_setup,
1645         .validate       = ipgre_tap_validate,
1646         .newlink        = ipgre_newlink,
1647         .changelink     = ipgre_changelink,
1648         .get_size       = ipgre_get_size,
1649         .fill_info      = ipgre_fill_info,
1650 };
1651
1652 /*
1653  *      And now the modules code and kernel interface.
1654  */
1655
1656 static int __init ipgre_init(void)
1657 {
1658         int err;
1659
1660         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1661
1662         err = register_pernet_device(&ipgre_net_ops);
1663         if (err < 0)
1664                 return err;
1665
1666         err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
1667         if (err < 0) {
1668                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1669                 goto add_proto_failed;
1670         }
1671
1672         err = rtnl_link_register(&ipgre_link_ops);
1673         if (err < 0)
1674                 goto rtnl_link_failed;
1675
1676         err = rtnl_link_register(&ipgre_tap_ops);
1677         if (err < 0)
1678                 goto tap_ops_failed;
1679
1680 out:
1681         return err;
1682
1683 tap_ops_failed:
1684         rtnl_link_unregister(&ipgre_link_ops);
1685 rtnl_link_failed:
1686         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1687 add_proto_failed:
1688         unregister_pernet_device(&ipgre_net_ops);
1689         goto out;
1690 }
1691
1692 static void __exit ipgre_fini(void)
1693 {
1694         rtnl_link_unregister(&ipgre_tap_ops);
1695         rtnl_link_unregister(&ipgre_link_ops);
1696         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1697                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1698         unregister_pernet_device(&ipgre_net_ops);
1699 }
1700
1701 module_init(ipgre_init);
1702 module_exit(ipgre_fini);
1703 MODULE_LICENSE("GPL");
1704 MODULE_ALIAS_RTNL_LINK("gre");
1705 MODULE_ALIAS_RTNL_LINK("gretap");