Merge branch 'v4l_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab...
[pandora-kernel.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56
57 /*
58    Problems & solutions
59    --------------------
60
61    1. The most important issue is detecting local dead loops.
62    They would cause complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    with infinite looping in net_bh.
65
66    We cannot track such dead loops during route installation,
   it is an infeasible task. The most general solution would be
68    to keep skb->encapsulation counter (sort of local ttl),
69    and silently drop packet when it expires. It is a good
70    solution, but it supposes maintaining new variable in ALL
71    skb, even if no tunneling is used.
72
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, since when we enter the first ndo_xmit(), cpu migration is
75    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
76
77    2. Networking dead loops would not kill routers, but would really
78    kill network. IP hop limit plays role of "t->recursion" in this case,
79    if we copy it from packet being encapsulated to upper header.
80    It is very good solution, but it introduces two problems:
81
82    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from tunnel,
85      so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
87      only Linux complies to rfc1812 now (yes, guys, Linux is the only
88      true router now :-)), all routers (at least, in neighbourhood of mine)
89      return only 8 bytes of payload. It is the end.
90
91    Hence, if we want that OSPF worked or traceroute said something reasonable,
92    we should search for another solution.
93
94    One of them is to parse packet trying to detect inner encapsulation
95    made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, ttl is not a solution at all.
97
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force DF flag on tunnels with preconfigured hop limit,
100    that is ALL. :-) Well, it does not remove the problem completely,
101    but exponential growth of network traffic is changed to linear
102    (branches, that exceed pmtu are pruned) and tunnel mtu
103    rapidly degrades to value <68, where looping stops.
104    Yes, it is not good if there exists a router in the loop,
105    which does not force DF, even when encapsulating packets have DF set.
106    But it is not our problem! Nobody could accuse us, we made
107    all that we could make. Even if it is your gated who injected
108    fatal route to network, even if it were you who configured
109    fatal static route: you are innocent. :-)
110
111
112
113    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114    practically identical code. It would be good to glue them
115    together, but it is not very evident, how to make them modular.
116    sit is integral part of IPv6, ipip and gre are naturally modular.
117    We could extract common parts (hash table, ioctl etc)
118    to a separate module (ip_tunnel.c).
119
120    Alexey Kuznetsov.
121  */
122
/*
 * Forward declarations for objects defined later in the file.
 * In the code visible here they are all consumed by ipgre_tunnel_locate():
 * ipgre_link_ops is installed as dev->rtnl_link_ops, ipgre_tunnel_setup is
 * handed to alloc_netdev(), and the value returned by
 * ipgre_tunnel_bind_dev() becomes dev->mtu.
 */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);
127
/* Fallback tunnel: no source, no destination, no key, no options */

/* Number of buckets in each tunnel hash table. */
#define HASH_SIZE  16

/* Slot id handed to net_generic() to obtain this file's struct ipgre_net. */
static int ipgre_net_id __read_mostly;

/* GRE state fetched per struct net through net_generic(net, ipgre_net_id). */
struct ipgre_net {
        /* Four hash tables of HASH_SIZE RCU-protected chains each. */
        struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

        /*
         * Fallback device: ipgre_tunnel_lookup() returns its private
         * tunnel when no configured tunnel matches and the device is up.
         */
        struct net_device *fb_tunnel_dev;
};
138
139 /* Tunnel hash table */
140
141 /*
142    4 hash tables:
143
144    3: (remote,local)
145    2: (remote,*)
146    1: (*,local)
147    0: (*,*)
148
149    We require exact key match i.e. if a key is present in packet
150    it will match only tunnel with the same key; if it is not present,
151    it will match only keyless tunnel.
152
   All keyless packets, if not matched by configured keyless tunnels,
   will match the fallback tunnel.
155  */
156
/*
 * Fold a 32-bit address or key into a bucket index in [0, HASH_SIZE).
 *
 * The argument is fully parenthesised so that a compound expression is
 * hashed as a whole instead of being torn apart by operator precedence.
 * It is expanded twice, so it must be free of side effects.
 * The mask is derived from HASH_SIZE, which therefore must be a power of
 * two; XOR-ing two HASH() results still yields a valid bucket index.
 */
#define HASH(addr) \
        ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))

/* Named views of the four tables; the index is the lookup priority. */
#define tunnels_r_l     tunnels[3]      /* (remote,local) */
#define tunnels_r       tunnels[2]      /* (remote,*)     */
#define tunnels_l       tunnels[1]      /* (*,local)      */
#define tunnels_wc      tunnels[0]      /* (*,*)          */
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/*
 * Walk one hash chain with rcu_dereference().
 * The iterator is NOT a macro parameter: the caller must have a
 * 'struct ip_tunnel *t' in scope, which the macro assigns on every step.
 */
#define for_each_ip_tunnel_rcu(start) \
        for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
169
/* often modified stats are per cpu, other are shared (netdev->stats) */
/*
 * One instance per CPU, reached through dev->tstats.  ipgre_get_stats()
 * sums all instances into dev->stats; ipgre_rcv() bumps the rx counters of
 * the current CPU's instance.
 * The alignment equals the size of the four counters.
 */
struct pcpu_tstats {
        unsigned long   rx_packets;
        unsigned long   rx_bytes;
        unsigned long   tx_packets;
        unsigned long   tx_bytes;
} __attribute__((aligned(4*sizeof(unsigned long))));
177
178 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
179 {
180         struct pcpu_tstats sum = { 0 };
181         int i;
182
183         for_each_possible_cpu(i) {
184                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
185
186                 sum.rx_packets += tstats->rx_packets;
187                 sum.rx_bytes   += tstats->rx_bytes;
188                 sum.tx_packets += tstats->tx_packets;
189                 sum.tx_bytes   += tstats->tx_bytes;
190         }
191         dev->stats.rx_packets = sum.rx_packets;
192         dev->stats.rx_bytes   = sum.rx_bytes;
193         dev->stats.tx_packets = sum.tx_packets;
194         dev->stats.tx_bytes   = sum.tx_bytes;
195         return &dev->stats;
196 }
197
198 /* Given src, dst and key, find appropriate for input tunnel. */
199
200 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
201                                               __be32 remote, __be32 local,
202                                               __be32 key, __be16 gre_proto)
203 {
204         struct net *net = dev_net(dev);
205         int link = dev->ifindex;
206         unsigned int h0 = HASH(remote);
207         unsigned int h1 = HASH(key);
208         struct ip_tunnel *t, *cand = NULL;
209         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
210         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
211                        ARPHRD_ETHER : ARPHRD_IPGRE;
212         int score, cand_score = 4;
213
214         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
215                 if (local != t->parms.iph.saddr ||
216                     remote != t->parms.iph.daddr ||
217                     key != t->parms.i_key ||
218                     !(t->dev->flags & IFF_UP))
219                         continue;
220
221                 if (t->dev->type != ARPHRD_IPGRE &&
222                     t->dev->type != dev_type)
223                         continue;
224
225                 score = 0;
226                 if (t->parms.link != link)
227                         score |= 1;
228                 if (t->dev->type != dev_type)
229                         score |= 2;
230                 if (score == 0)
231                         return t;
232
233                 if (score < cand_score) {
234                         cand = t;
235                         cand_score = score;
236                 }
237         }
238
239         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
240                 if (remote != t->parms.iph.daddr ||
241                     key != t->parms.i_key ||
242                     !(t->dev->flags & IFF_UP))
243                         continue;
244
245                 if (t->dev->type != ARPHRD_IPGRE &&
246                     t->dev->type != dev_type)
247                         continue;
248
249                 score = 0;
250                 if (t->parms.link != link)
251                         score |= 1;
252                 if (t->dev->type != dev_type)
253                         score |= 2;
254                 if (score == 0)
255                         return t;
256
257                 if (score < cand_score) {
258                         cand = t;
259                         cand_score = score;
260                 }
261         }
262
263         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
264                 if ((local != t->parms.iph.saddr &&
265                      (local != t->parms.iph.daddr ||
266                       !ipv4_is_multicast(local))) ||
267                     key != t->parms.i_key ||
268                     !(t->dev->flags & IFF_UP))
269                         continue;
270
271                 if (t->dev->type != ARPHRD_IPGRE &&
272                     t->dev->type != dev_type)
273                         continue;
274
275                 score = 0;
276                 if (t->parms.link != link)
277                         score |= 1;
278                 if (t->dev->type != dev_type)
279                         score |= 2;
280                 if (score == 0)
281                         return t;
282
283                 if (score < cand_score) {
284                         cand = t;
285                         cand_score = score;
286                 }
287         }
288
289         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
290                 if (t->parms.i_key != key ||
291                     !(t->dev->flags & IFF_UP))
292                         continue;
293
294                 if (t->dev->type != ARPHRD_IPGRE &&
295                     t->dev->type != dev_type)
296                         continue;
297
298                 score = 0;
299                 if (t->parms.link != link)
300                         score |= 1;
301                 if (t->dev->type != dev_type)
302                         score |= 2;
303                 if (score == 0)
304                         return t;
305
306                 if (score < cand_score) {
307                         cand = t;
308                         cand_score = score;
309                 }
310         }
311
312         if (cand != NULL)
313                 return cand;
314
315         dev = ign->fb_tunnel_dev;
316         if (dev->flags & IFF_UP)
317                 return netdev_priv(dev);
318
319         return NULL;
320 }
321
322 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
323                 struct ip_tunnel_parm *parms)
324 {
325         __be32 remote = parms->iph.daddr;
326         __be32 local = parms->iph.saddr;
327         __be32 key = parms->i_key;
328         unsigned int h = HASH(key);
329         int prio = 0;
330
331         if (local)
332                 prio |= 1;
333         if (remote && !ipv4_is_multicast(remote)) {
334                 prio |= 2;
335                 h ^= HASH(remote);
336         }
337
338         return &ign->tunnels[prio][h];
339 }
340
341 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
342                 struct ip_tunnel *t)
343 {
344         return __ipgre_bucket(ign, &t->parms);
345 }
346
347 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
348 {
349         struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
350
351         rcu_assign_pointer(t->next, rtnl_dereference(*tp));
352         rcu_assign_pointer(*tp, t);
353 }
354
355 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
356 {
357         struct ip_tunnel __rcu **tp;
358         struct ip_tunnel *iter;
359
360         for (tp = ipgre_bucket(ign, t);
361              (iter = rtnl_dereference(*tp)) != NULL;
362              tp = &iter->next) {
363                 if (t == iter) {
364                         rcu_assign_pointer(*tp, t->next);
365                         break;
366                 }
367         }
368 }
369
370 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
371                                            struct ip_tunnel_parm *parms,
372                                            int type)
373 {
374         __be32 remote = parms->iph.daddr;
375         __be32 local = parms->iph.saddr;
376         __be32 key = parms->i_key;
377         int link = parms->link;
378         struct ip_tunnel *t;
379         struct ip_tunnel __rcu **tp;
380         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
381
382         for (tp = __ipgre_bucket(ign, parms);
383              (t = rtnl_dereference(*tp)) != NULL;
384              tp = &t->next)
385                 if (local == t->parms.iph.saddr &&
386                     remote == t->parms.iph.daddr &&
387                     key == t->parms.i_key &&
388                     link == t->parms.link &&
389                     type == t->dev->type)
390                         break;
391
392         return t;
393 }
394
395 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
396                 struct ip_tunnel_parm *parms, int create)
397 {
398         struct ip_tunnel *t, *nt;
399         struct net_device *dev;
400         char name[IFNAMSIZ];
401         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
402
403         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
404         if (t || !create)
405                 return t;
406
407         if (parms->name[0])
408                 strlcpy(name, parms->name, IFNAMSIZ);
409         else
410                 strcpy(name, "gre%d");
411
412         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
413         if (!dev)
414                 return NULL;
415
416         dev_net_set(dev, net);
417
418         nt = netdev_priv(dev);
419         nt->parms = *parms;
420         dev->rtnl_link_ops = &ipgre_link_ops;
421
422         dev->mtu = ipgre_tunnel_bind_dev(dev);
423
424         if (register_netdevice(dev) < 0)
425                 goto failed_free;
426
427         /* Can use a lockless transmit, unless we generate output sequences */
428         if (!(nt->parms.o_flags & GRE_SEQ))
429                 dev->features |= NETIF_F_LLTX;
430
431         dev_hold(dev);
432         ipgre_tunnel_link(ign, nt);
433         return nt;
434
435 failed_free:
436         free_netdev(dev);
437         return NULL;
438 }
439
440 static void ipgre_tunnel_uninit(struct net_device *dev)
441 {
442         struct net *net = dev_net(dev);
443         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
444
445         ipgre_tunnel_unlink(ign, netdev_priv(dev));
446         dev_put(dev);
447 }
448
449
450 static void ipgre_err(struct sk_buff *skb, u32 info)
451 {
452
453 /* All the routers (except for Linux) return only
454    8 bytes of packet payload. It means, that precise relaying of
455    ICMP in the real Internet is absolutely infeasible.
456
457    Moreover, Cisco "wise men" put GRE key to the third word
458    in GRE header. It makes impossible maintaining even soft state for keyed
459    GRE tunnels with enabled checksum. Tell them "thank you".
460
461    Well, I wonder, rfc1812 was written by Cisco employee,
462    what the hell these idiots break standards established
463    by themselves???
464  */
465
466         const struct iphdr *iph = (const struct iphdr *)skb->data;
467         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
468         int grehlen = (iph->ihl<<2) + 4;
469         const int type = icmp_hdr(skb)->type;
470         const int code = icmp_hdr(skb)->code;
471         struct ip_tunnel *t;
472         __be16 flags;
473
474         flags = p[0];
475         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
476                 if (flags&(GRE_VERSION|GRE_ROUTING))
477                         return;
478                 if (flags&GRE_KEY) {
479                         grehlen += 4;
480                         if (flags&GRE_CSUM)
481                                 grehlen += 4;
482                 }
483         }
484
485         /* If only 8 bytes returned, keyed message will be dropped here */
486         if (skb_headlen(skb) < grehlen)
487                 return;
488
489         switch (type) {
490         default:
491         case ICMP_PARAMETERPROB:
492                 return;
493
494         case ICMP_DEST_UNREACH:
495                 switch (code) {
496                 case ICMP_SR_FAILED:
497                 case ICMP_PORT_UNREACH:
498                         /* Impossible event. */
499                         return;
500                 case ICMP_FRAG_NEEDED:
501                         /* Soft state for pmtu is maintained by IP core. */
502                         return;
503                 default:
504                         /* All others are translated to HOST_UNREACH.
505                            rfc2003 contains "deep thoughts" about NET_UNREACH,
506                            I believe they are just ether pollution. --ANK
507                          */
508                         break;
509                 }
510                 break;
511         case ICMP_TIME_EXCEEDED:
512                 if (code != ICMP_EXC_TTL)
513                         return;
514                 break;
515         }
516
517         rcu_read_lock();
518         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
519                                 flags & GRE_KEY ?
520                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
521                                 p[1]);
522         if (t == NULL || t->parms.iph.daddr == 0 ||
523             ipv4_is_multicast(t->parms.iph.daddr))
524                 goto out;
525
526         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
527                 goto out;
528
529         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
530                 t->err_count++;
531         else
532                 t->err_count = 1;
533         t->err_time = jiffies;
534 out:
535         rcu_read_unlock();
536 }
537
538 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
539 {
540         if (INET_ECN_is_ce(iph->tos)) {
541                 if (skb->protocol == htons(ETH_P_IP)) {
542                         IP_ECN_set_ce(ip_hdr(skb));
543                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
544                         IP6_ECN_set_ce(ipv6_hdr(skb));
545                 }
546         }
547 }
548
549 static inline u8
550 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
551 {
552         u8 inner = 0;
553         if (skb->protocol == htons(ETH_P_IP))
554                 inner = old_iph->tos;
555         else if (skb->protocol == htons(ETH_P_IPV6))
556                 inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
557         return INET_ECN_encapsulate(tos, inner);
558 }
559
560 static int ipgre_rcv(struct sk_buff *skb)
561 {
562         const struct iphdr *iph;
563         u8     *h;
564         __be16    flags;
565         __sum16   csum = 0;
566         __be32 key = 0;
567         u32    seqno = 0;
568         struct ip_tunnel *tunnel;
569         int    offset = 4;
570         __be16 gre_proto;
571
572         if (!pskb_may_pull(skb, 16))
573                 goto drop_nolock;
574
575         iph = ip_hdr(skb);
576         h = skb->data;
577         flags = *(__be16*)h;
578
579         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
580                 /* - Version must be 0.
581                    - We do not support routing headers.
582                  */
583                 if (flags&(GRE_VERSION|GRE_ROUTING))
584                         goto drop_nolock;
585
586                 if (flags&GRE_CSUM) {
587                         switch (skb->ip_summed) {
588                         case CHECKSUM_COMPLETE:
589                                 csum = csum_fold(skb->csum);
590                                 if (!csum)
591                                         break;
592                                 /* fall through */
593                         case CHECKSUM_NONE:
594                                 skb->csum = 0;
595                                 csum = __skb_checksum_complete(skb);
596                                 skb->ip_summed = CHECKSUM_COMPLETE;
597                         }
598                         offset += 4;
599                 }
600                 if (flags&GRE_KEY) {
601                         key = *(__be32*)(h + offset);
602                         offset += 4;
603                 }
604                 if (flags&GRE_SEQ) {
605                         seqno = ntohl(*(__be32*)(h + offset));
606                         offset += 4;
607                 }
608         }
609
610         gre_proto = *(__be16 *)(h + 2);
611
612         rcu_read_lock();
613         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
614                                           iph->saddr, iph->daddr, key,
615                                           gre_proto))) {
616                 struct pcpu_tstats *tstats;
617
618                 secpath_reset(skb);
619
620                 skb->protocol = gre_proto;
621                 /* WCCP version 1 and 2 protocol decoding.
622                  * - Change protocol to IP
623                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
624                  */
625                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
626                         skb->protocol = htons(ETH_P_IP);
627                         if ((*(h + offset) & 0xF0) != 0x40)
628                                 offset += 4;
629                 }
630
631                 skb->mac_header = skb->network_header;
632                 __pskb_pull(skb, offset);
633                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
634                 skb->pkt_type = PACKET_HOST;
635 #ifdef CONFIG_NET_IPGRE_BROADCAST
636                 if (ipv4_is_multicast(iph->daddr)) {
637                         /* Looped back packet, drop it! */
638                         if (rt_is_output_route(skb_rtable(skb)))
639                                 goto drop;
640                         tunnel->dev->stats.multicast++;
641                         skb->pkt_type = PACKET_BROADCAST;
642                 }
643 #endif
644
645                 if (((flags&GRE_CSUM) && csum) ||
646                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
647                         tunnel->dev->stats.rx_crc_errors++;
648                         tunnel->dev->stats.rx_errors++;
649                         goto drop;
650                 }
651                 if (tunnel->parms.i_flags&GRE_SEQ) {
652                         if (!(flags&GRE_SEQ) ||
653                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
654                                 tunnel->dev->stats.rx_fifo_errors++;
655                                 tunnel->dev->stats.rx_errors++;
656                                 goto drop;
657                         }
658                         tunnel->i_seqno = seqno + 1;
659                 }
660
661                 /* Warning: All skb pointers will be invalidated! */
662                 if (tunnel->dev->type == ARPHRD_ETHER) {
663                         if (!pskb_may_pull(skb, ETH_HLEN)) {
664                                 tunnel->dev->stats.rx_length_errors++;
665                                 tunnel->dev->stats.rx_errors++;
666                                 goto drop;
667                         }
668
669                         iph = ip_hdr(skb);
670                         skb->protocol = eth_type_trans(skb, tunnel->dev);
671                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
672                 }
673
674                 tstats = this_cpu_ptr(tunnel->dev->tstats);
675                 tstats->rx_packets++;
676                 tstats->rx_bytes += skb->len;
677
678                 __skb_tunnel_rx(skb, tunnel->dev);
679
680                 skb_reset_network_header(skb);
681                 ipgre_ecn_decapsulate(iph, skb);
682
683                 netif_rx(skb);
684
685                 rcu_read_unlock();
686                 return 0;
687         }
688         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
689
690 drop:
691         rcu_read_unlock();
692 drop_nolock:
693         kfree_skb(skb);
694         return 0;
695 }
696
697 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
698 {
699         struct ip_tunnel *tunnel = netdev_priv(dev);
700         struct pcpu_tstats *tstats;
701         const struct iphdr  *old_iph = ip_hdr(skb);
702         const struct iphdr  *tiph;
703         struct flowi4 fl4;
704         u8     tos;
705         __be16 df;
706         struct rtable *rt;                      /* Route to the other host */
707         struct net_device *tdev;                /* Device to other host */
708         struct iphdr  *iph;                     /* Our new IP header */
709         unsigned int max_headroom;              /* The extra header space needed */
710         int    gre_hlen;
711         __be32 dst;
712         int    mtu;
713
714         if (dev->type == ARPHRD_ETHER)
715                 IPCB(skb)->flags = 0;
716
717         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
718                 gre_hlen = 0;
719                 tiph = (const struct iphdr *)skb->data;
720         } else {
721                 gre_hlen = tunnel->hlen;
722                 tiph = &tunnel->parms.iph;
723         }
724
725         if ((dst = tiph->daddr) == 0) {
726                 /* NBMA tunnel */
727
728                 if (skb_dst(skb) == NULL) {
729                         dev->stats.tx_fifo_errors++;
730                         goto tx_error;
731                 }
732
733                 if (skb->protocol == htons(ETH_P_IP)) {
734                         rt = skb_rtable(skb);
735                         dst = rt->rt_gateway;
736                 }
737 #if IS_ENABLED(CONFIG_IPV6)
738                 else if (skb->protocol == htons(ETH_P_IPV6)) {
739                         const struct in6_addr *addr6;
740                         struct neighbour *neigh;
741                         bool do_tx_error_icmp;
742                         int addr_type;
743
744                         neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
745                         if (neigh == NULL)
746                                 goto tx_error;
747
748                         addr6 = (const struct in6_addr *)&neigh->primary_key;
749                         addr_type = ipv6_addr_type(addr6);
750
751                         if (addr_type == IPV6_ADDR_ANY) {
752                                 addr6 = &ipv6_hdr(skb)->daddr;
753                                 addr_type = ipv6_addr_type(addr6);
754                         }
755
756                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
757                                 do_tx_error_icmp = true;
758                         else {
759                                 do_tx_error_icmp = false;
760                                 dst = addr6->s6_addr32[3];
761                         }
762                         neigh_release(neigh);
763                         if (do_tx_error_icmp)
764                                 goto tx_error_icmp;
765                 }
766 #endif
767                 else
768                         goto tx_error;
769         }
770
771         tos = tiph->tos;
772         if (tos == 1) {
773                 tos = 0;
774                 if (skb->protocol == htons(ETH_P_IP))
775                         tos = old_iph->tos;
776                 else if (skb->protocol == htons(ETH_P_IPV6))
777                         tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
778         }
779
780         rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
781                                  tunnel->parms.o_key, RT_TOS(tos),
782                                  tunnel->parms.link);
783         if (IS_ERR(rt)) {
784                 dev->stats.tx_carrier_errors++;
785                 goto tx_error;
786         }
787         tdev = rt->dst.dev;
788
789         if (tdev == dev) {
790                 ip_rt_put(rt);
791                 dev->stats.collisions++;
792                 goto tx_error;
793         }
794
795         df = tiph->frag_off;
796         if (df)
797                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
798         else
799                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
800
801         if (skb_dst(skb))
802                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
803
804         if (skb->protocol == htons(ETH_P_IP)) {
805                 df |= (old_iph->frag_off&htons(IP_DF));
806
807                 if ((old_iph->frag_off&htons(IP_DF)) &&
808                     mtu < ntohs(old_iph->tot_len)) {
809                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
810                         ip_rt_put(rt);
811                         goto tx_error;
812                 }
813         }
814 #if IS_ENABLED(CONFIG_IPV6)
815         else if (skb->protocol == htons(ETH_P_IPV6)) {
816                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
817
818                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
819                         if ((tunnel->parms.iph.daddr &&
820                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
821                             rt6->rt6i_dst.plen == 128) {
822                                 rt6->rt6i_flags |= RTF_MODIFIED;
823                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
824                         }
825                 }
826
827                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
828                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
829                         ip_rt_put(rt);
830                         goto tx_error;
831                 }
832         }
833 #endif
834
835         if (tunnel->err_count > 0) {
836                 if (time_before(jiffies,
837                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
838                         tunnel->err_count--;
839
840                         dst_link_failure(skb);
841                 } else
842                         tunnel->err_count = 0;
843         }
844
845         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
846
847         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
848             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
849                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
850                 if (max_headroom > dev->needed_headroom)
851                         dev->needed_headroom = max_headroom;
852                 if (!new_skb) {
853                         ip_rt_put(rt);
854                         dev->stats.tx_dropped++;
855                         dev_kfree_skb(skb);
856                         return NETDEV_TX_OK;
857                 }
858                 if (skb->sk)
859                         skb_set_owner_w(new_skb, skb->sk);
860                 dev_kfree_skb(skb);
861                 skb = new_skb;
862                 old_iph = ip_hdr(skb);
863         }
864
865         skb_reset_transport_header(skb);
866         skb_push(skb, gre_hlen);
867         skb_reset_network_header(skb);
868         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
869         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
870                               IPSKB_REROUTED);
871         skb_dst_drop(skb);
872         skb_dst_set(skb, &rt->dst);
873
874         /*
875          *      Push down and install the IPIP header.
876          */
877
878         iph                     =       ip_hdr(skb);
879         iph->version            =       4;
880         iph->ihl                =       sizeof(struct iphdr) >> 2;
881         iph->frag_off           =       df;
882         iph->protocol           =       IPPROTO_GRE;
883         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
884         iph->daddr              =       fl4.daddr;
885         iph->saddr              =       fl4.saddr;
886
887         if ((iph->ttl = tiph->ttl) == 0) {
888                 if (skb->protocol == htons(ETH_P_IP))
889                         iph->ttl = old_iph->ttl;
890 #if IS_ENABLED(CONFIG_IPV6)
891                 else if (skb->protocol == htons(ETH_P_IPV6))
892                         iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
893 #endif
894                 else
895                         iph->ttl = ip4_dst_hoplimit(&rt->dst);
896         }
897
898         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
899         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
900                                    htons(ETH_P_TEB) : skb->protocol;
901
902         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
903                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
904
905                 if (tunnel->parms.o_flags&GRE_SEQ) {
906                         ++tunnel->o_seqno;
907                         *ptr = htonl(tunnel->o_seqno);
908                         ptr--;
909                 }
910                 if (tunnel->parms.o_flags&GRE_KEY) {
911                         *ptr = tunnel->parms.o_key;
912                         ptr--;
913                 }
914                 if (tunnel->parms.o_flags&GRE_CSUM) {
915                         *ptr = 0;
916                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
917                 }
918         }
919
920         nf_reset(skb);
921         tstats = this_cpu_ptr(dev->tstats);
922         __IPTUNNEL_XMIT(tstats, &dev->stats);
923         return NETDEV_TX_OK;
924
925 #if IS_ENABLED(CONFIG_IPV6)
926 tx_error_icmp:
927         dst_link_failure(skb);
928 #endif
929 tx_error:
930         dev->stats.tx_errors++;
931         dev_kfree_skb(skb);
932         return NETDEV_TX_OK;
933 }
934
935 static int ipgre_tunnel_bind_dev(struct net_device *dev)
936 {
937         struct net_device *tdev = NULL;
938         struct ip_tunnel *tunnel;
939         const struct iphdr *iph;
940         int hlen = LL_MAX_HEADER;
941         int mtu = ETH_DATA_LEN;
942         int addend = sizeof(struct iphdr) + 4;
943
944         tunnel = netdev_priv(dev);
945         iph = &tunnel->parms.iph;
946
947         /* Guess output device to choose reasonable mtu and needed_headroom */
948
949         if (iph->daddr) {
950                 struct flowi4 fl4;
951                 struct rtable *rt;
952
953                 rt = ip_route_output_gre(dev_net(dev), &fl4,
954                                          iph->daddr, iph->saddr,
955                                          tunnel->parms.o_key,
956                                          RT_TOS(iph->tos),
957                                          tunnel->parms.link);
958                 if (!IS_ERR(rt)) {
959                         tdev = rt->dst.dev;
960                         ip_rt_put(rt);
961                 }
962
963                 if (dev->type != ARPHRD_ETHER)
964                         dev->flags |= IFF_POINTOPOINT;
965         }
966
967         if (!tdev && tunnel->parms.link)
968                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
969
970         if (tdev) {
971                 hlen = tdev->hard_header_len + tdev->needed_headroom;
972                 mtu = tdev->mtu;
973         }
974         dev->iflink = tunnel->parms.link;
975
976         /* Precalculate GRE options length */
977         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
978                 if (tunnel->parms.o_flags&GRE_CSUM)
979                         addend += 4;
980                 if (tunnel->parms.o_flags&GRE_KEY)
981                         addend += 4;
982                 if (tunnel->parms.o_flags&GRE_SEQ)
983                         addend += 4;
984         }
985         dev->needed_headroom = addend + hlen;
986         mtu -= dev->hard_header_len + addend;
987
988         if (mtu < 68)
989                 mtu = 68;
990
991         tunnel->hlen = addend;
992
993         return mtu;
994 }
995
996 static int
997 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
998 {
999         int err = 0;
1000         struct ip_tunnel_parm p;
1001         struct ip_tunnel *t;
1002         struct net *net = dev_net(dev);
1003         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1004
1005         switch (cmd) {
1006         case SIOCGETTUNNEL:
1007                 t = NULL;
1008                 if (dev == ign->fb_tunnel_dev) {
1009                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1010                                 err = -EFAULT;
1011                                 break;
1012                         }
1013                         t = ipgre_tunnel_locate(net, &p, 0);
1014                 }
1015                 if (t == NULL)
1016                         t = netdev_priv(dev);
1017                 memcpy(&p, &t->parms, sizeof(p));
1018                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1019                         err = -EFAULT;
1020                 break;
1021
1022         case SIOCADDTUNNEL:
1023         case SIOCCHGTUNNEL:
1024                 err = -EPERM;
1025                 if (!capable(CAP_NET_ADMIN))
1026                         goto done;
1027
1028                 err = -EFAULT;
1029                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1030                         goto done;
1031
1032                 err = -EINVAL;
1033                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1034                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1035                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1036                         goto done;
1037                 if (p.iph.ttl)
1038                         p.iph.frag_off |= htons(IP_DF);
1039
1040                 if (!(p.i_flags&GRE_KEY))
1041                         p.i_key = 0;
1042                 if (!(p.o_flags&GRE_KEY))
1043                         p.o_key = 0;
1044
1045                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1046
1047                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1048                         if (t != NULL) {
1049                                 if (t->dev != dev) {
1050                                         err = -EEXIST;
1051                                         break;
1052                                 }
1053                         } else {
1054                                 unsigned int nflags = 0;
1055
1056                                 t = netdev_priv(dev);
1057
1058                                 if (ipv4_is_multicast(p.iph.daddr))
1059                                         nflags = IFF_BROADCAST;
1060                                 else if (p.iph.daddr)
1061                                         nflags = IFF_POINTOPOINT;
1062
1063                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1064                                         err = -EINVAL;
1065                                         break;
1066                                 }
1067                                 ipgre_tunnel_unlink(ign, t);
1068                                 synchronize_net();
1069                                 t->parms.iph.saddr = p.iph.saddr;
1070                                 t->parms.iph.daddr = p.iph.daddr;
1071                                 t->parms.i_key = p.i_key;
1072                                 t->parms.o_key = p.o_key;
1073                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1074                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1075                                 ipgre_tunnel_link(ign, t);
1076                                 netdev_state_change(dev);
1077                         }
1078                 }
1079
1080                 if (t) {
1081                         err = 0;
1082                         if (cmd == SIOCCHGTUNNEL) {
1083                                 t->parms.iph.ttl = p.iph.ttl;
1084                                 t->parms.iph.tos = p.iph.tos;
1085                                 t->parms.iph.frag_off = p.iph.frag_off;
1086                                 if (t->parms.link != p.link) {
1087                                         t->parms.link = p.link;
1088                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1089                                         netdev_state_change(dev);
1090                                 }
1091                         }
1092                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1093                                 err = -EFAULT;
1094                 } else
1095                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1096                 break;
1097
1098         case SIOCDELTUNNEL:
1099                 err = -EPERM;
1100                 if (!capable(CAP_NET_ADMIN))
1101                         goto done;
1102
1103                 if (dev == ign->fb_tunnel_dev) {
1104                         err = -EFAULT;
1105                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1106                                 goto done;
1107                         err = -ENOENT;
1108                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1109                                 goto done;
1110                         err = -EPERM;
1111                         if (t == netdev_priv(ign->fb_tunnel_dev))
1112                                 goto done;
1113                         dev = t->dev;
1114                 }
1115                 unregister_netdevice(dev);
1116                 err = 0;
1117                 break;
1118
1119         default:
1120                 err = -EINVAL;
1121         }
1122
1123 done:
1124         return err;
1125 }
1126
1127 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1128 {
1129         struct ip_tunnel *tunnel = netdev_priv(dev);
1130         if (new_mtu < 68 ||
1131             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1132                 return -EINVAL;
1133         dev->mtu = new_mtu;
1134         return 0;
1135 }
1136
1137 /* Nice toy. Unfortunately, useless in real life :-)
1138    It allows to construct virtual multiprotocol broadcast "LAN"
1139    over the Internet, provided multicast routing is tuned.
1140
1141
1142    I have no idea was this bicycle invented before me,
1143    so that I had to set ARPHRD_IPGRE to a random value.
1144    I have an impression, that Cisco could make something similar,
1145    but this feature is apparently missing in IOS<=11.2(8).
1146
1147    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1148    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1149
1150    ping -t 255 224.66.66.66
1151
1152    If nobody answers, mbone does not work.
1153
1154    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1155    ip addr add 10.66.66.<somewhat>/24 dev Universe
1156    ifconfig Universe up
1157    ifconfig Universe add fe80::<Your_real_addr>/10
1158    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1159    ftp 10.66.66.66
1160    ...
1161    ftp fec0:6666:6666::193.233.7.65
1162    ...
1163
1164  */
1165
1166 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1167                         unsigned short type,
1168                         const void *daddr, const void *saddr, unsigned int len)
1169 {
1170         struct ip_tunnel *t = netdev_priv(dev);
1171         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1172         __be16 *p = (__be16*)(iph+1);
1173
1174         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1175         p[0]            = t->parms.o_flags;
1176         p[1]            = htons(type);
1177
1178         /*
1179          *      Set the source hardware address.
1180          */
1181
1182         if (saddr)
1183                 memcpy(&iph->saddr, saddr, 4);
1184         if (daddr)
1185                 memcpy(&iph->daddr, daddr, 4);
1186         if (iph->daddr)
1187                 return t->hlen;
1188
1189         return -t->hlen;
1190 }
1191
1192 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1193 {
1194         const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1195         memcpy(haddr, &iph->saddr, 4);
1196         return 4;
1197 }
1198
1199 static const struct header_ops ipgre_header_ops = {
1200         .create = ipgre_header,
1201         .parse  = ipgre_header_parse,
1202 };
1203
1204 #ifdef CONFIG_NET_IPGRE_BROADCAST
1205 static int ipgre_open(struct net_device *dev)
1206 {
1207         struct ip_tunnel *t = netdev_priv(dev);
1208
1209         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1210                 struct flowi4 fl4;
1211                 struct rtable *rt;
1212
1213                 rt = ip_route_output_gre(dev_net(dev), &fl4,
1214                                          t->parms.iph.daddr,
1215                                          t->parms.iph.saddr,
1216                                          t->parms.o_key,
1217                                          RT_TOS(t->parms.iph.tos),
1218                                          t->parms.link);
1219                 if (IS_ERR(rt))
1220                         return -EADDRNOTAVAIL;
1221                 dev = rt->dst.dev;
1222                 ip_rt_put(rt);
1223                 if (__in_dev_get_rtnl(dev) == NULL)
1224                         return -EADDRNOTAVAIL;
1225                 t->mlink = dev->ifindex;
1226                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1227         }
1228         return 0;
1229 }
1230
1231 static int ipgre_close(struct net_device *dev)
1232 {
1233         struct ip_tunnel *t = netdev_priv(dev);
1234
1235         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1236                 struct in_device *in_dev;
1237                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1238                 if (in_dev)
1239                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1240         }
1241         return 0;
1242 }
1243
1244 #endif
1245
1246 static const struct net_device_ops ipgre_netdev_ops = {
1247         .ndo_init               = ipgre_tunnel_init,
1248         .ndo_uninit             = ipgre_tunnel_uninit,
1249 #ifdef CONFIG_NET_IPGRE_BROADCAST
1250         .ndo_open               = ipgre_open,
1251         .ndo_stop               = ipgre_close,
1252 #endif
1253         .ndo_start_xmit         = ipgre_tunnel_xmit,
1254         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1255         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1256         .ndo_get_stats          = ipgre_get_stats,
1257 };
1258
1259 static void ipgre_dev_free(struct net_device *dev)
1260 {
1261         free_percpu(dev->tstats);
1262         free_netdev(dev);
1263 }
1264
1265 static void ipgre_tunnel_setup(struct net_device *dev)
1266 {
1267         dev->netdev_ops         = &ipgre_netdev_ops;
1268         dev->destructor         = ipgre_dev_free;
1269
1270         dev->type               = ARPHRD_IPGRE;
1271         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1272         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1273         dev->flags              = IFF_NOARP;
1274         dev->iflink             = 0;
1275         dev->addr_len           = 4;
1276         dev->features           |= NETIF_F_NETNS_LOCAL;
1277         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1278 }
1279
1280 static int ipgre_tunnel_init(struct net_device *dev)
1281 {
1282         struct ip_tunnel *tunnel;
1283         struct iphdr *iph;
1284
1285         tunnel = netdev_priv(dev);
1286         iph = &tunnel->parms.iph;
1287
1288         tunnel->dev = dev;
1289         strcpy(tunnel->parms.name, dev->name);
1290
1291         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1292         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1293
1294         if (iph->daddr) {
1295 #ifdef CONFIG_NET_IPGRE_BROADCAST
1296                 if (ipv4_is_multicast(iph->daddr)) {
1297                         if (!iph->saddr)
1298                                 return -EINVAL;
1299                         dev->flags = IFF_BROADCAST;
1300                         dev->header_ops = &ipgre_header_ops;
1301                 }
1302 #endif
1303         } else
1304                 dev->header_ops = &ipgre_header_ops;
1305
1306         dev->tstats = alloc_percpu(struct pcpu_tstats);
1307         if (!dev->tstats)
1308                 return -ENOMEM;
1309
1310         return 0;
1311 }
1312
1313 static void ipgre_fb_tunnel_init(struct net_device *dev)
1314 {
1315         struct ip_tunnel *tunnel = netdev_priv(dev);
1316         struct iphdr *iph = &tunnel->parms.iph;
1317
1318         tunnel->dev = dev;
1319         strcpy(tunnel->parms.name, dev->name);
1320
1321         iph->version            = 4;
1322         iph->protocol           = IPPROTO_GRE;
1323         iph->ihl                = 5;
1324         tunnel->hlen            = sizeof(struct iphdr) + 4;
1325
1326         dev_hold(dev);
1327 }
1328
1329
1330 static const struct gre_protocol ipgre_protocol = {
1331         .handler     = ipgre_rcv,
1332         .err_handler = ipgre_err,
1333 };
1334
1335 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1336 {
1337         int prio;
1338
1339         for (prio = 0; prio < 4; prio++) {
1340                 int h;
1341                 for (h = 0; h < HASH_SIZE; h++) {
1342                         struct ip_tunnel *t;
1343
1344                         t = rtnl_dereference(ign->tunnels[prio][h]);
1345
1346                         while (t != NULL) {
1347                                 unregister_netdevice_queue(t->dev, head);
1348                                 t = rtnl_dereference(t->next);
1349                         }
1350                 }
1351         }
1352 }
1353
1354 static int __net_init ipgre_init_net(struct net *net)
1355 {
1356         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1357         int err;
1358
1359         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1360                                            ipgre_tunnel_setup);
1361         if (!ign->fb_tunnel_dev) {
1362                 err = -ENOMEM;
1363                 goto err_alloc_dev;
1364         }
1365         dev_net_set(ign->fb_tunnel_dev, net);
1366
1367         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1368         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1369
1370         if ((err = register_netdev(ign->fb_tunnel_dev)))
1371                 goto err_reg_dev;
1372
1373         rcu_assign_pointer(ign->tunnels_wc[0],
1374                            netdev_priv(ign->fb_tunnel_dev));
1375         return 0;
1376
1377 err_reg_dev:
1378         ipgre_dev_free(ign->fb_tunnel_dev);
1379 err_alloc_dev:
1380         return err;
1381 }
1382
1383 static void __net_exit ipgre_exit_net(struct net *net)
1384 {
1385         struct ipgre_net *ign;
1386         LIST_HEAD(list);
1387
1388         ign = net_generic(net, ipgre_net_id);
1389         rtnl_lock();
1390         ipgre_destroy_tunnels(ign, &list);
1391         unregister_netdevice_many(&list);
1392         rtnl_unlock();
1393 }
1394
1395 static struct pernet_operations ipgre_net_ops = {
1396         .init = ipgre_init_net,
1397         .exit = ipgre_exit_net,
1398         .id   = &ipgre_net_id,
1399         .size = sizeof(struct ipgre_net),
1400 };
1401
1402 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1403 {
1404         __be16 flags;
1405
1406         if (!data)
1407                 return 0;
1408
1409         flags = 0;
1410         if (data[IFLA_GRE_IFLAGS])
1411                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1412         if (data[IFLA_GRE_OFLAGS])
1413                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1414         if (flags & (GRE_VERSION|GRE_ROUTING))
1415                 return -EINVAL;
1416
1417         return 0;
1418 }
1419
1420 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1421 {
1422         __be32 daddr;
1423
1424         if (tb[IFLA_ADDRESS]) {
1425                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1426                         return -EINVAL;
1427                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1428                         return -EADDRNOTAVAIL;
1429         }
1430
1431         if (!data)
1432                 goto out;
1433
1434         if (data[IFLA_GRE_REMOTE]) {
1435                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1436                 if (!daddr)
1437                         return -EINVAL;
1438         }
1439
1440 out:
1441         return ipgre_tunnel_validate(tb, data);
1442 }
1443
1444 static void ipgre_netlink_parms(struct nlattr *data[],
1445                                 struct ip_tunnel_parm *parms)
1446 {
1447         memset(parms, 0, sizeof(*parms));
1448
1449         parms->iph.protocol = IPPROTO_GRE;
1450
1451         if (!data)
1452                 return;
1453
1454         if (data[IFLA_GRE_LINK])
1455                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1456
1457         if (data[IFLA_GRE_IFLAGS])
1458                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1459
1460         if (data[IFLA_GRE_OFLAGS])
1461                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1462
1463         if (data[IFLA_GRE_IKEY])
1464                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1465
1466         if (data[IFLA_GRE_OKEY])
1467                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1468
1469         if (data[IFLA_GRE_LOCAL])
1470                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1471
1472         if (data[IFLA_GRE_REMOTE])
1473                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1474
1475         if (data[IFLA_GRE_TTL])
1476                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1477
1478         if (data[IFLA_GRE_TOS])
1479                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1480
1481         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1482                 parms->iph.frag_off = htons(IP_DF);
1483 }
1484
1485 static int ipgre_tap_init(struct net_device *dev)
1486 {
1487         struct ip_tunnel *tunnel;
1488
1489         tunnel = netdev_priv(dev);
1490
1491         tunnel->dev = dev;
1492         strcpy(tunnel->parms.name, dev->name);
1493
1494         ipgre_tunnel_bind_dev(dev);
1495
1496         dev->tstats = alloc_percpu(struct pcpu_tstats);
1497         if (!dev->tstats)
1498                 return -ENOMEM;
1499
1500         return 0;
1501 }
1502
1503 static const struct net_device_ops ipgre_tap_netdev_ops = {
1504         .ndo_init               = ipgre_tap_init,
1505         .ndo_uninit             = ipgre_tunnel_uninit,
1506         .ndo_start_xmit         = ipgre_tunnel_xmit,
1507         .ndo_set_mac_address    = eth_mac_addr,
1508         .ndo_validate_addr      = eth_validate_addr,
1509         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1510         .ndo_get_stats          = ipgre_get_stats,
1511 };
1512
1513 static void ipgre_tap_setup(struct net_device *dev)
1514 {
1515
1516         ether_setup(dev);
1517
1518         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1519         dev->destructor         = ipgre_dev_free;
1520
1521         dev->iflink             = 0;
1522         dev->features           |= NETIF_F_NETNS_LOCAL;
1523 }
1524
1525 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1526                          struct nlattr *data[])
1527 {
1528         struct ip_tunnel *nt;
1529         struct net *net = dev_net(dev);
1530         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1531         int mtu;
1532         int err;
1533
1534         nt = netdev_priv(dev);
1535         ipgre_netlink_parms(data, &nt->parms);
1536
1537         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1538                 return -EEXIST;
1539
1540         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1541                 eth_hw_addr_random(dev);
1542
1543         mtu = ipgre_tunnel_bind_dev(dev);
1544         if (!tb[IFLA_MTU])
1545                 dev->mtu = mtu;
1546
1547         /* Can use a lockless transmit, unless we generate output sequences */
1548         if (!(nt->parms.o_flags & GRE_SEQ))
1549                 dev->features |= NETIF_F_LLTX;
1550
1551         err = register_netdevice(dev);
1552         if (err)
1553                 goto out;
1554
1555         dev_hold(dev);
1556         ipgre_tunnel_link(ign, nt);
1557
1558 out:
1559         return err;
1560 }
1561
1562 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1563                             struct nlattr *data[])
1564 {
1565         struct ip_tunnel *t, *nt;
1566         struct net *net = dev_net(dev);
1567         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1568         struct ip_tunnel_parm p;
1569         int mtu;
1570
1571         if (dev == ign->fb_tunnel_dev)
1572                 return -EINVAL;
1573
1574         nt = netdev_priv(dev);
1575         ipgre_netlink_parms(data, &p);
1576
1577         t = ipgre_tunnel_locate(net, &p, 0);
1578
1579         if (t) {
1580                 if (t->dev != dev)
1581                         return -EEXIST;
1582         } else {
1583                 t = nt;
1584
1585                 if (dev->type != ARPHRD_ETHER) {
1586                         unsigned int nflags = 0;
1587
1588                         if (ipv4_is_multicast(p.iph.daddr))
1589                                 nflags = IFF_BROADCAST;
1590                         else if (p.iph.daddr)
1591                                 nflags = IFF_POINTOPOINT;
1592
1593                         if ((dev->flags ^ nflags) &
1594                             (IFF_POINTOPOINT | IFF_BROADCAST))
1595                                 return -EINVAL;
1596                 }
1597
1598                 ipgre_tunnel_unlink(ign, t);
1599                 t->parms.iph.saddr = p.iph.saddr;
1600                 t->parms.iph.daddr = p.iph.daddr;
1601                 t->parms.i_key = p.i_key;
1602                 if (dev->type != ARPHRD_ETHER) {
1603                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1604                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1605                 }
1606                 ipgre_tunnel_link(ign, t);
1607                 netdev_state_change(dev);
1608         }
1609
1610         t->parms.o_key = p.o_key;
1611         t->parms.iph.ttl = p.iph.ttl;
1612         t->parms.iph.tos = p.iph.tos;
1613         t->parms.iph.frag_off = p.iph.frag_off;
1614
1615         if (t->parms.link != p.link) {
1616                 t->parms.link = p.link;
1617                 mtu = ipgre_tunnel_bind_dev(dev);
1618                 if (!tb[IFLA_MTU])
1619                         dev->mtu = mtu;
1620                 netdev_state_change(dev);
1621         }
1622
1623         return 0;
1624 }
1625
1626 static size_t ipgre_get_size(const struct net_device *dev)
1627 {
1628         return
1629                 /* IFLA_GRE_LINK */
1630                 nla_total_size(4) +
1631                 /* IFLA_GRE_IFLAGS */
1632                 nla_total_size(2) +
1633                 /* IFLA_GRE_OFLAGS */
1634                 nla_total_size(2) +
1635                 /* IFLA_GRE_IKEY */
1636                 nla_total_size(4) +
1637                 /* IFLA_GRE_OKEY */
1638                 nla_total_size(4) +
1639                 /* IFLA_GRE_LOCAL */
1640                 nla_total_size(4) +
1641                 /* IFLA_GRE_REMOTE */
1642                 nla_total_size(4) +
1643                 /* IFLA_GRE_TTL */
1644                 nla_total_size(1) +
1645                 /* IFLA_GRE_TOS */
1646                 nla_total_size(1) +
1647                 /* IFLA_GRE_PMTUDISC */
1648                 nla_total_size(1) +
1649                 0;
1650 }
1651
1652 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1653 {
1654         struct ip_tunnel *t = netdev_priv(dev);
1655         struct ip_tunnel_parm *p = &t->parms;
1656
1657         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1658         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1659         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1660         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1661         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1662         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1663         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1664         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1665         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1666         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1667
1668         return 0;
1669
1670 nla_put_failure:
1671         return -EMSGSIZE;
1672 }
1673
1674 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1675         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1676         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1677         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1678         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1679         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1680         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1681         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1682         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1683         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1684         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1685 };
1686
1687 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1688         .kind           = "gre",
1689         .maxtype        = IFLA_GRE_MAX,
1690         .policy         = ipgre_policy,
1691         .priv_size      = sizeof(struct ip_tunnel),
1692         .setup          = ipgre_tunnel_setup,
1693         .validate       = ipgre_tunnel_validate,
1694         .newlink        = ipgre_newlink,
1695         .changelink     = ipgre_changelink,
1696         .get_size       = ipgre_get_size,
1697         .fill_info      = ipgre_fill_info,
1698 };
1699
1700 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1701         .kind           = "gretap",
1702         .maxtype        = IFLA_GRE_MAX,
1703         .policy         = ipgre_policy,
1704         .priv_size      = sizeof(struct ip_tunnel),
1705         .setup          = ipgre_tap_setup,
1706         .validate       = ipgre_tap_validate,
1707         .newlink        = ipgre_newlink,
1708         .changelink     = ipgre_changelink,
1709         .get_size       = ipgre_get_size,
1710         .fill_info      = ipgre_fill_info,
1711 };
1712
/*
 *      And now the module code and kernel interface.
 */
1716
1717 static int __init ipgre_init(void)
1718 {
1719         int err;
1720
1721         pr_info("GRE over IPv4 tunneling driver\n");
1722
1723         err = register_pernet_device(&ipgre_net_ops);
1724         if (err < 0)
1725                 return err;
1726
1727         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1728         if (err < 0) {
1729                 pr_info("%s: can't add protocol\n", __func__);
1730                 goto add_proto_failed;
1731         }
1732
1733         err = rtnl_link_register(&ipgre_link_ops);
1734         if (err < 0)
1735                 goto rtnl_link_failed;
1736
1737         err = rtnl_link_register(&ipgre_tap_ops);
1738         if (err < 0)
1739                 goto tap_ops_failed;
1740
1741 out:
1742         return err;
1743
1744 tap_ops_failed:
1745         rtnl_link_unregister(&ipgre_link_ops);
1746 rtnl_link_failed:
1747         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1748 add_proto_failed:
1749         unregister_pernet_device(&ipgre_net_ops);
1750         goto out;
1751 }
1752
1753 static void __exit ipgre_fini(void)
1754 {
1755         rtnl_link_unregister(&ipgre_tap_ops);
1756         rtnl_link_unregister(&ipgre_link_ops);
1757         if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1758                 pr_info("%s: can't remove protocol\n", __func__);
1759         unregister_pernet_device(&ipgre_net_ops);
1760 }
1761
1762 module_init(ipgre_init);
1763 module_exit(ipgre_fini);
1764 MODULE_LICENSE("GPL");
1765 MODULE_ALIAS_RTNL_LINK("gre");
1766 MODULE_ALIAS_RTNL_LINK("gretap");
1767 MODULE_ALIAS_NETDEV("gre0");