Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net...
[pandora-kernel.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 #include <net/udp.h>
59
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68         return hash_32((__force u32)key ^ (__force u32)remote,
69                          IP_TNL_HASH_BITS);
70 }
71
/* Store @dst together with its source address @saddr in cache slot @idst.
 * Callers in this file also pass a NULL @dst to empty the slot.
 *
 * dst_clone() is applied to the new entry before it is swapped in with
 * xchg(); the entry that was stored previously is then handed to
 * dst_release().
 */
72 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
73                              struct dst_entry *dst, __be32 saddr)
74 {
75         struct dst_entry *old_dst;
76
77         dst_clone(dst);
78         old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
79         dst_release(old_dst);
80         idst->saddr = saddr;
81 }
82
83 static noinline void tunnel_dst_set(struct ip_tunnel *t,
84                            struct dst_entry *dst, __be32 saddr)
85 {
86         __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
87 }
88
/* Empty the cache slot that tunnel_dst_set() addresses for tunnel @t by
 * storing a NULL dst and a zero source address.
 */
89 static void tunnel_dst_reset(struct ip_tunnel *t)
90 {
91         tunnel_dst_set(t, NULL, 0);
92 }
93
94 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
95 {
96         int i;
97
98         for_each_possible_cpu(i)
99                 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
100 }
101 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
102
/* Fetch the route cached for tunnel @t in the slot chosen by raw_cpu_ptr().
 *
 * Returns the cached entry (cast to struct rtable *) after its __refcnt has
 * been raised, or NULL when the slot is empty, the refcount was already
 * zero, or the entry failed validation. *@saddr is written only when an
 * entry is returned.
 */
103 static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
104                                         u32 cookie, __be32 *saddr)
105 {
106         struct ip_tunnel_dst *idst;
107         struct dst_entry *dst;
108
109         rcu_read_lock();
110         idst = raw_cpu_ptr(t->dst_cache);
111         dst = rcu_dereference(idst->dst);
            /* Only use the entry if its refcount could be raised from non-zero. */
112         if (dst && !atomic_inc_not_zero(&dst->__refcnt))
113                 dst = NULL;
114         if (dst) {
                    /* An entry marked obsolete must pass ops->check() to be reused. */
115                 if (!dst->obsolete || dst->ops->check(dst, cookie)) {
116                         *saddr = idst->saddr;
117                 } else {
                            /* Stale: empty the slot and drop the reference taken above. */
118                         tunnel_dst_reset(t);
119                         dst_release(dst);
120                         dst = NULL;
121                 }
122         }
123         rcu_read_unlock();
124         return (struct rtable *)dst;
125 }
126
127 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
128                                 __be16 flags, __be32 key)
129 {
130         if (p->i_flags & TUNNEL_KEY) {
131                 if (flags & TUNNEL_KEY)
132                         return key == p->i_key;
133                 else
134                         /* key expected, none present */
135                         return false;
136         } else
137                 return !(flags & TUNNEL_KEY);
138 }
139
140 /* Fallback tunnel: no source, no destination, no key, no options
141
142    Tunnel hash table:
143    We require exact key match i.e. if a key is present in packet
144    it will match only tunnel with the same key; if it is not present,
145    it will match only keyless tunnel.
146
147    All keyless packets that do not match a configured keyless tunnel
148    will match the fallback tunnel.
149    Given src, dst and key, find the appropriate tunnel for input.
150 */
/* Find the tunnel that should receive a packet with the given outer
 * addresses (@remote, @local), tunnel @flags and @key.
 *
 * Up to four passes are made, from most to least specific. In every pass a
 * tunnel whose parms.link equals @link is returned immediately; otherwise
 * the tunnel is only remembered as a candidate. If no pass returns, the
 * remembered candidate is returned, then the fallback device's tunnel if
 * that device exists and is up, and finally NULL.
 */
151 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
152                                    int link, __be16 flags,
153                                    __be32 remote, __be32 local,
154                                    __be32 key)
155 {
156         unsigned int hash;
157         struct ip_tunnel *t, *cand = NULL;
158         struct hlist_head *head;
159
160         hash = ip_tunnel_hash(key, remote);
161         head = &itn->tunnels[hash];
162
            /* Pass 1: both local and remote address match, device up. */
163         hlist_for_each_entry_rcu(t, head, hash_node) {
164                 if (local != t->parms.iph.saddr ||
165                     remote != t->parms.iph.daddr ||
166                     !(t->dev->flags & IFF_UP))
167                         continue;
168
169                 if (!ip_tunnel_key_match(&t->parms, flags, key))
170                         continue;
171
172                 if (t->parms.link == link)
173                         return t;
174                 else
175                         cand = t;
176         }
177
            /* Pass 2: remote matches and the tunnel has no local address set. */
178         hlist_for_each_entry_rcu(t, head, hash_node) {
179                 if (remote != t->parms.iph.daddr ||
180                     t->parms.iph.saddr != 0 ||
181                     !(t->dev->flags & IFF_UP))
182                         continue;
183
184                 if (!ip_tunnel_key_match(&t->parms, flags, key))
185                         continue;
186
187                 if (t->parms.link == link)
188                         return t;
189                 else if (!cand)
190                         cand = t;
191         }
192
            /* The remaining passes use the bucket hashed with a zero remote. */
193         hash = ip_tunnel_hash(key, 0);
194         head = &itn->tunnels[hash];
195
            /* Pass 3: local matches a tunnel without remote address, or local
             * is a multicast address equal to the tunnel's remote address.
             */
196         hlist_for_each_entry_rcu(t, head, hash_node) {
197                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
198                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
199                         continue;
200
201                 if (!(t->dev->flags & IFF_UP))
202                         continue;
203
204                 if (!ip_tunnel_key_match(&t->parms, flags, key))
205                         continue;
206
207                 if (t->parms.link == link)
208                         return t;
209                 else if (!cand)
210                         cand = t;
211         }
212
            /* Pass 4 is skipped entirely when the caller sets TUNNEL_NO_KEY. */
213         if (flags & TUNNEL_NO_KEY)
214                 goto skip_key_lookup;
215
            /* Pass 4: tunnel with neither address set whose i_key equals @key. */
216         hlist_for_each_entry_rcu(t, head, hash_node) {
217                 if (t->parms.i_key != key ||
218                     t->parms.iph.saddr != 0 ||
219                     t->parms.iph.daddr != 0 ||
220                     !(t->dev->flags & IFF_UP))
221                         continue;
222
223                 if (t->parms.link == link)
224                         return t;
225                 else if (!cand)
226                         cand = t;
227         }
228
229 skip_key_lookup:
230         if (cand)
231                 return cand;
232
233         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
234                 return netdev_priv(itn->fb_tunnel_dev);
235
236
237         return NULL;
238 }
239 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
240
241 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
242                                     struct ip_tunnel_parm *parms)
243 {
244         unsigned int h;
245         __be32 remote;
246         __be32 i_key = parms->i_key;
247
248         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
249                 remote = parms->iph.daddr;
250         else
251                 remote = 0;
252
253         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
254                 i_key = 0;
255
256         h = ip_tunnel_hash(i_key, remote);
257         return &itn->tunnels[h];
258 }
259
/* Link tunnel @t at the head of the bucket that ip_bucket() selects for
 * its current parameters, using the RCU list insertion helper.
 */
260 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
261 {
262         struct hlist_head *head = ip_bucket(itn, &t->parms);
263
264         hlist_add_head_rcu(&t->hash_node, head);
265 }
266
/* Unlink tunnel @t from its hash bucket with hlist_del_init_rcu(). */
267 static void ip_tunnel_del(struct ip_tunnel *t)
268 {
269         hlist_del_init_rcu(&t->hash_node);
270 }
271
272 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
273                                         struct ip_tunnel_parm *parms,
274                                         int type)
275 {
276         __be32 remote = parms->iph.daddr;
277         __be32 local = parms->iph.saddr;
278         __be32 key = parms->i_key;
279         __be16 flags = parms->i_flags;
280         int link = parms->link;
281         struct ip_tunnel *t = NULL;
282         struct hlist_head *head = ip_bucket(itn, parms);
283
284         hlist_for_each_entry_rcu(t, head, hash_node) {
285                 if (local == t->parms.iph.saddr &&
286                     remote == t->parms.iph.daddr &&
287                     link == t->parms.link &&
288                     type == t->dev->type &&
289                     ip_tunnel_key_match(&t->parms, flags, key))
290                         break;
291         }
292         return t;
293 }
294
295 static struct net_device *__ip_tunnel_create(struct net *net,
296                                              const struct rtnl_link_ops *ops,
297                                              struct ip_tunnel_parm *parms)
298 {
299         int err;
300         struct ip_tunnel *tunnel;
301         struct net_device *dev;
302         char name[IFNAMSIZ];
303
304         if (parms->name[0])
305                 strlcpy(name, parms->name, IFNAMSIZ);
306         else {
307                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
308                         err = -E2BIG;
309                         goto failed;
310                 }
311                 strlcpy(name, ops->kind, IFNAMSIZ);
312                 strncat(name, "%d", 2);
313         }
314
315         ASSERT_RTNL();
316         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
317         if (!dev) {
318                 err = -ENOMEM;
319                 goto failed;
320         }
321         dev_net_set(dev, net);
322
323         dev->rtnl_link_ops = ops;
324
325         tunnel = netdev_priv(dev);
326         tunnel->parms = *parms;
327         tunnel->net = net;
328
329         err = register_netdevice(dev);
330         if (err)
331                 goto failed_free;
332
333         return dev;
334
335 failed_free:
336         free_netdev(dev);
337 failed:
338         return ERR_PTR(err);
339 }
340
341 static inline void init_tunnel_flow(struct flowi4 *fl4,
342                                     int proto,
343                                     __be32 daddr, __be32 saddr,
344                                     __be32 key, __u8 tos, int oif)
345 {
346         memset(fl4, 0, sizeof(*fl4));
347         fl4->flowi4_oif = oif;
348         fl4->daddr = daddr;
349         fl4->saddr = saddr;
350         fl4->flowi4_tos = tos;
351         fl4->flowi4_proto = proto;
352         fl4->fl4_gre_key = key;
353 }
354
/* Work out headroom and a suggested MTU for tunnel device @dev.
 *
 * The underlying device is guessed from a route lookup towards the
 * configured destination (if any), falling back to the device with index
 * parms.link. Side effects visible here: a successful route is stored in
 * the tunnel's dst cache, IFF_POINTOPOINT is set on non-Ethernet devices
 * that have a destination, and dev->iflink / dev->needed_headroom are
 * updated.
 *
 * Returns the MTU for the tunnel device, never less than 68.
 */
355 static int ip_tunnel_bind_dev(struct net_device *dev)
356 {
357         struct net_device *tdev = NULL;
358         struct ip_tunnel *tunnel = netdev_priv(dev);
359         const struct iphdr *iph;
360         int hlen = LL_MAX_HEADER;
361         int mtu = ETH_DATA_LEN;
362         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
363
364         iph = &tunnel->parms.iph;
365
366         /* Guess output device to choose reasonable mtu and needed_headroom */
367         if (iph->daddr) {
368                 struct flowi4 fl4;
369                 struct rtable *rt;
370
371                 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
372                                  iph->saddr, tunnel->parms.o_key,
373                                  RT_TOS(iph->tos), tunnel->parms.link);
374                 rt = ip_route_output_key(tunnel->net, &fl4);
375
376                 if (!IS_ERR(rt)) {
377                         tdev = rt->dst.dev;
378                         tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
379                         ip_rt_put(rt);
380                 }
381                 if (dev->type != ARPHRD_ETHER)
382                         dev->flags |= IFF_POINTOPOINT;
383         }
384
            /* No usable route: fall back to the explicitly configured link. */
385         if (!tdev && tunnel->parms.link)
386                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
387
            /* Without an underlying device the LL_MAX_HEADER / ETH_DATA_LEN
             * defaults from the declarations above are used.
             */
388         if (tdev) {
389                 hlen = tdev->hard_header_len + tdev->needed_headroom;
390                 mtu = tdev->mtu;
391         }
392         dev->iflink = tunnel->parms.link;
393
394         dev->needed_headroom = t_hlen + hlen;
395         mtu -= (dev->hard_header_len + t_hlen);
396
397         if (mtu < 68)
398                 mtu = 68;
399
400         return mtu;
401 }
402
403 static struct ip_tunnel *ip_tunnel_create(struct net *net,
404                                           struct ip_tunnel_net *itn,
405                                           struct ip_tunnel_parm *parms)
406 {
407         struct ip_tunnel *nt;
408         struct net_device *dev;
409
410         BUG_ON(!itn->fb_tunnel_dev);
411         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
412         if (IS_ERR(dev))
413                 return ERR_CAST(dev);
414
415         dev->mtu = ip_tunnel_bind_dev(dev);
416
417         nt = netdev_priv(dev);
418         ip_tunnel_add(itn, nt);
419         return nt;
420 }
421
/* Receive path shared by the IP tunnel drivers: validate a decapsulated
 * packet against the tunnel configuration, account it and pass it on via
 * gro_cells_receive().
 *
 * @tunnel:        tunnel the packet was matched to
 * @skb:           the packet; it is consumed on every path (queued or freed)
 * @tpi:           parsed tunnel header information (flags, sequence number)
 * @log_ecn_error: whether to log packets that fail ECN decapsulation
 *
 * Always returns 0; dropped packets are only visible in the device stats.
 */
422 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
423                   const struct tnl_ptk_info *tpi, bool log_ecn_error)
424 {
425         struct pcpu_sw_netstats *tstats;
426         const struct iphdr *iph = ip_hdr(skb);
427         int err;
428
429 #ifdef CONFIG_NET_IPGRE_BROADCAST
430         if (ipv4_is_multicast(iph->daddr)) {
431                 tunnel->dev->stats.multicast++;
432                 skb->pkt_type = PACKET_BROADCAST;
433         }
434 #endif
435
            /* The checksum flag of the packet and of the tunnel must agree. */
436         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
437              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
438                 tunnel->dev->stats.rx_crc_errors++;
439                 tunnel->dev->stats.rx_errors++;
440                 goto drop;
441         }
442
            /* With sequencing enabled, drop packets that lack a sequence number
             * or whose number is behind the expected one; the comparison is done
             * on the signed 32-bit difference.
             */
443         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
444                 if (!(tpi->flags&TUNNEL_SEQ) ||
445                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
446                         tunnel->dev->stats.rx_fifo_errors++;
447                         tunnel->dev->stats.rx_errors++;
448                         goto drop;
449                 }
450                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
451         }
452
453         skb_reset_network_header(skb);
454
            /* Any non-zero result may be logged; only results above 1 drop. */
455         err = IP_ECN_decapsulate(iph, skb);
456         if (unlikely(err)) {
457                 if (log_ecn_error)
458                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
459                                         &iph->saddr, iph->tos);
460                 if (err > 1) {
461                         ++tunnel->dev->stats.rx_frame_errors;
462                         ++tunnel->dev->stats.rx_errors;
463                         goto drop;
464                 }
465         }
466
467         tstats = this_cpu_ptr(tunnel->dev->tstats);
468         u64_stats_update_begin(&tstats->syncp);
469         tstats->rx_packets++;
470         tstats->rx_bytes += skb->len;
471         u64_stats_update_end(&tstats->syncp);
472
            /* Second argument is true when tunnel->net differs from the
             * device's namespace.
             */
473         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
474
475         if (tunnel->dev->type == ARPHRD_ETHER) {
476                 skb->protocol = eth_type_trans(skb, tunnel->dev);
477                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
478         } else {
479                 skb->dev = tunnel->dev;
480         }
481
482         gro_cells_receive(&tunnel->gro_cells, skb);
483         return 0;
484
485 drop:
486         kfree_skb(skb);
487         return 0;
488 }
489 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
490
491 static int ip_encap_hlen(struct ip_tunnel_encap *e)
492 {
493         switch (e->type) {
494         case TUNNEL_ENCAP_NONE:
495                 return 0;
496         case TUNNEL_ENCAP_FOU:
497                 return sizeof(struct udphdr);
498         default:
499                 return -EINVAL;
500         }
501 }
502
503 int ip_tunnel_encap_setup(struct ip_tunnel *t,
504                           struct ip_tunnel_encap *ipencap)
505 {
506         int hlen;
507
508         memset(&t->encap, 0, sizeof(t->encap));
509
510         hlen = ip_encap_hlen(ipencap);
511         if (hlen < 0)
512                 return hlen;
513
514         t->encap.type = ipencap->type;
515         t->encap.sport = ipencap->sport;
516         t->encap.dport = ipencap->dport;
517         t->encap.flags = ipencap->flags;
518
519         t->encap_hlen = hlen;
520         t->hlen = t->encap_hlen + t->tun_hlen;
521
522         return 0;
523 }
524 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
525
/* Prepend a UDP header of @hdr_len bytes to @skb for the encapsulation @e
 * and set *@protocol to IPPROTO_UDP.
 *
 * The source port is e->sport when it is non-zero, otherwise the value
 * returned by udp_flow_src_port(). The addresses for the UDP checksum are
 * taken from @fl4.
 *
 * Returns 0 on success or the error carried by the pointer that
 * iptunnel_handle_offloads() returned.
 *
 * NOTE(review): the skb pointer returned by iptunnel_handle_offloads() is
 * only stored in the local parameter. If that helper frees the skb when it
 * fails, the caller's tx_error path (which calls kfree_skb()) would free it
 * a second time - confirm the helper's ownership rules.
 */
526 static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
527                             size_t hdr_len, u8 *protocol, struct flowi4 *fl4)
528 {
529         struct udphdr *uh;
530         __be16 sport;
531         bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
532         int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
533
534         skb = iptunnel_handle_offloads(skb, csum, type);
535
536         if (IS_ERR(skb))
537                 return PTR_ERR(skb);
538
539         /* Get length and hash before making space in skb */
540
541         sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
542                                                skb, 0, 0, false);
543
544         skb_push(skb, hdr_len);
545
546         skb_reset_transport_header(skb);
547         uh = udp_hdr(skb);
548
549         uh->dest = e->dport;
550         uh->source = sport;
551         uh->len = htons(skb->len);
552         uh->check = 0;
            /* First argument is true when TUNNEL_ENCAP_FLAG_CSUM is not set. */
553         udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
554                      fl4->saddr, fl4->daddr, skb->len);
555
556         *protocol = IPPROTO_UDP;
557
558         return 0;
559 }
560
561 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
562                     u8 *protocol, struct flowi4 *fl4)
563 {
564         switch (t->encap.type) {
565         case TUNNEL_ENCAP_NONE:
566                 return 0;
567         case TUNNEL_ENCAP_FOU:
568                 return fou_build_header(skb, &t->encap, t->encap_hlen,
569                                         protocol, fl4);
570         default:
571                 return -EINVAL;
572         }
573 }
574 EXPORT_SYMBOL(ip_tunnel_encap);
575
/* Propagate the path MTU of route @rt to the packet's own dst and tell
 * the sender when @skb does not fit.
 *
 * @df is the outer frag_off value of the tunnel parameters. When it is
 * non-zero the MTU is the route MTU minus the link header, outer IP header
 * and tunnel header; otherwise the MTU of the packet's dst (or of @dev) is
 * used.
 *
 * Returns 0 when the packet may be sent, or -E2BIG after an ICMP / ICMPv6
 * "too big" error has been sent for a non-GSO packet larger than the MTU.
 */
576 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
577                             struct rtable *rt, __be16 df)
578 {
579         struct ip_tunnel *tunnel = netdev_priv(dev);
580         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
581         int mtu;
582
583         if (df)
584                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
585                                         - sizeof(struct iphdr) - tunnel->hlen;
586         else
587                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
588
589         if (skb_dst(skb))
590                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
591
592         if (skb->protocol == htons(ETH_P_IP)) {
                    /* IPv4: only complain when the DF bit is set in @df. */
593                 if (!skb_is_gso(skb) &&
594                     (df & htons(IP_DF)) && mtu < pkt_size) {
595                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
596                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
597                         return -E2BIG;
598                 }
599         }
600 #if IS_ENABLED(CONFIG_IPV6)
601         else if (skb->protocol == htons(ETH_P_IPV6)) {
602                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
603
                    /* Lower the MTU metric on the IPv6 route when the tunnel has
                     * a unicast destination or the route is a /128 host route.
                     */
604                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
605                            mtu >= IPV6_MIN_MTU) {
606                         if ((tunnel->parms.iph.daddr &&
607                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
608                             rt6->rt6i_dst.plen == 128) {
609                                 rt6->rt6i_flags |= RTF_MODIFIED;
610                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
611                         }
612                 }
613
614                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
615                                         mtu < pkt_size) {
616                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
617                         return -E2BIG;
618                 }
619         }
620 #endif
621         return 0;
622 }
623
/* Transmit path shared by the IP tunnel drivers.
 *
 * @skb:        packet to encapsulate; consumed on every path
 * @dev:        the tunnel device
 * @tnl_params: outer IP header template (daddr, saddr, tos, ttl, frag_off)
 * @protocol:   outer IP protocol number; ip_tunnel_encap() may replace it
 *
 * Steps: pick the outer destination (taken from the inner packet's route or
 * neighbour when the template has none), pick the TOS, apply the optional
 * encapsulation, find a route (using the tunnel's dst cache when the tunnel
 * is "connected"), check the path MTU, make sure there is enough headroom
 * and hand the packet to iptunnel_xmit(). Failures are counted in
 * dev->stats and the packet is freed.
 */
624 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
625                     const struct iphdr *tnl_params, u8 protocol)
626 {
627         struct ip_tunnel *tunnel = netdev_priv(dev);
628         const struct iphdr *inner_iph;
629         struct flowi4 fl4;
630         u8     tos, ttl;
631         __be16 df;
632         struct rtable *rt;              /* Route to the other host */
633         unsigned int max_headroom;      /* The extra header space needed */
634         __be32 dst;
635         int err;
636         bool connected;
637
638         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
            /* "connected" gates use of the cached route further down; it is
             * cleared whenever the route depends on the individual packet.
             */
639         connected = (tunnel->parms.iph.daddr != 0);
640
641         dst = tnl_params->daddr;
642         if (dst == 0) {
643                 /* NBMA tunnel */
644
645                 if (skb_dst(skb) == NULL) {
646                         dev->stats.tx_fifo_errors++;
647                         goto tx_error;
648                 }
649
650                 if (skb->protocol == htons(ETH_P_IP)) {
651                         rt = skb_rtable(skb);
652                         dst = rt_nexthop(rt, inner_iph->daddr);
653                 }
654 #if IS_ENABLED(CONFIG_IPV6)
655                 else if (skb->protocol == htons(ETH_P_IPV6)) {
656                         const struct in6_addr *addr6;
657                         struct neighbour *neigh;
658                         bool do_tx_error_icmp;
659                         int addr_type;
660
661                         neigh = dst_neigh_lookup(skb_dst(skb),
662                                                  &ipv6_hdr(skb)->daddr);
663                         if (neigh == NULL)
664                                 goto tx_error;
665
666                         addr6 = (const struct in6_addr *)&neigh->primary_key;
667                         addr_type = ipv6_addr_type(addr6);
668
669                         if (addr_type == IPV6_ADDR_ANY) {
670                                 addr6 = &ipv6_hdr(skb)->daddr;
671                                 addr_type = ipv6_addr_type(addr6);
672                         }
673
                            /* Only IPv4-compatible IPv6 addresses yield an outer
                             * IPv4 destination (the last 32 bits).
                             */
674                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
675                                 do_tx_error_icmp = true;
676                         else {
677                                 do_tx_error_icmp = false;
678                                 dst = addr6->s6_addr32[3];
679                         }
680                         neigh_release(neigh);
681                         if (do_tx_error_icmp)
682                                 goto tx_error_icmp;
683                 }
684 #endif
685                 else
686                         goto tx_error;
687
688                 connected = false;
689         }
690
691         tos = tnl_params->tos;
            /* Lowest TOS bit set in the template: take the TOS / dsfield from
             * the inner packet instead.
             */
692         if (tos & 0x1) {
693                 tos &= ~0x1;
694                 if (skb->protocol == htons(ETH_P_IP)) {
695                         tos = inner_iph->tos;
696                         connected = false;
697                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
698                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
699                         connected = false;
700                 }
701         }
702
703         init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
704                          tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
705
706         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
707                 goto tx_error;
708
709         rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
710
711         if (!rt) {
712                 rt = ip_route_output_key(tunnel->net, &fl4);
713
714                 if (IS_ERR(rt)) {
715                         dev->stats.tx_carrier_errors++;
716                         goto tx_error;
717                 }
718                 if (connected)
719                         tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
720         }
721
            /* The route leads back into this tunnel device. */
722         if (rt->dst.dev == dev) {
723                 ip_rt_put(rt);
724                 dev->stats.collisions++;
725                 goto tx_error;
726         }
727
728         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
729                 ip_rt_put(rt);
730                 goto tx_error;
731         }
732
            /* While err_count is positive and err_time is less than
             * IPTUNNEL_ERR_TIMEO old, report a link failure for this packet;
             * after the timeout the counter is reset.
             */
733         if (tunnel->err_count > 0) {
734                 if (time_before(jiffies,
735                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
736                         tunnel->err_count--;
737
738                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
739                         dst_link_failure(skb);
740                 } else
741                         tunnel->err_count = 0;
742         }
743
744         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
745         ttl = tnl_params->ttl;
            /* A zero template TTL means: copy it from the inner header, or
             * use the route's hop limit for other protocols.
             */
746         if (ttl == 0) {
747                 if (skb->protocol == htons(ETH_P_IP))
748                         ttl = inner_iph->ttl;
749 #if IS_ENABLED(CONFIG_IPV6)
750                 else if (skb->protocol == htons(ETH_P_IPV6))
751                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
752 #endif
753                 else
754                         ttl = ip4_dst_hoplimit(&rt->dst);
755         }
756
757         df = tnl_params->frag_off;
758         if (skb->protocol == htons(ETH_P_IP))
759                 df |= (inner_iph->frag_off&htons(IP_DF));
760
            /* needed_headroom only ever grows here. */
761         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
762                         + rt->dst.header_len;
763         if (max_headroom > dev->needed_headroom)
764                 dev->needed_headroom = max_headroom;
765
766         if (skb_cow_head(skb, dev->needed_headroom)) {
767                 ip_rt_put(rt);
768                 dev->stats.tx_dropped++;
769                 kfree_skb(skb);
770                 return;
771         }
772
773         err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
774                             tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
775         iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
776
777         return;
778
779 #if IS_ENABLED(CONFIG_IPV6)
780 tx_error_icmp:
781         dst_link_failure(skb);
782 #endif
783 tx_error:
784         dev->stats.tx_errors++;
785         kfree_skb(skb);
786 }
787 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
788
/* Apply new parameters @p to the existing tunnel @t of device @dev.
 *
 * The tunnel is removed from the hash table before the fields that feed
 * ip_bucket() (addresses and i_key) are changed and re-inserted afterwards,
 * so it ends up in the bucket matching its new parameters. When the link
 * changes, the device is re-bound and, if @set_mtu is true, its MTU is
 * updated. Finally all cached routes are dropped and a state change is
 * signalled for the device.
 */
789 static void ip_tunnel_update(struct ip_tunnel_net *itn,
790                              struct ip_tunnel *t,
791                              struct net_device *dev,
792                              struct ip_tunnel_parm *p,
793                              bool set_mtu)
794 {
795         ip_tunnel_del(t);
796         t->parms.iph.saddr = p->iph.saddr;
797         t->parms.iph.daddr = p->iph.daddr;
798         t->parms.i_key = p->i_key;
799         t->parms.o_key = p->o_key;
            /* Non-Ethernet devices carry the tunnel endpoints as their
             * hardware and broadcast addresses (4 bytes each).
             */
800         if (dev->type != ARPHRD_ETHER) {
801                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
802                 memcpy(dev->broadcast, &p->iph.daddr, 4);
803         }
804         ip_tunnel_add(itn, t);
805
806         t->parms.iph.ttl = p->iph.ttl;
807         t->parms.iph.tos = p->iph.tos;
808         t->parms.iph.frag_off = p->iph.frag_off;
809
810         if (t->parms.link != p->link) {
811                 int mtu;
812
813                 t->parms.link = p->link;
814                 mtu = ip_tunnel_bind_dev(dev);
815                 if (set_mtu)
816                         dev->mtu = mtu;
817         }
818         ip_tunnel_dst_reset_all(t);
819         netdev_state_change(dev);
820 }
821
/* Handle the tunnel ioctls for device @dev.
 *
 * @p:   tunnel parameters; input for add/change/delete, overwritten with
 *       the tunnel's parameters for SIOCGETTUNNEL
 * @cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * Add, change and delete require CAP_NET_ADMIN in the user namespace of
 * the tunnel's network namespace.
 *
 * Returns 0 on success or a negative error:
 *   -EPERM   missing capability, or attempt to delete the fallback tunnel
 *   -EEXIST  a tunnel with these parameters already exists
 *   -ENOENT  no tunnel matches the parameters
 *   -EINVAL  unknown @cmd, or a change that would flip the device between
 *            point-to-point and broadcast
 *   or the error from ip_tunnel_create()
 */
822 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
823 {
824         int err = 0;
825         struct ip_tunnel *t = netdev_priv(dev);
826         struct net *net = t->net;
827         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
828
829         BUG_ON(!itn->fb_tunnel_dev);
830         switch (cmd) {
831         case SIOCGETTUNNEL:
                    /* On the fallback device, @p selects which tunnel to report;
                     * the fallback tunnel itself is reported when none matches.
                     */
832                 if (dev == itn->fb_tunnel_dev) {
833                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
834                         if (t == NULL)
835                                 t = netdev_priv(dev);
836                 }
837                 memcpy(p, &t->parms, sizeof(*p));
838                 break;
839
840         case SIOCADDTUNNEL:
841         case SIOCCHGTUNNEL:
842                 err = -EPERM;
843                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
844                         goto done;
                    /* A fixed TTL forces the DF bit in the outer header. */
845                 if (p->iph.ttl)
846                         p->iph.frag_off |= htons(IP_DF);
                    /* Except for VTI, keys are ignored unless TUNNEL_KEY is set. */
847                 if (!(p->i_flags & VTI_ISVTI)) {
848                         if (!(p->i_flags & TUNNEL_KEY))
849                                 p->i_key = 0;
850                         if (!(p->o_flags & TUNNEL_KEY))
851                                 p->o_key = 0;
852                 }
853
854                 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
855
856                 if (cmd == SIOCADDTUNNEL) {
857                         if (!t) {
858                                 t = ip_tunnel_create(net, itn, p);
859                                 err = PTR_ERR_OR_ZERO(t);
860                                 break;
861                         }
862
863                         err = -EEXIST;
864                         break;
865                 }
866                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
867                         if (t != NULL) {
                                    /* The new parameters already belong to another
                                     * device.
                                     */
868                                 if (t->dev != dev) {
869                                         err = -EEXIST;
870                                         break;
871                                 }
872                         } else {
873                                 unsigned int nflags = 0;
874
875                                 if (ipv4_is_multicast(p->iph.daddr))
876                                         nflags = IFF_BROADCAST;
877                                 else if (p->iph.daddr)
878                                         nflags = IFF_POINTOPOINT;
879
880                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
881                                         err = -EINVAL;
882                                         break;
883                                 }
884
885                                 t = netdev_priv(dev);
886                         }
887                 }
888
889                 if (t) {
890                         err = 0;
891                         ip_tunnel_update(itn, t, dev, p, true);
892                 } else {
893                         err = -ENOENT;
894                 }
895                 break;
896
897         case SIOCDELTUNNEL:
898                 err = -EPERM;
899                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
900                         goto done;
901
                    /* On the fallback device, @p selects the tunnel to delete; the
                     * fallback tunnel itself cannot be deleted this way.
                     */
902                 if (dev == itn->fb_tunnel_dev) {
903                         err = -ENOENT;
904                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
905                         if (t == NULL)
906                                 goto done;
907                         err = -EPERM;
908                         if (t == netdev_priv(itn->fb_tunnel_dev))
909                                 goto done;
910                         dev = t->dev;
911                 }
912                 unregister_netdevice(dev);
913                 err = 0;
914                 break;
915
916         default:
917                 err = -EINVAL;
918         }
919
920 done:
921         return err;
922 }
923 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
924
925 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
926 {
927         struct ip_tunnel *tunnel = netdev_priv(dev);
928         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
929
930         if (new_mtu < 68 ||
931             new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
932                 return -EINVAL;
933         dev->mtu = new_mtu;
934         return 0;
935 }
936 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
937
938 static void ip_tunnel_dev_free(struct net_device *dev)
939 {
940         struct ip_tunnel *tunnel = netdev_priv(dev);
941
942         gro_cells_destroy(&tunnel->gro_cells);
943         free_percpu(tunnel->dst_cache);
944         free_percpu(dev->tstats);
945         free_netdev(dev);
946 }
947
948 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
949 {
950         struct ip_tunnel *tunnel = netdev_priv(dev);
951         struct ip_tunnel_net *itn;
952
953         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
954
955         if (itn->fb_tunnel_dev != dev) {
956                 ip_tunnel_del(netdev_priv(dev));
957                 unregister_netdevice_queue(dev, head);
958         }
959 }
960 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
961
962 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
963                                   struct rtnl_link_ops *ops, char *devname)
964 {
965         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
966         struct ip_tunnel_parm parms;
967         unsigned int i;
968
969         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
970                 INIT_HLIST_HEAD(&itn->tunnels[i]);
971
972         if (!ops) {
973                 itn->fb_tunnel_dev = NULL;
974                 return 0;
975         }
976
977         memset(&parms, 0, sizeof(parms));
978         if (devname)
979                 strlcpy(parms.name, devname, IFNAMSIZ);
980
981         rtnl_lock();
982         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
983         /* FB netdevice is special: we have one, and only one per netns.
984          * Allowing to move it to another netns is clearly unsafe.
985          */
986         if (!IS_ERR(itn->fb_tunnel_dev)) {
987                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
988                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
989                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
990         }
991         rtnl_unlock();
992
993         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
994 }
995 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
996
997 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
998                               struct rtnl_link_ops *ops)
999 {
1000         struct net *net = dev_net(itn->fb_tunnel_dev);
1001         struct net_device *dev, *aux;
1002         int h;
1003
1004         for_each_netdev_safe(net, dev, aux)
1005                 if (dev->rtnl_link_ops == ops)
1006                         unregister_netdevice_queue(dev, head);
1007
1008         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1009                 struct ip_tunnel *t;
1010                 struct hlist_node *n;
1011                 struct hlist_head *thead = &itn->tunnels[h];
1012
1013                 hlist_for_each_entry_safe(t, n, thead, hash_node)
1014                         /* If dev is in the same netns, it has already
1015                          * been added to the list by the previous loop.
1016                          */
1017                         if (!net_eq(dev_net(t->dev), net))
1018                                 unregister_netdevice_queue(t->dev, head);
1019         }
1020 }
1021
1022 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1023 {
1024         LIST_HEAD(list);
1025
1026         rtnl_lock();
1027         ip_tunnel_destroy(itn, &list, ops);
1028         unregister_netdevice_many(&list);
1029         rtnl_unlock();
1030 }
1031 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1032
1033 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1034                       struct ip_tunnel_parm *p)
1035 {
1036         struct ip_tunnel *nt;
1037         struct net *net = dev_net(dev);
1038         struct ip_tunnel_net *itn;
1039         int mtu;
1040         int err;
1041
1042         nt = netdev_priv(dev);
1043         itn = net_generic(net, nt->ip_tnl_net_id);
1044
1045         if (ip_tunnel_find(itn, p, dev->type))
1046                 return -EEXIST;
1047
1048         nt->net = net;
1049         nt->parms = *p;
1050         err = register_netdevice(dev);
1051         if (err)
1052                 goto out;
1053
1054         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1055                 eth_hw_addr_random(dev);
1056
1057         mtu = ip_tunnel_bind_dev(dev);
1058         if (!tb[IFLA_MTU])
1059                 dev->mtu = mtu;
1060
1061         ip_tunnel_add(itn, nt);
1062
1063 out:
1064         return err;
1065 }
1066 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1067
1068 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1069                          struct ip_tunnel_parm *p)
1070 {
1071         struct ip_tunnel *t;
1072         struct ip_tunnel *tunnel = netdev_priv(dev);
1073         struct net *net = tunnel->net;
1074         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1075
1076         if (dev == itn->fb_tunnel_dev)
1077                 return -EINVAL;
1078
1079         t = ip_tunnel_find(itn, p, dev->type);
1080
1081         if (t) {
1082                 if (t->dev != dev)
1083                         return -EEXIST;
1084         } else {
1085                 t = tunnel;
1086
1087                 if (dev->type != ARPHRD_ETHER) {
1088                         unsigned int nflags = 0;
1089
1090                         if (ipv4_is_multicast(p->iph.daddr))
1091                                 nflags = IFF_BROADCAST;
1092                         else if (p->iph.daddr)
1093                                 nflags = IFF_POINTOPOINT;
1094
1095                         if ((dev->flags ^ nflags) &
1096                             (IFF_POINTOPOINT | IFF_BROADCAST))
1097                                 return -EINVAL;
1098                 }
1099         }
1100
1101         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1102         return 0;
1103 }
1104 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1105
1106 int ip_tunnel_init(struct net_device *dev)
1107 {
1108         struct ip_tunnel *tunnel = netdev_priv(dev);
1109         struct iphdr *iph = &tunnel->parms.iph;
1110         int err;
1111
1112         dev->destructor = ip_tunnel_dev_free;
1113         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1114         if (!dev->tstats)
1115                 return -ENOMEM;
1116
1117         tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1118         if (!tunnel->dst_cache) {
1119                 free_percpu(dev->tstats);
1120                 return -ENOMEM;
1121         }
1122
1123         err = gro_cells_init(&tunnel->gro_cells, dev);
1124         if (err) {
1125                 free_percpu(tunnel->dst_cache);
1126                 free_percpu(dev->tstats);
1127                 return err;
1128         }
1129
1130         tunnel->dev = dev;
1131         tunnel->net = dev_net(dev);
1132         strcpy(tunnel->parms.name, dev->name);
1133         iph->version            = 4;
1134         iph->ihl                = 5;
1135
1136         return 0;
1137 }
1138 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1139
1140 void ip_tunnel_uninit(struct net_device *dev)
1141 {
1142         struct ip_tunnel *tunnel = netdev_priv(dev);
1143         struct net *net = tunnel->net;
1144         struct ip_tunnel_net *itn;
1145
1146         itn = net_generic(net, tunnel->ip_tnl_net_id);
1147         /* fb_tunnel_dev will be unregisted in net-exit call. */
1148         if (itn->fb_tunnel_dev != dev)
1149                 ip_tunnel_del(netdev_priv(dev));
1150
1151         ip_tunnel_dst_reset_all(tunnel);
1152 }
1153 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1154
1155 /* Do least required initialization, rest of init is done in tunnel_init call */
1156 void ip_tunnel_setup(struct net_device *dev, int net_id)
1157 {
1158         struct ip_tunnel *tunnel = netdev_priv(dev);
1159         tunnel->ip_tnl_net_id = net_id;
1160 }
1161 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1162
1163 MODULE_LICENSE("GPL");