2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
/*
 * Forward declaration: ip6_fragment() is defined further down in this file
 * but is already needed by ip6_finish_output().
 */
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * __ip6_local_out - fill in the IPv6 payload length and run the LOCAL_OUT hook.
 * @skb: outgoing packet; its IPv6 header and dst are dereferenced here
 *
 * The payload length is skb->len minus the fixed IPv6 header.  It is checked
 * against IPV6_MAXPLEN and stored in the header in network byte order.
 * NOTE(review): the statement executed when the length exceeds IPV6_MAXPLEN
 * is not visible here - confirm which value ends up in payload_len.
 *
 * Returns the result of nf_hook() for NF_INET_LOCAL_OUT, called with
 * dst_output() as the continuation and the route's device as output device.
 */
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * ip6_local_out - hand a locally generated packet to the output path.
 * @skb: outgoing packet
 *
 * Runs __ip6_local_out() and then, depending on its result, dst_output().
 * NOTE(review): the condition guarding the dst_output() call and the return
 * statement are not visible here.
 *
 * The symbol is exported to GPL modules.
 */
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
86 /* dev_loopback_xmit for use with netfilter. */
/*
 * ip6_dev_loopback_xmit - prepare a cloned packet for local re-injection.
 * @newskb: clone handed over by the POST_ROUTING hook in ip6_finish_output2()
 *
 * Resets the MAC header, pulls the data pointer up to the network header,
 * marks the packet as PACKET_LOOPBACK with CHECKSUM_UNNECESSARY and warns if
 * the clone carries no dst.
 * NOTE(review): the call that actually delivers the packet and the return
 * statement are not visible here.
 */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
/*
 * ip6_finish_output2 - last step of IPv6 output: multicast loopback and
 * hand-off to the neighbour layer.
 * @skb: packet with a dst attached
 *
 * Marks the skb as ETH_P_IPV6.  For a multicast destination, a clone of the
 * packet is passed through the POST_ROUTING hook to ip6_dev_loopback_xmit()
 * when the device is not a loopback device, the socket allows multicast
 * loopback, and either a multicast-routing socket exists for a packet that
 * is not flagged IP6SKB_FORWARDED or ipv6_chk_mcast_addr() matches the
 * destination/source pair on this device.  A hop limit of 0 on a multicast
 * packet bumps OUTDISCARDS; OUTMCAST statistics are updated for multicast
 * output.
 *
 * The packet is then handed to neigh_output() using the dst's neighbour
 * entry.  OUTNOROUTES is counted on the remaining path.
 * NOTE(review): the branch structure, the freeing of the skb and the return
 * statements are not visible here - confirm against the full function.
 */
99 static int ip6_finish_output2(struct sk_buff *skb)
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
105 skb->protocol = htons(ETH_P_IPV6);
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
139 neigh = dst_get_neighbour(dst);
141 int res = neigh_output(neigh, skb);
147 IP6_INC_STATS_BH(dev_net(dst->dev),
148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_finish_output - fragment if needed, then transmit.
 * @skb: packet to send
 *
 * The packet goes through ip6_fragment() (with ip6_finish_output2() as the
 * per-fragment output function) when it is larger than the dst MTU and is
 * not GSO, or when dst_allfrag() is set on the dst.  Otherwise it is passed
 * straight to ip6_finish_output2().
 */
153 static int ip6_finish_output(struct sk_buff *skb)
155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 dst_allfrag(skb_dst(skb)))
157 return ip6_fragment(skb, ip6_finish_output2);
159 return ip6_finish_output2(skb);
/*
 * ip6_output - POST_ROUTING entry point for outgoing IPv6 packets.
 * @skb: packet to send; the output device is taken from its dst
 *
 * Counts OUTDISCARDS when IPv6 is disabled on the outgoing interface
 * (idev->cnf.disable_ipv6).  Otherwise the packet is given to
 * NF_HOOK_COND() for NF_INET_POST_ROUTING, with the condition that the
 * packet is not flagged IP6SKB_REROUTED.
 * NOTE(review): the discard/return statements and the hook's continuation
 * function are not visible here.
 */
162 int ip6_output(struct sk_buff *skb)
164 struct net_device *dev = skb_dst(skb)->dev;
165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 if (unlikely(idev->cnf.disable_ipv6)) {
167 IP6_INC_STATS(dev_net(dev), idev,
168 IPSTATS_MIB_OUTDISCARDS);
173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
/*
 * ip6_xmit - build the IPv6 header for a transport-layer skb and send it.
 * @sk: sending socket (source of priority, mark and, optionally, hop limit)
 * @skb: packet holding the transport payload, with a dst attached
 * @fl6: flow supplying addresses, next-header protocol and flow label
 * @opt: extension headers to push in front of the payload
 * @tclass: traffic class, shifted left by 20 into the first header word
 *
 * Headroom needed for the options is opt_nflen + opt_flen plus the IPv6
 * header and the link-layer reserve; the skb is reallocated with
 * skb_realloc_headroom() if it has less, and OUTDISCARDS is counted on the
 * failure path.  Fragmentable and non-fragmentable options are pushed, then
 * the IPv6 header itself is pushed and filled in: version/class/flow label,
 * payload length, next header, hop limit (np->hop_limit or
 * ip6_dst_hoplimit()) and the two addresses.  The destination written is
 * first_hop, which ipv6_push_nfrag_opts() may update.
 *
 * Packets that fit the MTU, have local_df set, or are GSO go through the
 * NF_INET_LOCAL_OUT hook to dst_output().  Otherwise ICMPV6_PKT_TOOBIG is
 * sent for the packet and FRAGFAILS is counted.
 * NOTE(review): several declarations, the guards around the option handling
 * (presumably a NULL check on @opt) and the return statements are not
 * visible here; this description covers the visible statements only.
 *
 * The symbol is exported.
 */
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 struct ipv6_txoptions *opt, int tclass)
185 struct net *net = sock_net(sk);
186 struct ipv6_pinfo *np = inet6_sk(sk);
187 struct in6_addr *first_hop = &fl6->daddr;
188 struct dst_entry *dst = skb_dst(skb);
190 u8 proto = fl6->flowi6_proto;
191 int seg_len = skb->len;
196 unsigned int head_room;
198 /* First: exthdrs may take lots of space (~8K for now)
199 MAX_HEADER is not enough.
201 head_room = opt->opt_nflen + opt->opt_flen;
202 seg_len += head_room;
203 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
205 if (skb_headroom(skb) < head_room) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
215 skb_set_owner_w(skb, sk);
218 ipv6_push_frag_opts(skb, opt, &proto);
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
223 skb_push(skb, sizeof(struct ipv6hdr));
224 skb_reset_network_header(skb);
228 * Fill in the IPv6 header
231 hlimit = np->hop_limit;
233 hlimit = ip6_dst_hoplimit(dst);
235 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
237 hdr->payload_len = htons(seg_len);
238 hdr->nexthdr = proto;
239 hdr->hop_limit = hlimit;
241 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
242 ipv6_addr_copy(&hdr->daddr, first_hop);
244 skb->priority = sk->sk_priority;
245 skb->mark = sk->sk_mark;
248 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 IPSTATS_MIB_OUT, skb->len);
251 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 dst->dev, dst_output);
256 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
258 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
264 EXPORT_SYMBOL(ip6_xmit);
267 * To avoid extra problems ND packets are sent through this
268 * routine. It's code duplication but I really want to avoid
269 * extra checks since ipv6_build_header is used by TCP (which
270 * is for us performance critical)
/*
 * ip6_nd_hdr - append a bare IPv6 header for neighbour discovery packets.
 * @sk: socket supplying the hop limit (np->hop_limit)
 * @skb: packet under construction
 * @dev: output device
 * @saddr: source address copied into the header
 * @daddr: destination address copied into the header
 *
 * NOTE(review): the tail of the parameter list - the values used below as
 * proto and len - is not visible here.
 *
 * Sets the skb protocol to ETH_P_IPV6, resets the network header to the
 * current data position, reserves room for the header with skb_put() and
 * fills it in: version 6 with zero traffic class and flow label, the payload
 * length, the next header, the socket's hop limit and both addresses.
 */
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274 const struct in6_addr *saddr, const struct in6_addr *daddr,
277 struct ipv6_pinfo *np = inet6_sk(sk);
280 skb->protocol = htons(ETH_P_IPV6);
283 skb_reset_network_header(skb);
284 skb_put(skb, sizeof(struct ipv6hdr));
287 *(__be32*)hdr = htonl(0x60000000);
289 hdr->payload_len = htons(len);
290 hdr->nexthdr = proto;
291 hdr->hop_limit = np->hop_limit;
293 ipv6_addr_copy(&hdr->saddr, saddr);
294 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to matching raw sockets.
 * @skb: received packet
 * @sel: Router Alert value compared with each chain entry's selector
 *
 * Walks ip6_ra_chain under the ip6_ra_lock read lock.  An entry matches when
 * it has a socket, its selector equals @sel and the socket is either not
 * bound to a device or bound to the receiving interface.  Deliveries go
 * through rawv6_rcv(): clones of the packet inside the loop, the original
 * skb in the delivery after the loop.
 * NOTE(review): the bookkeeping of 'last' and the return values are not
 * visible here.
 */
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
301 struct ip6_ra_chain *ra;
302 struct sock *last = NULL;
304 read_lock(&ip6_ra_lock);
305 for (ra = ip6_ra_chain; ra; ra = ra->next) {
306 struct sock *sk = ra->sk;
307 if (sk && ra->sel == sel &&
308 (!sk->sk_bound_dev_if ||
309 sk->sk_bound_dev_if == skb->dev->ifindex)) {
311 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
313 rawv6_rcv(last, skb2);
320 rawv6_rcv(last, skb);
321 read_unlock(&ip6_ra_lock);
324 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - classify a packet addressed to a proxied
 * neighbour.
 * @skb: packet being forwarded
 *
 * Skips extension headers with ipv6_skip_exthdr() to find the upper-layer
 * protocol.  For ICMPv6 it makes sure the first byte of the ICMPv6 header
 * can be pulled into the linear area and then singles out the four NDISC
 * message types (router/neighbour solicitation and advertisement).  A
 * link-local destination triggers dst_link_failure().
 * NOTE(review): the return value of each case is not visible here.  The
 * caller, ip6_forward(), counts INDISCARDS for a negative result.
 */
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
330 struct ipv6hdr *hdr = ipv6_hdr(skb);
331 u8 nexthdr = hdr->nexthdr;
334 if (ipv6_ext_hdr(nexthdr)) {
335 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
339 offset = sizeof(struct ipv6hdr);
341 if (nexthdr == IPPROTO_ICMPV6) {
342 struct icmp6hdr *icmp6;
344 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 offset + 1 - skb->data)))
348 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350 switch (icmp6->icmp6_type) {
351 case NDISC_ROUTER_SOLICITATION:
352 case NDISC_ROUTER_ADVERTISEMENT:
353 case NDISC_NEIGHBOUR_SOLICITATION:
354 case NDISC_NEIGHBOUR_ADVERTISEMENT:
356 /* For reaction involving unicast neighbor discovery
357 * message destined to the proxied address, pass it to
367 * The proxying router can't forward traffic sent to a link-local
368 * address, so signal the sender and discard the packet. This
369 * behavior is clarified by the MIPv6 specification.
371 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 dst_link_failure(skb);
/*
 * ip6_forward_finish - thin wrapper that sends a packet on via dst_output().
 * Presumably the continuation of the NF_INET_FORWARD hook in ip6_forward();
 * the line naming the continuation is not visible here - confirm.
 */
379 static inline int ip6_forward_finish(struct sk_buff *skb)
381 return dst_output(skb);
/*
 * ip6_forward - forward a received IPv6 packet towards its next hop.
 * @skb: packet with a dst already attached
 *
 * Visible checks, in order:
 *  - forwarding must be enabled (devconf_all->forwarding);
 *  - skb_warn_if_lro() is consulted;
 *  - the XFRM forward policy must accept the packet (INDISCARDS otherwise);
 *  - the packet type is compared with PACKET_HOST;
 *  - Router Alert packets are offered to ip6_call_ra_chain(), keyed by the
 *    16-bit big-endian value at bytes 2-3 of the option at offset opt->ra;
 *  - a hop limit <= 1 yields ICMPV6_TIME_EXCEED and INHDRERRORS;
 *  - with proxy_ndp enabled and a proxy neighbour entry for the destination,
 *    ip6_forward_proxy_check() can divert the packet to ip6_input() or, on a
 *    negative result, lead to INDISCARDS;
 *  - xfrm6_route_forward() must succeed (INDISCARDS otherwise);
 *  - if the packet would leave on the interface it arrived on, has a
 *    neighbour entry, is not source-routed and has no security path, a
 *    redirect is sent, rate-limited via inet_peer_xrlim_allow();
 *    otherwise the source address type is examined: unspecified, multicast
 *    and loopback sources are singled out, and link-local sources get
 *    ICMPV6_DEST_UNREACH / ICMPV6_NOT_NEIGHBOUR;
 *  - the MTU is checked: oversized non-GSO packets get ICMPV6_PKT_TOOBIG
 *    and bump INTOOBIGERRORS and FRAGFAILS;
 *  - skb_cow() is called with the output device's hard_header_len
 *    (OUTDISCARDS on failure).
 *
 * On the success path OUTFORWDATAGRAMS is counted and the packet is passed
 * to the NF_INET_FORWARD hook.  INADDRERRORS is counted on an error path.
 * NOTE(review): jump targets, the hop-limit decrement, the hook's
 * continuation and the return statements are not visible here - confirm
 * against the full function.
 */
384 int ip6_forward(struct sk_buff *skb)
386 struct dst_entry *dst = skb_dst(skb);
387 struct ipv6hdr *hdr = ipv6_hdr(skb);
388 struct inet6_skb_parm *opt = IP6CB(skb);
389 struct net *net = dev_net(dst->dev);
393 if (net->ipv6.devconf_all->forwarding == 0)
396 if (skb_warn_if_lro(skb))
399 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
404 if (skb->pkt_type != PACKET_HOST)
407 skb_forward_csum(skb);
410 * We DO NOT make any processing on
411 * RA packets, pushing them to user level AS IS
412 * without ane WARRANTY that application will be able
413 * to interpret them. The reason is that we
414 * cannot make anything clever here.
416 * We are not end-node, so that if packet contains
417 * AH/ESP, we cannot make anything.
418 * Defragmentation also would be mistake, RA packets
419 * cannot be fragmented, because there is no warranty
420 * that different fragments will go along one path. --ANK
423 u8 *ptr = skb_network_header(skb) + opt->ra;
424 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
429 * check and decrement ttl
431 if (hdr->hop_limit <= 1) {
432 /* Force OUTPUT device used as source address */
434 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435 IP6_INC_STATS_BH(net,
436 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
442 /* XXX: idev->cnf.proxy_ndp? */
443 if (net->ipv6.devconf_all->proxy_ndp &&
444 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445 int proxied = ip6_forward_proxy_check(skb);
447 return ip6_input(skb);
448 else if (proxied < 0) {
449 IP6_INC_STATS(net, ip6_dst_idev(dst),
450 IPSTATS_MIB_INDISCARDS);
455 if (!xfrm6_route_forward(skb)) {
456 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
461 /* IPv6 specs say nothing about it, but it is clear that we cannot
462 send redirects to source routed frames.
463 We don't send redirects to frames decapsulated from IPsec.
465 n = dst_get_neighbour(dst);
466 if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
467 struct in6_addr *target = NULL;
471 * incoming and outgoing devices are the same
475 rt = (struct rt6_info *) dst;
476 if ((rt->rt6i_flags & RTF_GATEWAY))
477 target = (struct in6_addr*)&n->primary_key;
479 target = &hdr->daddr;
482 rt6_bind_peer(rt, 1);
484 /* Limit redirects both by destination (here)
485 and by source (inside ndisc_send_redirect)
487 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
488 ndisc_send_redirect(skb, n, target);
490 int addrtype = ipv6_addr_type(&hdr->saddr);
492 /* This check is security critical. */
493 if (addrtype == IPV6_ADDR_ANY ||
494 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
496 if (addrtype & IPV6_ADDR_LINKLOCAL) {
497 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
498 ICMPV6_NOT_NEIGHBOUR, 0);
504 if (mtu < IPV6_MIN_MTU)
507 if (skb->len > mtu && !skb_is_gso(skb)) {
508 /* Again, force OUTPUT device used as source address */
510 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513 IP6_INC_STATS_BH(net,
514 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
519 if (skb_cow(skb, dst->dev->hard_header_len)) {
520 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
526 /* Mangling hops number delayed to point after skb COW */
530 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
535 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - propagate per-packet metadata to a new fragment.
 * @to: fragment being set up
 * @from: packet the fragment is derived from
 *
 * Copies packet type, priority, protocol and mark, gives @to its own
 * reference on @from's dst (dst_clone()), and copies the security mark.
 * tc_index is copied under CONFIG_NET_SCHED and nf_trace when the xtables
 * TRACE target is built in or as a module.
 * NOTE(review): other statements of this function may be missing from view.
 */
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 to->pkt_type = from->pkt_type;
544 to->priority = from->priority;
545 to->protocol = from->protocol;
547 skb_dst_set(to, dst_clone(skb_dst(from)));
549 to->mark = from->mark;
551 #ifdef CONFIG_NET_SCHED
552 to->tc_index = from->tc_index;
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557 to->nf_trace = from->nf_trace;
559 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension headers in front of the point
 * where a fragment header belongs.
 * @skb: packet to inspect
 * @nexthdr: out parameter; set to the address of a "next header" byte,
 *           first the one in the IPv6 header, then the one of each
 *           extension header stepped over
 *
 * Starts right after the fixed IPv6 header and iterates while the offset
 * stays within the bytes between the network header and skb->tail,
 * advancing by ipv6_optlen() of the current header.  NEXTHDR_ROUTING has a
 * dedicated case, and with Mobile IPv6 configured ipv6_find_tlv() is used to
 * look for a Home Address option.
 * NOTE(review): the switch statement, the cases that end the walk and the
 * return statements are not visible here.  ip6_fragment() uses the return
 * value as hlen, the length of the headers replicated in every fragment.
 */
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
564 u16 offset = sizeof(struct ipv6hdr);
565 struct ipv6_opt_hdr *exthdr =
566 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567 unsigned int packet_len = skb->tail - skb->network_header;
569 *nexthdr = &ipv6_hdr(skb)->nexthdr;
571 while (offset + 1 <= packet_len) {
577 case NEXTHDR_ROUTING:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
592 offset += ipv6_optlen(exthdr);
593 *nexthdr = &exthdr->nexthdr;
594 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ipv6_select_ident - choose the identification value for a fragment header.
 * @fhdr: fragment header whose identification field is filled in
 * @rt: route of the packet; rt6_bind_peer() is used to obtain its inet_peer
 *
 * On one path the id comes from inet_getid() on the route's peer; on the
 * other a file-local atomic counter is advanced with an atomic_cmpxchg()
 * retry loop.  Either way the value is stored with htonl().
 * NOTE(review): the conditions selecting between the two paths and the
 * computation of the new counter value are not visible here.
 */
601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
603 static atomic_t ipv6_fragmentation_id;
607 struct inet_peer *peer;
610 rt6_bind_peer(rt, 1);
611 peer = rt->rt6i_peer;
613 fhdr->identification = htonl(inet_getid(peer, 0));
618 old = atomic_read(&ipv6_fragmentation_id);
622 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
623 fhdr->identification = htonl(new);
/*
 * ip6_fragment - split an IPv6 packet into fragments.
 * @skb: packet to fragment, with a dst attached
 * @output: function intended to transmit each resulting fragment
 *
 * hlen, obtained from ip6_find_1stfragopt(), is the length of the headers
 * replicated in front of every fragment.  If the packet exceeds the MTU and
 * local_df is not set, ICMPV6_PKT_TOOBIG is sent and FRAGFAILS is counted.
 * A socket frag_size smaller than the MTU is checked for, and the
 * per-fragment payload budget is the MTU minus hlen and the fragment header.
 *
 * Fast path: when the skb already carries a frag_list with suitable
 * geometry (each piece fits the budget, is a multiple of 8 bytes unless it
 * is the last, has at least hlen bytes of headroom and is not shared), the
 * existing buffers are reused.  The saved headers (tmp_hdr) and a fragment
 * header are pushed in front of each piece, the offset and More-Fragments
 * flag are set, and every piece carries the identification chosen once by
 * ipv6_select_ident().  If a geometry check fails, the destructor/truesize
 * adjustments already made are undone before the slow path is taken.
 *
 * Slow path: a new skb is allocated per fragment, metadata and headers are
 * copied from the original packet, a fragment header is built and a block
 * of payload is copied with skb_copy_bits().
 *
 * FRAGCREATES, FRAGOKS and FRAGFAILS are updated on the corresponding paths.
 * NOTE(review): loop constructs, labels, the calls to @output, the freeing
 * of buffers and the return statements are not visible here - confirm
 * against the full function.
 */
626 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
628 struct sk_buff *frag;
629 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
630 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
631 struct ipv6hdr *tmp_hdr;
633 unsigned int mtu, hlen, left, len;
635 int ptr, offset = 0, err=0;
636 u8 *prevhdr, nexthdr = 0;
637 struct net *net = dev_net(skb_dst(skb)->dev);
639 hlen = ip6_find_1stfragopt(skb, &prevhdr);
642 mtu = ip6_skb_dst_mtu(skb);
644 /* We must not fragment if the socket is set to force MTU discovery
645 * or if the skb it not generated by a local socket.
647 if (!skb->local_df && skb->len > mtu) {
648 skb->dev = skb_dst(skb)->dev;
649 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
650 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
651 IPSTATS_MIB_FRAGFAILS);
656 if (np && np->frag_size < mtu) {
660 mtu -= hlen + sizeof(struct frag_hdr);
662 if (skb_has_frag_list(skb)) {
663 int first_len = skb_pagelen(skb);
664 struct sk_buff *frag2;
666 if (first_len - hlen > mtu ||
667 ((first_len - hlen) & 7) ||
671 skb_walk_frags(skb, frag) {
672 /* Correct geometry. */
673 if (frag->len > mtu ||
674 ((frag->len & 7) && frag->next) ||
675 skb_headroom(frag) < hlen)
676 goto slow_path_clean;
678 /* Partially cloned skb? */
679 if (skb_shared(frag))
680 goto slow_path_clean;
685 frag->destructor = sock_wfree;
687 skb->truesize -= frag->truesize;
692 frag = skb_shinfo(skb)->frag_list;
693 skb_frag_list_init(skb);
696 *prevhdr = NEXTHDR_FRAGMENT;
697 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
699 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
700 IPSTATS_MIB_FRAGFAILS);
704 __skb_pull(skb, hlen);
705 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
706 __skb_push(skb, hlen);
707 skb_reset_network_header(skb);
708 memcpy(skb_network_header(skb), tmp_hdr, hlen);
710 ipv6_select_ident(fh, rt);
711 fh->nexthdr = nexthdr;
713 fh->frag_off = htons(IP6_MF);
714 frag_id = fh->identification;
716 first_len = skb_pagelen(skb);
717 skb->data_len = first_len - skb_headlen(skb);
718 skb->len = first_len;
719 ipv6_hdr(skb)->payload_len = htons(first_len -
720 sizeof(struct ipv6hdr));
725 /* Prepare header of the next frame,
726 * before previous one went down. */
728 frag->ip_summed = CHECKSUM_NONE;
729 skb_reset_transport_header(frag);
730 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731 __skb_push(frag, hlen);
732 skb_reset_network_header(frag);
733 memcpy(skb_network_header(frag), tmp_hdr,
735 offset += skb->len - hlen - sizeof(struct frag_hdr);
736 fh->nexthdr = nexthdr;
738 fh->frag_off = htons(offset);
739 if (frag->next != NULL)
740 fh->frag_off |= htons(IP6_MF);
741 fh->identification = frag_id;
742 ipv6_hdr(frag)->payload_len =
744 sizeof(struct ipv6hdr));
745 ip6_copy_metadata(frag, skb);
750 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
751 IPSTATS_MIB_FRAGCREATES);
764 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
765 IPSTATS_MIB_FRAGOKS);
766 dst_release(&rt->dst);
776 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
777 IPSTATS_MIB_FRAGFAILS);
778 dst_release(&rt->dst);
782 skb_walk_frags(skb, frag2) {
786 frag2->destructor = NULL;
787 skb->truesize += frag2->truesize;
792 left = skb->len - hlen; /* Space per frame */
793 ptr = hlen; /* Where to start from */
796 * Fragment the datagram.
799 *prevhdr = NEXTHDR_FRAGMENT;
802 * Keep copying data until we run out.
806 /* IF: it doesn't fit, use 'mtu' - the data space left */
809 /* IF: we are not sending up to and including the packet end
810 then align the next start on an eight byte boundary */
818 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
819 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
820 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
821 IPSTATS_MIB_FRAGFAILS);
827 * Set up data on packet
830 ip6_copy_metadata(frag, skb);
831 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
832 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
833 skb_reset_network_header(frag);
834 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
835 frag->transport_header = (frag->network_header + hlen +
836 sizeof(struct frag_hdr));
839 * Charge the memory for the fragment to any owner
843 skb_set_owner_w(frag, skb->sk);
846 * Copy the packet header into the new buffer.
848 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
851 * Build fragment header.
853 fh->nexthdr = nexthdr;
856 ipv6_select_ident(fh, rt);
857 frag_id = fh->identification;
859 fh->identification = frag_id;
862 * Copy a block of the IP datagram.
864 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
868 fh->frag_off = htons(offset);
870 fh->frag_off |= htons(IP6_MF);
871 ipv6_hdr(frag)->payload_len = htons(frag->len -
872 sizeof(struct ipv6hdr));
878 * Put this fragment into the sending queue.
884 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885 IPSTATS_MIB_FRAGCREATES);
887 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888 IPSTATS_MIB_FRAGOKS);
893 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - test whether a cached route key fails to cover an address.
 * @rt_key: destination or source key of the cached route
 * @fl_addr: address taken from the flow being sent
 * @addr_cache: last address used with the route, or NULL
 *
 * Returns zero when the key is a /128 entry equal to @fl_addr, or when
 * @addr_cache is non-NULL and equal to @fl_addr.  Returns non-zero in every
 * other case, i.e. when neither the host-route match nor the cached-address
 * match holds.
 */
899 static inline int ip6_rt_check(const struct rt6key *rt_key,
900 const struct in6_addr *fl_addr,
901 const struct in6_addr *addr_cache)
903 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
904 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * ip6_sk_dst_check - validate a socket's cached dst against a flow.
 * @sk: socket owning the cache (daddr_cache / saddr_cache are consulted)
 * @dst: cached dst entry to validate
 * @fl6: flow about to be sent
 *
 * The cached route is rejected when ip6_rt_check() fails for the destination
 * key, when it fails for the source key (only with CONFIG_IPV6_SUBTREES), or
 * when the flow names an output interface different from the dst's device.
 * NOTE(review): the handling of a NULL @dst, what is done with a rejected
 * entry and the return statement are not visible here.
 */
907 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
908 struct dst_entry *dst,
909 const struct flowi6 *fl6)
911 struct ipv6_pinfo *np = inet6_sk(sk);
912 struct rt6_info *rt = (struct rt6_info *)dst;
917 /* Yes, checking route validity in not connected
918 * case is not very simple. Take into account,
919 * that we do not support routing by source, TOS,
920 * and MSG_DONTROUTE --ANK (980726)
922 * 1. ip6_rt_check(): If route was host route,
923 * check that cached destination is current.
924 * If it is network route, we still may
925 * check its validity using saved pointer
926 * to the last used address: daddr_cache.
927 * We do not want to save whole address now,
928 * (because main consumer of this service
929 * is tcp, which has not this problem),
930 * so that the last trick works only on connected
932 * 2. oif also should be the same.
934 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
935 #ifdef CONFIG_IPV6_SUBTREES
936 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
938 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - common route lookup and source address selection.
 * @sk: socket providing the network namespace and source preferences
 * @dst: in/out pointer to the dst entry; filled via ip6_route_output()
 * @fl6: flow to look up; its saddr may be filled in here
 *
 * Looks the route up with ip6_route_output() and bails out through
 * out_err_release if the dst carries an error.  If the flow has no source
 * address, one is chosen with ip6_route_get_saddr().
 *
 * With CONFIG_IPV6_OPTIMISTIC_DAD: when the dst's neighbour entry is not in
 * a NUD_VALID state and the flow's source address is flagged
 * IFA_F_OPTIMISTIC, a second lookup is made on a copy of the flow with the
 * destination address zeroed, so that the default router's dst is used.
 *
 * IPSTATS_MIB_OUTNOROUTES is counted when the error is -ENETUNREACH.
 * NOTE(review): the guard around the initial lookup, the release of the dst
 * entries and the return statements are not visible here.
 */
947 static int ip6_dst_lookup_tail(struct sock *sk,
948 struct dst_entry **dst, struct flowi6 *fl6)
950 struct net *net = sock_net(sk);
951 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
957 *dst = ip6_route_output(net, sk, fl6);
959 if ((err = (*dst)->error))
960 goto out_err_release;
962 if (ipv6_addr_any(&fl6->saddr)) {
963 struct rt6_info *rt = (struct rt6_info *) *dst;
964 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
965 sk ? inet6_sk(sk)->srcprefs : 0,
968 goto out_err_release;
971 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
973 * Here if the dst entry we've looked up
974 * has a neighbour entry that is in the INCOMPLETE
975 * state and the src address from the flow is
976 * marked as OPTIMISTIC, we release the found
977 * dst entry and replace it instead with the
978 * dst entry of the nexthop router
981 n = dst_get_neighbour(*dst);
982 if (n && !(n->nud_state & NUD_VALID)) {
983 struct inet6_ifaddr *ifp;
984 struct flowi6 fl_gw6;
988 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
991 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
997 * We need to get the dst entry for the
998 * default router instead
1001 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1002 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1003 *dst = ip6_route_output(net, sk, &fl_gw6);
1004 if ((err = (*dst)->error))
1005 goto out_err_release;
1015 if (err == -ENETUNREACH)
1016 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1023 * ip6_dst_lookup - perform route lookup on flow
1024 * @sk: socket which provides route info
1025 * @dst: pointer to dst_entry * for result
1026 * @fl6: flow to lookup
1028 * This function performs a route lookup on the given flow.
1030 * It returns zero on success, or a standard errno code on error.
/*
 * Exported (GPL) entry point: the visible body delegates the lookup to
 * ip6_dst_lookup_tail() and returns its result.
 */
1032 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1035 return ip6_dst_lookup_tail(sk, dst, fl6);
1037 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1040 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1041 * @sk: socket which provides route info
1042 * @fl6: flow to lookup
1043 * @final_dst: final destination address for ipsec lookup
1044 * @can_sleep: we are in a sleepable context
1046 * This function performs a route lookup on the given flow.
1048 * It returns a valid dst pointer on success, or an ERR_PTR()-encoded error code.
/*
 * ip6_dst_lookup_flow - route lookup followed by the XFRM (IPsec) lookup.
 *
 * Runs ip6_dst_lookup_tail(); a failure is returned as ERR_PTR(err).  The
 * flow's destination is then replaced by @final_dst and
 * FLOWI_FLAG_CAN_SLEEP is set before the dst is passed to xfrm_lookup().
 * NOTE(review): the last parameter (named can_sleep in the kernel-doc
 * fragment preceding this function) and the conditions guarding the @final_dst copy
 * and the flag update are not visible here.
 *
 * The symbol is exported to GPL modules.
 */
1051 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1052 const struct in6_addr *final_dst,
1055 struct dst_entry *dst = NULL;
1058 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1060 return ERR_PTR(err);
1062 ipv6_addr_copy(&fl6->daddr, final_dst);
1064 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1066 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1068 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1071 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1072 * @sk: socket which provides the dst cache and route info
1073 * @fl6: flow to lookup
1074 * @final_dst: final destination address for ipsec lookup
1075 * @can_sleep: we are in a sleepable context
1077 * This function performs a route lookup on the given flow with the
1078 * possibility of using the cached route in the socket if it is valid.
1079 * It will take the socket dst lock when operating on the dst cache.
1080 * As a result, this function can only be used in process context.
1082 * It returns a valid dst pointer on success, or an ERR_PTR()-encoded error code.
/*
 * ip6_sk_dst_lookup_flow - like ip6_dst_lookup_flow(), but tries the
 * socket's cached dst first.
 *
 * The cached entry is fetched with sk_dst_check() using the socket's
 * dst_cookie and validated by ip6_sk_dst_check() before
 * ip6_dst_lookup_tail() is called.  A failure is returned as ERR_PTR(err).
 * The flow's destination is then replaced by @final_dst and
 * FLOWI_FLAG_CAN_SLEEP is set before the dst is passed to xfrm_lookup().
 * NOTE(review): the last parameter and the conditions guarding the
 * @final_dst copy and the flag update are not visible here.
 *
 * The symbol is exported to GPL modules.
 */
1085 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1086 const struct in6_addr *final_dst,
1089 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1092 dst = ip6_sk_dst_check(sk, dst, fl6);
1094 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1096 return ERR_PTR(err);
1098 ipv6_addr_copy(&fl6->daddr, final_dst);
1100 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1102 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1104 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * ip6_ufo_append_data - build one large skb for UDP fragmentation offload.
 * @sk: socket whose write queue receives the skb
 * @getfrag: callback used by skb_append_datato_frags() to fetch the data
 * @from: opaque source handed to @getfrag
 * @length: number of bytes to append, including @transhdrlen
 * @hh_len: link-layer headroom to reserve
 * @fragheaderlen: length of the IPv6 header plus non-fragmentable headers
 * @transhdrlen: transport header length
 * @mtu: MTU used to derive the per-fragment size
 * @flags: MSG_* flags; MSG_DONTWAIT is passed on to the allocation
 * @rt: route, used to pick the fragment identification
 *
 * If the write queue is empty a new skb is allocated with room for the
 * link-layer, IPv6 and transport headers; the header pointers are set and
 * the skb is marked CHECKSUM_PARTIAL.  The payload is attached as page
 * fragments.  gso_size is set to the largest multiple of 8 that fits in
 * mtu - fragheaderlen - sizeof(struct frag_hdr), gso_type to SKB_GSO_UDP,
 * and the fragment id from ipv6_select_ident() is stored in the skb's shared
 * info before the skb is queued.
 * NOTE(review): error handling and the return statements are not visible
 * here.
 */
1106 static inline int ip6_ufo_append_data(struct sock *sk,
1107 int getfrag(void *from, char *to, int offset, int len,
1108 int odd, struct sk_buff *skb),
1109 void *from, int length, int hh_len, int fragheaderlen,
1110 int transhdrlen, int mtu,unsigned int flags,
1111 struct rt6_info *rt)
1114 struct sk_buff *skb;
1117 /* There is support for UDP large send offload by network
1118 * device, so create one single skb packet containing complete
1121 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1122 skb = sock_alloc_send_skb(sk,
1123 hh_len + fragheaderlen + transhdrlen + 20,
1124 (flags & MSG_DONTWAIT), &err);
1128 /* reserve space for Hardware header */
1129 skb_reserve(skb, hh_len);
1131 /* create space for UDP/IP header */
1132 skb_put(skb,fragheaderlen + transhdrlen);
1134 /* initialize network header pointer */
1135 skb_reset_network_header(skb);
1137 /* initialize protocol header pointer */
1138 skb->transport_header = skb->network_header + fragheaderlen;
1140 skb->ip_summed = CHECKSUM_PARTIAL;
1144 err = skb_append_datato_frags(sk,skb, getfrag, from,
1145 (length - transhdrlen));
1147 struct frag_hdr fhdr;
1149 /* Specify the length of each IPv6 datagram fragment.
1150 * It has to be a multiple of 8.
1152 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1153 sizeof(struct frag_hdr)) & ~7;
1154 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1155 ipv6_select_ident(&fhdr, rt);
1156 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1157 __skb_queue_tail(&sk->sk_write_queue, skb);
1161 /* There is not enough support do UPD LSO,
1162 * so follow normal path
/*
 * ip6_opt_dup - duplicate an options extension header, or return NULL for a
 * NULL @src.  The copy is (hdrlen + 1) * 8 bytes long, i.e. hdrlen counts
 * 8-octet units beyond the first.  The result comes from kmemdup(), so it
 * can also be NULL on allocation failure and must be released with kfree().
 * NOTE(review): the second parameter (used below as gfp) is not visible.
 */
1169 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1172 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_rthdr_dup - duplicate a routing header, or return NULL for a NULL
 * @src.  Same sizing rule as ip6_opt_dup(): (hdrlen + 1) * 8 bytes are
 * copied with kmemdup(), which can return NULL on allocation failure.
 * NOTE(review): the second parameter (used below as gfp) is not visible.
 */
1175 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1178 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data - queue data on the socket's write queue as a chain of
 * skbs sized so they can later be sent as IPv6 fragments.
 * @sk: socket whose sk_write_queue collects the pending packet
 * @getfrag: callback copying @len bytes at @offset from @from into @to;
 *           a negative return value is treated as failure
 * @from: opaque source handed to @getfrag
 * @length: number of bytes to append
 * @transhdrlen: transport header length
 * @hlimit: hop limit, recorded in np->cork on the first call
 * @tclass: traffic class, recorded in np->cork on the first call
 * @opt: extension headers; deep-copied into np->cork.opt on the first call
 * @fl6: flow; copied into the cork on the first call
 * @rt: route; stored as the cork's dst on the first call
 * @flags: MSG_* flags (MSG_PROBE, MSG_MORE and MSG_DONTWAIT are examined)
 * @dontfrag: examined together with the socket protocol (UDP or RAW); in
 *            that case the MTU is reported through ipv6_local_rxpmtu()
 *
 * First call (empty write queue): the options are duplicated piece by piece
 * with ip6_opt_dup() / ip6_rthdr_dup(), and the route, flow, hop limit and
 * traffic class are saved.  The MTU comes from the device when the socket
 * uses IPV6_PMTUDISC_PROBE, else from the dst; it is lowered to
 * np->frag_size when that is smaller, and stored as cork->fragsize.
 * IPCORK_ALLFRAG is set when dst_allfrag() holds for the route's path.
 * Later calls reuse the route, flow and fragment size saved in the cork.
 *
 * maxfraglen is the largest on-wire length of a non-final fragment:
 * the fragmentable part rounded down to a multiple of 8, plus the
 * unfragmentable headers, minus room for the fragment header.  A total
 * length beyond what an IPv6 payload length can express is reported with
 * ipv6_local_error(EMSGSIZE).
 *
 * For UDP on a device advertising NETIF_F_UFO the work is delegated to
 * ip6_ufo_append_data().  Otherwise the main loop fills the tail skb up to
 * the MTU (or maxfraglen when fragmentation is needed or forced), allocates
 * a new skb when the current one is full, moves the bytes beyond maxfraglen
 * (fraggap) from the previous skb into the new one while fixing up both
 * checksums, and copies user data either into the linear area or, on
 * devices with NETIF_F_SG, into page fragments tracked by
 * sk->sk_sndmsg_page / sk->sk_sndmsg_off.
 *
 * On the error path cork->length is rolled back by the remaining length and
 * OUTDISCARDS is counted.
 * NOTE(review): many declarations, guards, labels and return statements are
 * not visible here - confirm details against the full function.
 */
1181 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1182 int offset, int len, int odd, struct sk_buff *skb),
1183 void *from, int length, int transhdrlen,
1184 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1185 struct rt6_info *rt, unsigned int flags, int dontfrag)
1187 struct inet_sock *inet = inet_sk(sk);
1188 struct ipv6_pinfo *np = inet6_sk(sk);
1189 struct inet_cork *cork;
1190 struct sk_buff *skb;
1191 unsigned int maxfraglen, fragheaderlen;
1199 int csummode = CHECKSUM_NONE;
1202 if (flags&MSG_PROBE)
1204 cork = &inet->cork.base;
1205 if (skb_queue_empty(&sk->sk_write_queue)) {
1210 if (WARN_ON(np->cork.opt))
1213 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1214 if (unlikely(np->cork.opt == NULL))
1217 np->cork.opt->tot_len = opt->tot_len;
1218 np->cork.opt->opt_flen = opt->opt_flen;
1219 np->cork.opt->opt_nflen = opt->opt_nflen;
1221 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1223 if (opt->dst0opt && !np->cork.opt->dst0opt)
1226 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1228 if (opt->dst1opt && !np->cork.opt->dst1opt)
1231 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1233 if (opt->hopopt && !np->cork.opt->hopopt)
1236 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1238 if (opt->srcrt && !np->cork.opt->srcrt)
1241 /* need source address above miyazawa*/
1244 cork->dst = &rt->dst;
1245 inet->cork.fl.u.ip6 = *fl6;
1246 np->cork.hop_limit = hlimit;
1247 np->cork.tclass = tclass;
1248 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1249 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1250 if (np->frag_size < mtu) {
1252 mtu = np->frag_size;
1254 cork->fragsize = mtu;
1255 if (dst_allfrag(rt->dst.path))
1256 cork->flags |= IPCORK_ALLFRAG;
1258 sk->sk_sndmsg_page = NULL;
1259 sk->sk_sndmsg_off = 0;
1260 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1261 length += exthdrlen;
1262 transhdrlen += exthdrlen;
1263 dst_exthdrlen = rt->dst.header_len;
1265 rt = (struct rt6_info *)cork->dst;
1266 fl6 = &inet->cork.fl.u.ip6;
1271 mtu = cork->fragsize;
1274 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1276 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1277 (opt ? opt->opt_nflen : 0);
1278 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1280 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1281 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1282 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1287 /* For UDP, check if TX timestamp is enabled */
1288 if (sk->sk_type == SOCK_DGRAM) {
1289 err = sock_tx_timestamp(sk, &tx_flags);
1295 * Let's try using as much space as possible.
1296 * Use MTU if total length of the message fits into the MTU.
1297 * Otherwise, we need to reserve fragment header and
1298 * fragment alignment (= 8-15 octects, in total).
1300 * Note that we may need to "move" the data from the tail of
1301 * of the buffer to the new fragment when we split
1304 * FIXME: It may be fragmented into multiple chunks
1305 * at once if non-fragmentable extension headers
1310 cork->length += length;
1312 int proto = sk->sk_protocol;
1313 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1314 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1318 if (proto == IPPROTO_UDP &&
1319 (rt->dst.dev->features & NETIF_F_UFO)) {
1321 err = ip6_ufo_append_data(sk, getfrag, from, length,
1322 hh_len, fragheaderlen,
1323 transhdrlen, mtu, flags, rt);
1330 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1333 while (length > 0) {
1334 /* Check if the remaining data fits into current packet. */
1335 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1337 copy = maxfraglen - skb->len;
1341 unsigned int datalen;
1342 unsigned int fraglen;
1343 unsigned int fraggap;
1344 unsigned int alloclen;
1345 struct sk_buff *skb_prev;
1349 /* There's no room in the current skb */
1351 fraggap = skb_prev->len - maxfraglen;
1356 * If remaining data exceeds the mtu,
1357 * we know we need more fragment(s).
1359 datalen = length + fraggap;
1360 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1361 datalen = maxfraglen - fragheaderlen;
1363 fraglen = datalen + fragheaderlen;
1364 if ((flags & MSG_MORE) &&
1365 !(rt->dst.dev->features&NETIF_F_SG))
1368 alloclen = datalen + fragheaderlen;
1370 alloclen += dst_exthdrlen;
1373 * The last fragment gets additional space at tail.
1374 * Note: we overallocate on fragments with MSG_MODE
1375 * because we have no idea if we're the last one.
1377 if (datalen == length + fraggap)
1378 alloclen += rt->dst.trailer_len;
1381 * We just reserve space for fragment header.
1382 * Note: this may be overallocation if the message
1383 * (without MSG_MORE) fits into the MTU.
1385 alloclen += sizeof(struct frag_hdr);
1388 skb = sock_alloc_send_skb(sk,
1390 (flags & MSG_DONTWAIT), &err);
1393 if (atomic_read(&sk->sk_wmem_alloc) <=
1395 skb = sock_wmalloc(sk,
1396 alloclen + hh_len, 1,
1398 if (unlikely(skb == NULL))
1401 /* Only the initial fragment
1410 * Fill in the control structures
1412 skb->ip_summed = csummode;
1414 /* reserve for fragmentation */
1415 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1417 if (sk->sk_type == SOCK_DGRAM)
1418 skb_shinfo(skb)->tx_flags = tx_flags;
1421 * Find where to start putting bytes
1423 data = skb_put(skb, fraglen + dst_exthdrlen);
1424 skb_set_network_header(skb, exthdrlen + dst_exthdrlen);
1425 data += fragheaderlen + dst_exthdrlen;
1426 skb->transport_header = (skb->network_header +
1429 skb->csum = skb_copy_and_csum_bits(
1430 skb_prev, maxfraglen,
1431 data + transhdrlen, fraggap, 0);
1432 skb_prev->csum = csum_sub(skb_prev->csum,
1435 pskb_trim_unique(skb_prev, maxfraglen);
1437 copy = datalen - transhdrlen - fraggap;
1443 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1450 length -= datalen - fraggap;
1454 csummode = CHECKSUM_NONE;
1457 * Put the packet on the pending queue
1459 __skb_queue_tail(&sk->sk_write_queue, skb);
1466 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1470 if (getfrag(from, skb_put(skb, copy),
1471 offset, copy, off, skb) < 0) {
1472 __skb_trim(skb, off);
1477 int i = skb_shinfo(skb)->nr_frags;
1478 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1479 struct page *page = sk->sk_sndmsg_page;
1480 int off = sk->sk_sndmsg_off;
1483 if (page && (left = PAGE_SIZE - off) > 0) {
1486 if (page != skb_frag_page(frag)) {
1487 if (i == MAX_SKB_FRAGS) {
1491 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1492 skb_frag_ref(skb, i);
1493 frag = &skb_shinfo(skb)->frags[i];
1495 } else if(i < MAX_SKB_FRAGS) {
1496 if (copy > PAGE_SIZE)
1498 page = alloc_pages(sk->sk_allocation, 0);
1503 sk->sk_sndmsg_page = page;
1504 sk->sk_sndmsg_off = 0;
1506 skb_fill_page_desc(skb, i, page, 0, 0);
1507 frag = &skb_shinfo(skb)->frags[i];
1513 skb_frag_address(frag) + skb_frag_size(frag),
1514 offset, copy, skb->len, skb) < 0) {
1518 sk->sk_sndmsg_off += copy;
1519 skb_frag_size_add(frag, copy);
1521 skb->data_len += copy;
1522 skb->truesize += copy;
1523 atomic_add(copy, &sk->sk_wmem_alloc);
1530 cork->length -= length;
1531 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - free the state saved by ip6_append_data().
 * @inet: IPv4-level socket part holding the cork's dst, flags and flow
 * @np: IPv6-level socket part holding the duplicated options
 *
 * Frees the four duplicated option headers and the option block itself and
 * clears np->cork.opt.  If a dst is held it is released, the pointer is
 * cleared and IPCORK_ALLFRAG is dropped.  The saved flow is zeroed.
 * NOTE(review): the guard around the option freeing (presumably a NULL check
 * on np->cork.opt) is not visible here.
 */
1535 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1538 kfree(np->cork.opt->dst0opt);
1539 kfree(np->cork.opt->dst1opt);
1540 kfree(np->cork.opt->hopopt);
1541 kfree(np->cork.opt->srcrt);
1542 kfree(np->cork.opt);
1543 np->cork.opt = NULL;
1546 if (inet->cork.base.dst) {
1547 dst_release(inet->cork.base.dst);
1548 inet->cork.base.dst = NULL;
1549 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1551 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - turn the socket's pending queue into one packet
 * and send it.
 * @sk: socket whose sk_write_queue was filled by ip6_append_data()
 *
 * The first queued skb becomes the head; every following skb has its
 * network header pulled off and is linked onto the head's frag_list, with
 * len, data_len and truesize of the head adjusted and the fragment's
 * destructor cleared.  The head's own network header area is pulled, the
 * corked extension headers are pushed (fragmentable first, then
 * non-fragmentable, which may rewrite the destination used in the header),
 * and the IPv6 header is pushed and filled from the cork: flow label and
 * traffic class, hop limit, next header and addresses.  Priority and mark
 * come from the socket and the skb gets its own reference on the route.
 *
 * OUT statistics are updated, ICMPv6 output counters are bumped for ICMPv6
 * packets, and the packet is sent with ip6_local_out(); the result is
 * mapped through net_xmit_errno().  The cork is released afterwards, and
 * OUTDISCARDS is counted on the error path.
 * NOTE(review): the payload-length assignment, the local_df handling under
 * the "Allow local fragmentation" comment, the conditions around
 * net_xmit_errno() and the return statements are not visible here.
 */
1554 int ip6_push_pending_frames(struct sock *sk)
1556 struct sk_buff *skb, *tmp_skb;
1557 struct sk_buff **tail_skb;
1558 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1559 struct inet_sock *inet = inet_sk(sk);
1560 struct ipv6_pinfo *np = inet6_sk(sk);
1561 struct net *net = sock_net(sk);
1562 struct ipv6hdr *hdr;
1563 struct ipv6_txoptions *opt = np->cork.opt;
1564 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1565 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1566 unsigned char proto = fl6->flowi6_proto;
1569 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1571 tail_skb = &(skb_shinfo(skb)->frag_list);
1573 /* move skb->data to ip header from ext header */
1574 if (skb->data < skb_network_header(skb))
1575 __skb_pull(skb, skb_network_offset(skb));
1576 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1577 __skb_pull(tmp_skb, skb_network_header_len(skb));
1578 *tail_skb = tmp_skb;
1579 tail_skb = &(tmp_skb->next);
1580 skb->len += tmp_skb->len;
1581 skb->data_len += tmp_skb->len;
1582 skb->truesize += tmp_skb->truesize;
1583 tmp_skb->destructor = NULL;
1587 /* Allow local fragmentation. */
1588 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1591 ipv6_addr_copy(final_dst, &fl6->daddr);
1592 __skb_pull(skb, skb_network_header_len(skb));
1593 if (opt && opt->opt_flen)
1594 ipv6_push_frag_opts(skb, opt, &proto);
1595 if (opt && opt->opt_nflen)
1596 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1598 skb_push(skb, sizeof(struct ipv6hdr));
1599 skb_reset_network_header(skb);
1600 hdr = ipv6_hdr(skb);
1602 *(__be32*)hdr = fl6->flowlabel |
1603 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1605 hdr->hop_limit = np->cork.hop_limit;
1606 hdr->nexthdr = proto;
1607 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1608 ipv6_addr_copy(&hdr->daddr, final_dst);
1610 skb->priority = sk->sk_priority;
1611 skb->mark = sk->sk_mark;
1613 skb_dst_set(skb, dst_clone(&rt->dst));
1614 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1615 if (proto == IPPROTO_ICMPV6) {
1616 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1618 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1619 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1622 err = ip6_local_out(skb);
1625 err = net_xmit_errno(err);
1631 ip6_cork_release(inet, np);
1634 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_flush_pending_frames - drop everything queued by ip6_append_data().
 * @sk: socket whose pending write queue is discarded
 *
 * Dequeues skbs from the tail of sk_write_queue until it is empty, counting
 * OUTDISCARDS against the device of a dequeued skb's dst, then releases the
 * cork state with ip6_cork_release().
 * NOTE(review): the guard around the statistics update and the freeing of
 * each skb are not visible here.
 */
1638 void ip6_flush_pending_frames(struct sock *sk)
1640 struct sk_buff *skb;
1642 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1644 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1645 IPSTATS_MIB_OUTDISCARDS);
1649 ip6_cork_release(inet_sk(sk), inet6_sk(sk));