2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetic in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
/*
 * Forward declaration: ip6_finish_output() hands skbs to ip6_fragment()
 * before the function is defined further down in this file.
 */
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * __ip6_local_out - fill in payload_len and run the LOCAL_OUT netfilter hook.
 * @skb: packet with an IPv6 header reachable through ipv6_hdr(skb)
 *
 * The payload length is the skb length minus the fixed IPv6 header size and
 * is stored in the header in network byte order.  The skb is then given to
 * the NFPROTO_IPV6 / NF_INET_LOCAL_OUT hook, with the route's device as the
 * output device and dst_output() as the continuation.  The nf_hook() result
 * is returned unchanged.
 *
 * NOTE(review): the statement controlled by the IPV6_MAXPLEN test is not
 * part of this listing; presumably it adjusts len for oversized payloads -
 * confirm against the complete source.
 */
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * ip6_local_out - send a locally generated IPv6 packet.
 * @skb: packet to transmit
 *
 * Runs __ip6_local_out() and stores its result in err; dst_output() is also
 * called on the skb, with its result overwriting err.  The symbol is
 * exported GPL-only.
 *
 * NOTE(review): the condition that decides whether dst_output() is called
 * and the return statement are on lines missing from this listing;
 * presumably dst_output() only runs when the hook tells the caller to
 * continue - confirm.
 */
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
/*
 * ip6_dev_loopback_xmit - prepare a (cloned) skb for local loopback delivery.
 * @newskb: packet to loop back
 *
 * Used as the continuation of the POST_ROUTING hook in ip6_finish_output2().
 * Resets the MAC header, pulls the data pointer up to the network header,
 * marks the packet PACKET_LOOPBACK with checksum verification unnecessary,
 * and warns if the skb carries no dst entry.
 *
 * NOTE(review): the call that actually queues the skb for reception and the
 * return statement are not visible in this listing.
 */
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
/*
 * ip6_finish_output2 - last IPv6 output step: multicast loopback and
 * hand-off to the neighbour layer.
 * @skb: packet with a dst entry attached
 *
 * Sets skb->protocol to ETH_P_IPV6.  For a multicast destination:
 *  - when the output device is not a loopback device, the sending socket
 *    has multicast loop enabled (sk_mc_loop) and either a multicast routing
 *    socket exists for a packet not flagged IP6SKB_FORWARDED, or
 *    ipv6_chk_mcast_addr() matches the destination/source pair on the
 *    device, a clone of the skb is passed through the POST_ROUTING hook to
 *    ip6_dev_loopback_xmit();
 *  - a hop limit of zero is accounted as IPSTATS_MIB_OUTDISCARDS;
 *  - IPSTATS_MIB_OUTMCAST is updated.
 * The neighbour entry of the dst is then looked up and the packet is sent
 * with neigh_output(); the remaining path accounts IPSTATS_MIB_OUTNOROUTES.
 *
 * NOTE(review): this listing omits several lines (NULL checks on the clone
 * and on the neighbour, the drop/free calls and the return statements), so
 * the exact conditions for the discard and no-route paths should be
 * confirmed against the complete source.
 */
99 static int ip6_finish_output2(struct sk_buff *skb)
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
105 skb->protocol = htons(ETH_P_IPV6);
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
139 neigh = dst_get_neighbour(dst);
141 int res = neigh_output(neigh, skb);
147 IP6_INC_STATS_BH(dev_net(dst->dev),
148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_finish_output - choose between fragmenting and sending directly.
 * @skb: packet with a dst entry attached
 *
 * The packet goes through ip6_fragment() (with ip6_finish_output2() as the
 * per-fragment output function) when it is longer than the dst MTU and is
 * not a GSO packet, or when the dst has the "allfrag" property.  Otherwise
 * it is passed straight to ip6_finish_output2().  Returns the result of
 * whichever function was called.
 */
153 static int ip6_finish_output(struct sk_buff *skb)
155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 dst_allfrag(skb_dst(skb)))
157 return ip6_fragment(skb, ip6_finish_output2);
159 return ip6_finish_output2(skb);
/*
 * ip6_output - dst output entry point for IPv6 packets.
 * @skb: packet with a dst entry attached
 *
 * If IPv6 is disabled on the output interface (idev->cnf.disable_ipv6) the
 * packet is accounted as IPSTATS_MIB_OUTDISCARDS.  Otherwise it is passed to
 * the NF_INET_POST_ROUTING hook through NF_HOOK_COND(), with the condition
 * argument being "packet is not flagged IP6SKB_REROUTED".
 *
 * NOTE(review): the lines that free the skb and return on the disabled
 * path, and the continuation function argument of NF_HOOK_COND(), are not
 * visible in this listing.
 */
162 int ip6_output(struct sk_buff *skb)
164 struct net_device *dev = skb_dst(skb)->dev;
165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 if (unlikely(idev->cnf.disable_ipv6)) {
167 IP6_INC_STATS(dev_net(dev), idev,
168 IPSTATS_MIB_OUTDISCARDS);
173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
/*
 * ip6_xmit - build the IPv6 header for a transport packet and send it.
 * @sk:     sending socket (source of hop limit, priority and mark)
 * @skb:    packet holding the transport header and payload
 * @fl6:    flow describing addresses, protocol and flow label
 * @opt:    extension headers to push in front of the payload
 * @tclass: traffic class placed in the first header word
 *
 * Visible steps:
 *  - the extension header lengths (opt_nflen + opt_flen) are added to the
 *    segment length; together with the IPv6 header and the link-layer
 *    reserve they give the required headroom, and the skb is reallocated
 *    with skb_realloc_headroom() when its headroom is too small
 *    (IPSTATS_MIB_OUTDISCARDS is accounted on that path);
 *  - fragmentable and non-fragmentable options are pushed; both calls
 *    receive &proto, and the second also receives &first_hop;
 *  - the IPv6 header is pushed and filled: version/tclass/flow label word,
 *    payload_len = seg_len, nexthdr = proto, hop limit taken from
 *    np->hop_limit or ip6_dst_hoplimit(dst), source from fl6->saddr and
 *    destination from first_hop;
 *  - skb priority and mark are copied from the socket;
 *  - if the packet fits the mtu, or local_df is set, or it is a GSO packet,
 *    output statistics are updated and the skb goes through the
 *    NF_INET_LOCAL_OUT hook with dst_output() as continuation;
 *  - otherwise a debug message is printed, an ICMPV6_PKT_TOOBIG carrying
 *    the mtu is sent and IPSTATS_MIB_FRAGFAILS is accounted.
 *
 * NOTE(review): lines missing from this listing include the guard on opt,
 * the failure handling after skb_realloc_headroom(), the choice between the
 * two hop-limit sources, the mtu assignment and the final return - confirm
 * those details against the complete source.
 */
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 struct ipv6_txoptions *opt, int tclass)
185 struct net *net = sock_net(sk);
186 struct ipv6_pinfo *np = inet6_sk(sk);
187 struct in6_addr *first_hop = &fl6->daddr;
188 struct dst_entry *dst = skb_dst(skb);
190 u8 proto = fl6->flowi6_proto;
191 int seg_len = skb->len;
196 unsigned int head_room;
198 /* First: exthdrs may take lots of space (~8K for now)
199 MAX_HEADER is not enough.
201 head_room = opt->opt_nflen + opt->opt_flen;
202 seg_len += head_room;
203 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
205 if (skb_headroom(skb) < head_room) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
215 skb_set_owner_w(skb, sk);
218 ipv6_push_frag_opts(skb, opt, &proto);
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
223 skb_push(skb, sizeof(struct ipv6hdr));
224 skb_reset_network_header(skb);
228 * Fill in the IPv6 header
231 hlimit = np->hop_limit;
233 hlimit = ip6_dst_hoplimit(dst);
235 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
237 hdr->payload_len = htons(seg_len);
238 hdr->nexthdr = proto;
239 hdr->hop_limit = hlimit;
241 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
242 ipv6_addr_copy(&hdr->daddr, first_hop);
244 skb->priority = sk->sk_priority;
245 skb->mark = sk->sk_mark;
248 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 IPSTATS_MIB_OUT, skb->len);
251 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 dst->dev, dst_output);
256 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
258 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
264 EXPORT_SYMBOL(ip6_xmit);
267 * To avoid extra problems ND packets are sent through this
268 * routine. It's code duplication but I really want to avoid
269 * extra checks since ipv6_build_header is used by TCP (which
270 * is for us performance critical)
/*
 * ip6_nd_hdr - write a minimal IPv6 header at the current tail of an skb.
 * @sk:    socket whose np->hop_limit is used as hop limit
 * @skb:   buffer receiving the header
 * @dev:   network device (not used by any line visible here)
 * @saddr: source address copied into the header
 * @daddr: destination address copied into the header
 *
 * Sets skb->protocol to ETH_P_IPV6, marks the current data position as the
 * network header and extends the skb by sizeof(struct ipv6hdr).  The first
 * header word is set to 0x60000000 (no traffic class or flow label bits);
 * payload_len, nexthdr and hop_limit come from len, proto and the socket,
 * and both addresses are copied in.
 *
 * NOTE(review): the end of the parameter list (where proto and len are
 * presumably declared), the assignment of hdr and the return statement are
 * on lines missing from this listing.
 */
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274 const struct in6_addr *saddr, const struct in6_addr *daddr,
277 struct ipv6_pinfo *np = inet6_sk(sk);
280 skb->protocol = htons(ETH_P_IPV6);
283 skb_reset_network_header(skb);
284 skb_put(skb, sizeof(struct ipv6hdr));
287 *(__be32*)hdr = htonl(0x60000000);
289 hdr->payload_len = htons(len);
290 hdr->nexthdr = proto;
291 hdr->hop_limit = np->hop_limit;
293 ipv6_addr_copy(&hdr->saddr, saddr);
294 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a packet to sockets on the ip6_ra_chain list.
 * @skb: received packet
 * @sel: selector value compared against each chain entry's ra->sel
 *
 * Walks ip6_ra_chain under the ip6_ra_lock read lock.  An entry matches when
 * it has a socket, its sel equals @sel, and the socket is either not bound
 * to a device or bound to the device the skb arrived on.  Clones of the skb
 * are delivered with rawv6_rcv() to "last", and the original skb is
 * delivered to "last" before the lock is dropped on that path.
 *
 * NOTE(review): the lines that update "last", test it for NULL and return
 * are missing from this listing.  ip6_forward() tests the return value for
 * truth, so presumably non-zero means the skb was consumed - confirm.
 */
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
301 struct ip6_ra_chain *ra;
302 struct sock *last = NULL;
304 read_lock(&ip6_ra_lock);
305 for (ra = ip6_ra_chain; ra; ra = ra->next) {
306 struct sock *sk = ra->sk;
307 if (sk && ra->sel == sel &&
308 (!sk->sk_bound_dev_if ||
309 sk->sk_bound_dev_if == skb->dev->ifindex)) {
311 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
313 rawv6_rcv(last, skb2);
320 rawv6_rcv(last, skb);
321 read_unlock(&ip6_ra_lock);
324 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - classify a packet addressed to a proxied
 * neighbour before it is forwarded.
 * @skb: packet being forwarded
 *
 * Locates the upper-layer header: extension headers are skipped with
 * ipv6_skip_exthdr(), otherwise the offset is the fixed header size.  For
 * ICMPv6 the function makes sure the type byte can be read
 * (pskb_may_pull) and recognises the four neighbour discovery types router
 * solicitation, router advertisement, neighbour solicitation and neighbour
 * advertisement.  Separately, a link-local destination triggers
 * dst_link_failure() on the skb.
 *
 * NOTE(review): every return statement is on a line missing from this
 * listing.  The caller, ip6_forward(), has a branch for a negative result
 * that accounts a discard and a branch that passes the skb to ip6_input();
 * the mapping of the cases above to those results should be confirmed
 * against the complete source.
 */
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
330 struct ipv6hdr *hdr = ipv6_hdr(skb);
331 u8 nexthdr = hdr->nexthdr;
334 if (ipv6_ext_hdr(nexthdr)) {
335 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
339 offset = sizeof(struct ipv6hdr);
341 if (nexthdr == IPPROTO_ICMPV6) {
342 struct icmp6hdr *icmp6;
344 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 offset + 1 - skb->data)))
348 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350 switch (icmp6->icmp6_type) {
351 case NDISC_ROUTER_SOLICITATION:
352 case NDISC_ROUTER_ADVERTISEMENT:
353 case NDISC_NEIGHBOUR_SOLICITATION:
354 case NDISC_NEIGHBOUR_ADVERTISEMENT:
356 /* For reaction involving unicast neighbor discovery
357 * message destined to the proxied address, pass it to
367 * The proxying router can't forward traffic sent to a link-local
368 * address, so signal the sender and discard the packet. This
369 * behavior is clarified by the MIPv6 specification.
371 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 dst_link_failure(skb);
/*
 * ip6_forward_finish - final step of forwarding: hand the skb to
 * dst_output() and return its result.
 */
379 static inline int ip6_forward_finish(struct sk_buff *skb)
381 return dst_output(skb);
/*
 * ip6_forward - forward a received IPv6 packet towards its dst entry.
 * @skb: packet with a routing decision (dst) attached
 *
 * Checks and actions visible in this listing, in order:
 *  - forwarding must be enabled (devconf_all->forwarding), the skb must not
 *    trip skb_warn_if_lro(), must pass the XFRM_POLICY_FWD policy check
 *    (IPSTATS_MIB_INDISCARDS is accounted on failure) and must be of type
 *    PACKET_HOST;
 *  - skb_forward_csum() is applied;
 *  - for the option located at offset opt->ra in the network header, a
 *    16-bit value built from bytes 2 and 3 is passed to ip6_call_ra_chain();
 *  - hop limit <= 1: an ICMPv6 time-exceeded (hop limit) error is sent and
 *    IPSTATS_MIB_INHDRERRORS accounted;
 *  - with proxy_ndp enabled and a proxy neighbour entry for the destination,
 *    ip6_forward_proxy_check() decides between ip6_input() and a discard;
 *  - xfrm6_route_forward() must succeed (INDISCARDS otherwise);
 *  - if the packet would leave on the device it arrived on, a neighbour
 *    exists, no source route was used and there is no IPsec path, a
 *    redirect is sent, rate limited to one per HZ through
 *    inet_peer_xrlim_allow(); the target is the neighbour key for gateway
 *    routes and the packet's destination otherwise;
 *  - in the other branch the source address type is checked: unspecified,
 *    multicast and loopback sources are rejected, and a link-local source
 *    triggers an ICMPv6 destination-unreachable (not neighbour) error;
 *  - a packet longer than the mtu that is not GSO triggers ICMPV6_PKT_TOOBIG
 *    and accounts INTOOBIGERRORS and FRAGFAILS;
 *  - skb_cow() makes room for the link-layer header (OUTDISCARDS on
 *    failure);
 *  - OUTFORWDATAGRAMS is accounted and the skb is passed to the
 *    NF_INET_FORWARD hook;
 *  - the trailing error path accounts IPSTATS_MIB_INADDRERRORS.
 *
 * NOTE(review): the goto/return statements, the guard on opt->ra, the mtu
 * assignments, the hop-limit decrement announced by the "Mangling hops
 * number" comment and the continuation argument of NF_HOOK() are on lines
 * missing from this listing; confirm them against the complete source.
 */
384 int ip6_forward(struct sk_buff *skb)
386 struct dst_entry *dst = skb_dst(skb);
387 struct ipv6hdr *hdr = ipv6_hdr(skb);
388 struct inet6_skb_parm *opt = IP6CB(skb);
389 struct net *net = dev_net(dst->dev);
393 if (net->ipv6.devconf_all->forwarding == 0)
396 if (skb_warn_if_lro(skb))
399 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
404 if (skb->pkt_type != PACKET_HOST)
407 skb_forward_csum(skb);
410 * We DO NOT make any processing on
411 * RA packets, pushing them to user level AS IS
412 * without ane WARRANTY that application will be able
413 * to interpret them. The reason is that we
414 * cannot make anything clever here.
416 * We are not end-node, so that if packet contains
417 * AH/ESP, we cannot make anything.
418 * Defragmentation also would be mistake, RA packets
419 * cannot be fragmented, because there is no warranty
420 * that different fragments will go along one path. --ANK
423 u8 *ptr = skb_network_header(skb) + opt->ra;
424 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
429 * check and decrement ttl
431 if (hdr->hop_limit <= 1) {
432 /* Force OUTPUT device used as source address */
434 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435 IP6_INC_STATS_BH(net,
436 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
442 /* XXX: idev->cnf.proxy_ndp? */
443 if (net->ipv6.devconf_all->proxy_ndp &&
444 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445 int proxied = ip6_forward_proxy_check(skb);
447 return ip6_input(skb);
448 else if (proxied < 0) {
449 IP6_INC_STATS(net, ip6_dst_idev(dst),
450 IPSTATS_MIB_INDISCARDS);
455 if (!xfrm6_route_forward(skb)) {
456 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
461 /* IPv6 specs say nothing about it, but it is clear that we cannot
462 send redirects to source routed frames.
463 We don't send redirects to frames decapsulated from IPsec.
465 n = dst_get_neighbour(dst);
466 if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
467 struct in6_addr *target = NULL;
471 * incoming and outgoing devices are the same
475 rt = (struct rt6_info *) dst;
476 if ((rt->rt6i_flags & RTF_GATEWAY))
477 target = (struct in6_addr*)&n->primary_key;
479 target = &hdr->daddr;
482 rt6_bind_peer(rt, 1);
484 /* Limit redirects both by destination (here)
485 and by source (inside ndisc_send_redirect)
487 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
488 ndisc_send_redirect(skb, n, target);
490 int addrtype = ipv6_addr_type(&hdr->saddr);
492 /* This check is security critical. */
493 if (addrtype == IPV6_ADDR_ANY ||
494 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
496 if (addrtype & IPV6_ADDR_LINKLOCAL) {
497 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
498 ICMPV6_NOT_NEIGHBOUR, 0);
504 if (mtu < IPV6_MIN_MTU)
507 if (skb->len > mtu && !skb_is_gso(skb)) {
508 /* Again, force OUTPUT device used as source address */
510 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513 IP6_INC_STATS_BH(net,
514 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
519 if (skb_cow(skb, dst->dev->hard_header_len)) {
520 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
526 /* Mangling hops number delayed to point after skb COW */
530 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
535 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata onto a newly built fragment.
 * @to:   fragment receiving the metadata
 * @from: original packet
 *
 * Copies pkt_type, priority, protocol and mark, attaches a new reference to
 * the original's dst entry (dst_clone), copies tc_index when
 * CONFIG_NET_SCHED is set and nf_trace when the xtables TRACE target is
 * built in or modular, and copies the security mark.
 *
 * NOTE(review): this listing omits some lines of the body (including the
 * closing #endif directives), so further fields may be copied in the
 * complete source.
 */
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 to->pkt_type = from->pkt_type;
544 to->priority = from->priority;
545 to->protocol = from->protocol;
547 skb_dst_set(to, dst_clone(skb_dst(from)));
549 to->mark = from->mark;
551 #ifdef CONFIG_NET_SCHED
552 to->tc_index = from->tc_index;
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557 to->nf_trace = from->nf_trace;
559 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension header chain to find where a
 * fragment header has to be inserted.
 * @skb:     packet to inspect
 * @nexthdr: out parameter; set to point at the "next header" byte that
 *           precedes the insertion point (initially the one in the fixed
 *           IPv6 header, then the one of each extension header walked)
 *
 * Starts right after the fixed IPv6 header and advances by
 * ipv6_optlen() per extension header while the offset stays inside the
 * packet (skb->tail - skb->network_header).  A routing header is one of the
 * cases examined, and with Mobile IPv6 support a Home Address option is
 * searched for with ipv6_find_tlv().
 *
 * NOTE(review): the switch statement, its other case labels and all return
 * statements are on lines missing from this listing.  ip6_fragment() uses
 * the return value as the length of the headers that precede the fragment
 * header, so presumably the final offset is returned - confirm.
 */
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
564 u16 offset = sizeof(struct ipv6hdr);
565 struct ipv6_opt_hdr *exthdr =
566 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567 unsigned int packet_len = skb->tail - skb->network_header;
569 *nexthdr = &ipv6_hdr(skb)->nexthdr;
571 while (offset + 1 <= packet_len) {
577 case NEXTHDR_ROUTING:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
592 offset += ipv6_optlen(exthdr);
593 *nexthdr = &exthdr->nexthdr;
594 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ipv6_select_ident - choose the identification value for a fragment header.
 * @fhdr: fragment header whose identification field is written
 * @rt:   route of the packet, may be NULL
 *
 * When a route is given and it is not flagged DST_NOPEER, an inet_peer is
 * bound to the route and the identification comes from inet_getid() on that
 * peer.  Otherwise a function-local static counter is advanced with an
 * atomic_cmpxchg() retry loop and the new value is used.  In both cases the
 * value is stored in network byte order (htonl).
 *
 * NOTE(review): the NULL check on the peer, the computation of "new" from
 * "old" and the return after the peer path are on lines missing from this
 * listing.
 */
601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
603 static atomic_t ipv6_fragmentation_id;
606 if (rt && !(rt->dst.flags & DST_NOPEER)) {
607 struct inet_peer *peer;
610 rt6_bind_peer(rt, 1);
611 peer = rt->rt6i_peer;
613 fhdr->identification = htonl(inet_getid(peer, 0));
618 old = atomic_read(&ipv6_fragmentation_id);
622 } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
623 fhdr->identification = htonl(new);
/*
 * ip6_fragment - split an IPv6 packet into fragments and send each one.
 * @skb:    packet to fragment
 * @output: function called to transmit every fragment
 *
 * Preparation: hlen is the length of the headers in front of the insertion
 * point found by ip6_find_1stfragopt(), and mtu is the dst MTU.  A packet
 * that is too long and does not have local_df set is answered with
 * ICMPV6_PKT_TOOBIG and accounted as IPSTATS_MIB_FRAGFAILS.  The socket's
 * frag_size is compared against the mtu, and hlen plus the fragment header
 * size are subtracted from mtu to give the payload room per fragment.
 *
 * Fast path (skb already has a frag list): the geometry is validated - the
 * head and every list member must fit the mtu, be a multiple of 8 bytes
 * (the last member excepted), have hlen bytes of headroom and not be
 * shared.  If so, the list members are reused as fragments: the saved
 * header copy (kmemdup) and a fragment header are pushed in front of each,
 * offset / more-fragments flag / identification are filled in, payload_len
 * is updated, metadata is copied with ip6_copy_metadata(), and
 * FRAGCREATES / FRAGOKS / FRAGFAILS are accounted.  The slow_path_clean
 * code undoes the destructor and truesize changes made during validation.
 *
 * Slow path: for each chunk a new skb is allocated (payload + headers +
 * link-layer space), metadata and the original header bytes are copied,
 * the fragment header is built (the identification is selected once and
 * then reused), the payload is copied with skb_copy_bits() and the
 * fragment is queued for output.  Allocation failure is logged and
 * accounted as FRAGFAILS.
 *
 * NOTE(review): many control-flow lines (loops, gotos, labels, the calls
 * to output(), frees and returns) are missing from this listing; the
 * summary above is limited to what the visible statements show.
 */
626 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
628 struct sk_buff *frag;
629 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
630 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
631 struct ipv6hdr *tmp_hdr;
633 unsigned int mtu, hlen, left, len;
635 int ptr, offset = 0, err=0;
636 u8 *prevhdr, nexthdr = 0;
637 struct net *net = dev_net(skb_dst(skb)->dev);
639 hlen = ip6_find_1stfragopt(skb, &prevhdr);
642 mtu = ip6_skb_dst_mtu(skb);
644 /* We must not fragment if the socket is set to force MTU discovery
645 * or if the skb it not generated by a local socket.
647 if (!skb->local_df && skb->len > mtu) {
648 skb->dev = skb_dst(skb)->dev;
649 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
650 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
651 IPSTATS_MIB_FRAGFAILS);
656 if (np && np->frag_size < mtu) {
660 mtu -= hlen + sizeof(struct frag_hdr);
662 if (skb_has_frag_list(skb)) {
663 int first_len = skb_pagelen(skb);
664 struct sk_buff *frag2;
666 if (first_len - hlen > mtu ||
667 ((first_len - hlen) & 7) ||
671 skb_walk_frags(skb, frag) {
672 /* Correct geometry. */
673 if (frag->len > mtu ||
674 ((frag->len & 7) && frag->next) ||
675 skb_headroom(frag) < hlen)
676 goto slow_path_clean;
678 /* Partially cloned skb? */
679 if (skb_shared(frag))
680 goto slow_path_clean;
685 frag->destructor = sock_wfree;
687 skb->truesize -= frag->truesize;
692 frag = skb_shinfo(skb)->frag_list;
693 skb_frag_list_init(skb);
696 *prevhdr = NEXTHDR_FRAGMENT;
697 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
699 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
700 IPSTATS_MIB_FRAGFAILS);
704 __skb_pull(skb, hlen);
705 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
706 __skb_push(skb, hlen);
707 skb_reset_network_header(skb);
708 memcpy(skb_network_header(skb), tmp_hdr, hlen);
710 ipv6_select_ident(fh, rt);
711 fh->nexthdr = nexthdr;
713 fh->frag_off = htons(IP6_MF);
714 frag_id = fh->identification;
716 first_len = skb_pagelen(skb);
717 skb->data_len = first_len - skb_headlen(skb);
718 skb->len = first_len;
719 ipv6_hdr(skb)->payload_len = htons(first_len -
720 sizeof(struct ipv6hdr));
725 /* Prepare header of the next frame,
726 * before previous one went down. */
728 frag->ip_summed = CHECKSUM_NONE;
729 skb_reset_transport_header(frag);
730 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731 __skb_push(frag, hlen);
732 skb_reset_network_header(frag);
733 memcpy(skb_network_header(frag), tmp_hdr,
735 offset += skb->len - hlen - sizeof(struct frag_hdr);
736 fh->nexthdr = nexthdr;
738 fh->frag_off = htons(offset);
739 if (frag->next != NULL)
740 fh->frag_off |= htons(IP6_MF);
741 fh->identification = frag_id;
742 ipv6_hdr(frag)->payload_len =
744 sizeof(struct ipv6hdr));
745 ip6_copy_metadata(frag, skb);
750 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
751 IPSTATS_MIB_FRAGCREATES);
764 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
765 IPSTATS_MIB_FRAGOKS);
766 dst_release(&rt->dst);
776 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
777 IPSTATS_MIB_FRAGFAILS);
778 dst_release(&rt->dst);
782 skb_walk_frags(skb, frag2) {
786 frag2->destructor = NULL;
787 skb->truesize += frag2->truesize;
792 left = skb->len - hlen; /* Space per frame */
793 ptr = hlen; /* Where to start from */
796 * Fragment the datagram.
799 *prevhdr = NEXTHDR_FRAGMENT;
802 * Keep copying data until we run out.
806 /* IF: it doesn't fit, use 'mtu' - the data space left */
809 /* IF: we are not sending up to and including the packet end
810 then align the next start on an eight byte boundary */
818 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
819 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
820 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
821 IPSTATS_MIB_FRAGFAILS);
827 * Set up data on packet
830 ip6_copy_metadata(frag, skb);
831 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
832 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
833 skb_reset_network_header(frag);
834 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
835 frag->transport_header = (frag->network_header + hlen +
836 sizeof(struct frag_hdr));
839 * Charge the memory for the fragment to any owner
843 skb_set_owner_w(frag, skb->sk);
846 * Copy the packet header into the new buffer.
848 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
851 * Build fragment header.
853 fh->nexthdr = nexthdr;
856 ipv6_select_ident(fh, rt);
857 frag_id = fh->identification;
859 fh->identification = frag_id;
862 * Copy a block of the IP datagram.
864 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
868 fh->frag_off = htons(offset);
870 fh->frag_off |= htons(IP6_MF);
871 ipv6_hdr(frag)->payload_len = htons(frag->len -
872 sizeof(struct ipv6hdr));
878 * Put this fragment into the sending queue.
884 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885 IPSTATS_MIB_FRAGCREATES);
887 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888 IPSTATS_MIB_FRAGOKS);
893 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - test whether a cached route can NOT be reused for an
 * address.
 * @rt_key:     destination or source key of the cached route
 * @fl_addr:    address taken from the flow being routed
 * @addr_cache: last address used with the cached route, may be NULL
 *
 * Returns non-zero (route must be rejected) only when both of these hold:
 *  - the route is not a /128 host route for @fl_addr, and
 *  - there is no cached address, or it differs from @fl_addr.
 * Returns 0 when either the host-route match or the cached address match
 * vouches for the route.
 */
899 static inline int ip6_rt_check(const struct rt6key *rt_key,
900 const struct in6_addr *fl_addr,
901 const struct in6_addr *addr_cache)
903 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
904 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * ip6_sk_dst_check - validate a socket's cached dst against a flow.
 * @sk:  socket owning the cached route
 * @dst: cached dst entry to validate
 * @fl6: flow about to be sent
 *
 * The dst is tested for: address family AF_INET6; ip6_rt_check() on the
 * route's destination key with the flow's daddr and np->daddr_cache; with
 * CONFIG_IPV6_SUBTREES the same check on the source key with saddr and
 * np->saddr_cache; and, when the flow names an output interface, equality
 * with the dst device's ifindex.
 *
 * NOTE(review): what happens when a test fails, and the return statement,
 * are on lines missing from this listing; presumably the dst is released
 * and NULL is returned so that the caller performs a fresh lookup -
 * confirm.
 */
907 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
908 struct dst_entry *dst,
909 const struct flowi6 *fl6)
911 struct ipv6_pinfo *np = inet6_sk(sk);
917 if (dst->ops->family != AF_INET6) {
922 rt = (struct rt6_info *)dst;
923 /* Yes, checking route validity in not connected
924 * case is not very simple. Take into account,
925 * that we do not support routing by source, TOS,
926 * and MSG_DONTROUTE --ANK (980726)
928 * 1. ip6_rt_check(): If route was host route,
929 * check that cached destination is current.
930 * If it is network route, we still may
931 * check its validity using saved pointer
932 * to the last used address: daddr_cache.
933 * We do not want to save whole address now,
934 * (because main consumer of this service
935 * is tcp, which has not this problem),
936 * so that the last trick works only on connected
938 * 2. oif also should be the same.
940 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
941 #ifdef CONFIG_IPV6_SUBTREES
942 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
944 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - common route lookup used by the ip6_dst_lookup*()
 * entry points.
 * @sk:  socket providing the network namespace and source preferences
 * @dst: in/out pointer to the dst entry
 * @fl6: flow to look up; its saddr may be filled in
 *
 * Visible steps:
 *  - *dst is obtained from ip6_route_output() and its error field is
 *    checked (error exit through out_err_release);
 *  - if the flow has no source address, one is selected with
 *    ip6_route_get_saddr() using the socket's srcprefs (0 without socket);
 *  - with CONFIG_IPV6_OPTIMISTIC_DAD: if the dst's neighbour is not in a
 *    NUD_VALID state and the flow's source address is flagged
 *    IFA_F_OPTIMISTIC, the lookup is repeated with a copy of the flow whose
 *    daddr is zeroed, i.e. towards the default router;
 *  - an -ENETUNREACH error is accounted as IPSTATS_MIB_OUTNOROUTES.
 *
 * NOTE(review): the condition under which the first lookup is performed,
 * the release of dst entries and the return statements are on lines missing
 * from this listing.
 */
953 static int ip6_dst_lookup_tail(struct sock *sk,
954 struct dst_entry **dst, struct flowi6 *fl6)
956 struct net *net = sock_net(sk);
957 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
963 *dst = ip6_route_output(net, sk, fl6);
965 if ((err = (*dst)->error))
966 goto out_err_release;
968 if (ipv6_addr_any(&fl6->saddr)) {
969 struct rt6_info *rt = (struct rt6_info *) *dst;
970 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
971 sk ? inet6_sk(sk)->srcprefs : 0,
974 goto out_err_release;
977 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
979 * Here if the dst entry we've looked up
980 * has a neighbour entry that is in the INCOMPLETE
981 * state and the src address from the flow is
982 * marked as OPTIMISTIC, we release the found
983 * dst entry and replace it instead with the
984 * dst entry of the nexthop router
987 n = dst_get_neighbour(*dst);
988 if (n && !(n->nud_state & NUD_VALID)) {
989 struct inet6_ifaddr *ifp;
990 struct flowi6 fl_gw6;
994 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
997 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1003 * We need to get the dst entry for the
1004 * default router instead
1007 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1008 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1009 *dst = ip6_route_output(net, sk, &fl_gw6);
1010 if ((err = (*dst)->error))
1011 goto out_err_release;
1021 if (err == -ENETUNREACH)
1022 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1029 * ip6_dst_lookup - perform route lookup on flow
1030 * @sk: socket which provides route info
1031 * @dst: pointer to dst_entry * for result
1032 * @fl6: flow to lookup
1034 * This function performs a route lookup on the given flow.
1036 * It returns zero on success, or a standard errno code on error.
/*
 * Thin exported wrapper: the lookup itself is done by
 * ip6_dst_lookup_tail(), whose result is returned unchanged.
 * NOTE(review): one line of the body is missing from this listing.
 */
1038 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1041 return ip6_dst_lookup_tail(sk, dst, fl6);
1043 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1046 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1047 * @sk: socket which provides route info
1048 * @fl6: flow to lookup
1049 * @final_dst: final destination address for ipsec lookup
1050 * @can_sleep: we are in a sleepable context
1052 * This function performs a route lookup on the given flow.
1054 * It returns a valid dst pointer on success, or a pointer encoded
/*
 * Implementation outline: starts from a NULL dst, runs
 * ip6_dst_lookup_tail(), and converts an error into ERR_PTR(err).  The
 * flow's daddr is overwritten with final_dst and FLOWI_FLAG_CAN_SLEEP is
 * set in the flow flags before the dst is passed through xfrm_lookup(),
 * whose result is returned.
 *
 * NOTE(review): the guards in front of the ERR_PTR return, the daddr copy
 * and the flag update, as well as the last parameter declaration, are on
 * lines missing from this listing; presumably they test err, final_dst and
 * can_sleep respectively - confirm.
 */
1057 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1058 const struct in6_addr *final_dst,
1061 struct dst_entry *dst = NULL;
1064 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1066 return ERR_PTR(err);
1068 ipv6_addr_copy(&fl6->daddr, final_dst);
1070 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1072 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1077 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1078 * @sk: socket which provides the dst cache and route info
1079 * @fl6: flow to lookup
1080 * @final_dst: final destination address for ipsec lookup
1081 * @can_sleep: we are in a sleepable context
1083 * This function performs a route lookup on the given flow with the
1084 * possibility of using the cached route in the socket if it is valid.
1085 * It will take the socket dst lock when operating on the dst cache.
1086 * As a result, this function can only be used in process context.
1088 * It returns a valid dst pointer on success, or a pointer encoded
/*
 * Implementation outline: the socket's cached dst is fetched with
 * sk_dst_check() using the IPv6 dst_cookie and validated against the flow
 * by ip6_sk_dst_check(); ip6_dst_lookup_tail() then completes the lookup.
 * An error is returned as ERR_PTR(err).  The flow's daddr is overwritten
 * with final_dst and FLOWI_FLAG_CAN_SLEEP is set before the dst is passed
 * through xfrm_lookup(), whose result is returned.
 *
 * NOTE(review): the guards in front of the ERR_PTR return, the daddr copy
 * and the flag update, as well as the last parameter declaration, are on
 * lines missing from this listing; presumably they test err, final_dst and
 * can_sleep respectively - confirm.
 */
1091 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1092 const struct in6_addr *final_dst,
1095 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1098 dst = ip6_sk_dst_check(sk, dst, fl6);
1100 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1102 return ERR_PTR(err);
1104 ipv6_addr_copy(&fl6->daddr, final_dst);
1106 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1108 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1110 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * ip6_ufo_append_data - append user data for a device with UDP
 * fragmentation offload.
 * @sk:            sending socket; the skb lives on sk->sk_write_queue
 * @getfrag:       callback that copies user data into the skb
 * @from:          opaque source cookie passed to @getfrag
 * @length:        number of bytes to append, transport header included
 * @hh_len:        link-layer header space to reserve
 * @fragheaderlen: length of the IPv6 header plus non-fragmentable headers
 * @transhdrlen:   transport header length
 * @mtu:           path MTU used to derive the GSO segment size
 * @flags:         sendmsg flags; MSG_DONTWAIT controls blocking allocation
 * @rt:            route, used to pick the fragment identification
 *
 * If the write queue is empty a new skb is allocated with room for the
 * link-layer, IPv6 and transport headers (plus 20 bytes); header space is
 * reserved/put, the network and transport header offsets are set and the
 * checksum mode becomes CHECKSUM_PARTIAL.  The payload is then attached as
 * page fragments by skb_append_datato_frags().  In the branch that follows
 * that call, gso_size is set to the largest multiple of 8 that fits in
 * mtu - fragheaderlen - sizeof(struct frag_hdr), gso_type is SKB_GSO_UDP,
 * the fragment id is chosen with ipv6_select_ident(), and the skb is queued
 * on the socket's write queue.
 *
 * NOTE(review): the allocation-failure check, the test on err after
 * skb_append_datato_frags(), the error path and all return statements are
 * on lines missing from this listing.
 */
1112 static inline int ip6_ufo_append_data(struct sock *sk,
1113 int getfrag(void *from, char *to, int offset, int len,
1114 int odd, struct sk_buff *skb),
1115 void *from, int length, int hh_len, int fragheaderlen,
1116 int transhdrlen, int mtu,unsigned int flags,
1117 struct rt6_info *rt)
1120 struct sk_buff *skb;
1123 /* There is support for UDP large send offload by network
1124 * device, so create one single skb packet containing complete
1127 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1128 skb = sock_alloc_send_skb(sk,
1129 hh_len + fragheaderlen + transhdrlen + 20,
1130 (flags & MSG_DONTWAIT), &err);
1134 /* reserve space for Hardware header */
1135 skb_reserve(skb, hh_len);
1137 /* create space for UDP/IP header */
1138 skb_put(skb,fragheaderlen + transhdrlen);
1140 /* initialize network header pointer */
1141 skb_reset_network_header(skb);
1143 /* initialize protocol header pointer */
1144 skb->transport_header = skb->network_header + fragheaderlen;
1146 skb->ip_summed = CHECKSUM_PARTIAL;
1150 err = skb_append_datato_frags(sk,skb, getfrag, from,
1151 (length - transhdrlen));
1153 struct frag_hdr fhdr;
1155 /* Specify the length of each IPv6 datagram fragment.
1156 * It has to be a multiple of 8.
1158 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1159 sizeof(struct frag_hdr)) & ~7;
1160 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1161 ipv6_select_ident(&fhdr, rt);
1162 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1163 __skb_queue_tail(&sk->sk_write_queue, skb);
1167 /* There is not enough support do UPD LSO,
1168 * so follow normal path
/*
 * ip6_opt_dup - duplicate an IPv6 option header.
 * @src: header to copy, may be NULL
 *
 * Returns NULL for a NULL @src; otherwise a kmemdup() copy of
 * (src->hdrlen + 1) * 8 bytes allocated with the gfp flags, which is NULL
 * if the allocation fails.
 * NOTE(review): the line declaring the gfp parameter is missing from this
 * listing.
 */
1175 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1178 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_rthdr_dup - duplicate an IPv6 routing header.
 * @src: header to copy, may be NULL
 *
 * Returns NULL for a NULL @src; otherwise a kmemdup() copy of
 * (src->hdrlen + 1) * 8 bytes allocated with the gfp flags, which is NULL
 * if the allocation fails.
 * NOTE(review): the line declaring the gfp parameter is missing from this
 * listing.
 */
1181 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1184 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data_mtu - recompute mtu and maxfraglen while ip6_append_data()
 * allocates fragments.
 * @mtu:           in/out MTU value
 * @fragheaderlen: length of the headers repeated in every fragment
 * @skb:           current tail skb of the write queue
 * @rt:            route in use
 *
 * Only acts when the route is not flagged DST_XFRM_TUNNEL.  One branch
 * (commented as the first fragment) subtracts rt->dst.header_len from *mtu;
 * the other sets *mtu to the MTU of rt->dst.path.  *maxfraglen is then
 * recomputed so that the data after the fragment header is a multiple of
 * 8 bytes.
 *
 * NOTE(review): the declaration of the maxfraglen parameter and the
 * condition that separates the two branches are on lines missing from this
 * listing; presumably the branch is chosen by whether skb is NULL -
 * confirm.
 */
1187 static void ip6_append_data_mtu(int *mtu,
1189 unsigned int fragheaderlen,
1190 struct sk_buff *skb,
1191 struct rt6_info *rt)
1193 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1195 /* first fragment, reserve header_len */
1196 *mtu = *mtu - rt->dst.header_len;
1200 * this fragment is not first, the headers
1201 * space is regarded as data space.
1203 *mtu = dst_mtu(rt->dst.path);
1205 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1206 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * ip6_append_data - queue user data on a socket's write queue, laid out in
 * skbs that can later be sent as one datagram or as ready-made fragments.
 * @sk:          sending socket
 * @getfrag:     callback copying user data into the skb
 * @from:        opaque source cookie passed to @getfrag
 * @length:      number of bytes to append
 * @transhdrlen: transport header length (non-zero for the first call)
 * @hlimit:      hop limit stored in the cork
 * @tclass:      traffic class stored in the cork
 * @opt:         tx options; deep-copied into the cork on the first call
 * @fl6:         flow, copied into the cork on the first call
 * @rt:          route, stored in the cork on the first call
 * @flags:       sendmsg flags (MSG_PROBE, MSG_MORE, MSG_DONTWAIT are used)
 * @dontfrag:    when set, oversize UDP/RAW data is reported via
 *               ipv6_local_rxpmtu() instead of being fragmented
 *
 * First call (empty write queue): the cork is set up - options are
 * duplicated with kzalloc()/ip6_opt_dup()/ip6_rthdr_dup(), dst, flow, hop
 * limit and tclass are stored, and the mtu is taken from the device (when
 * np->pmtudisc is IPV6_PMTUDISC_PROBE) or from dst_mtu() of the route
 * (rt->dst for XFRM tunnels, rt->dst.path otherwise), compared with
 * np->frag_size and saved as cork->fragsize.  IPCORK_ALLFRAG is set when
 * the path requires fragmenting everything.  Later calls reuse the route,
 * flow and mtu saved in the cork.
 *
 * Then: header sizes (hh_len, fragheaderlen, maxfraglen) are derived; data
 * exceeding the IPv6 maximum payload is reported with ipv6_local_error()
 * (EMSGSIZE); TX timestamp flags are fetched for SOCK_DGRAM; the UFO path
 * is taken for UDP on NETIF_F_UFO devices.  The main loop runs while data
 * remains: when the tail skb has no room a new skb is allocated (blocking
 * allocation for the first, sock_wmalloc() for later ones), any bytes
 * beyond maxfraglen in the previous skb are moved over with their checksum
 * adjusted, and user data is copied with @getfrag; otherwise data is
 * appended to the tail skb, linearly on devices without NETIF_F_SG or into
 * page fragments using the socket's sk_sndmsg_page.
 *
 * On error the bytes not appended are subtracted from cork->length and
 * IPSTATS_MIB_OUTDISCARDS is accounted.
 *
 * NOTE(review): many lines of this function (branches, labels, gotos,
 * error codes and returns) are missing from this listing; the summary is
 * restricted to what the visible statements show and should be checked
 * against the complete source before relying on details.
 */
1210 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1211 int offset, int len, int odd, struct sk_buff *skb),
1212 void *from, int length, int transhdrlen,
1213 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1214 struct rt6_info *rt, unsigned int flags, int dontfrag)
1216 struct inet_sock *inet = inet_sk(sk);
1217 struct ipv6_pinfo *np = inet6_sk(sk);
1218 struct inet_cork *cork;
1219 struct sk_buff *skb, *skb_prev = NULL;
1220 unsigned int maxfraglen, fragheaderlen;
1228 int csummode = CHECKSUM_NONE;
1231 if (flags&MSG_PROBE)
1233 cork = &inet->cork.base;
1234 if (skb_queue_empty(&sk->sk_write_queue)) {
1239 if (WARN_ON(np->cork.opt))
1242 np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1243 if (unlikely(np->cork.opt == NULL))
1246 np->cork.opt->tot_len = opt->tot_len;
1247 np->cork.opt->opt_flen = opt->opt_flen;
1248 np->cork.opt->opt_nflen = opt->opt_nflen;
1250 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1252 if (opt->dst0opt && !np->cork.opt->dst0opt)
1255 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1257 if (opt->dst1opt && !np->cork.opt->dst1opt)
1260 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1262 if (opt->hopopt && !np->cork.opt->hopopt)
1265 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1267 if (opt->srcrt && !np->cork.opt->srcrt)
1270 /* need source address above miyazawa*/
1273 cork->dst = &rt->dst;
1274 inet->cork.fl.u.ip6 = *fl6;
1275 np->cork.hop_limit = hlimit;
1276 np->cork.tclass = tclass;
1277 if (rt->dst.flags & DST_XFRM_TUNNEL)
1278 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1279 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1281 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1282 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1283 if (np->frag_size < mtu) {
1285 mtu = np->frag_size;
1287 cork->fragsize = mtu;
1288 if (dst_allfrag(rt->dst.path))
1289 cork->flags |= IPCORK_ALLFRAG;
1291 sk->sk_sndmsg_page = NULL;
1292 sk->sk_sndmsg_off = 0;
1293 exthdrlen = (opt ? opt->opt_flen : 0);
1294 length += exthdrlen;
1295 transhdrlen += exthdrlen;
1296 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1298 rt = (struct rt6_info *)cork->dst;
1299 fl6 = &inet->cork.fl.u.ip6;
1304 mtu = cork->fragsize;
1307 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1309 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1310 (opt ? opt->opt_nflen : 0);
1311 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1313 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1314 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1315 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1320 /* For UDP, check if TX timestamp is enabled */
1321 if (sk->sk_type == SOCK_DGRAM) {
1322 err = sock_tx_timestamp(sk, &tx_flags);
1328 * Let's try using as much space as possible.
1329 * Use MTU if total length of the message fits into the MTU.
1330 * Otherwise, we need to reserve fragment header and
1331 * fragment alignment (= 8-15 octects, in total).
1333 * Note that we may need to "move" the data from the tail of
1334 * of the buffer to the new fragment when we split
1337 * FIXME: It may be fragmented into multiple chunks
1338 * at once if non-fragmentable extension headers
1343 cork->length += length;
1345 int proto = sk->sk_protocol;
1346 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1347 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1351 if (proto == IPPROTO_UDP &&
1352 (rt->dst.dev->features & NETIF_F_UFO)) {
1354 err = ip6_ufo_append_data(sk, getfrag, from, length,
1355 hh_len, fragheaderlen,
1356 transhdrlen, mtu, flags, rt);
1363 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1366 while (length > 0) {
1367 /* Check if the remaining data fits into current packet. */
1368 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1370 copy = maxfraglen - skb->len;
1374 unsigned int datalen;
1375 unsigned int fraglen;
1376 unsigned int fraggap;
1377 unsigned int alloclen;
1379 /* There's no room in the current skb */
1381 fraggap = skb->len - maxfraglen;
1384 /* update mtu and maxfraglen if necessary */
1385 if (skb == NULL || skb_prev == NULL)
1386 ip6_append_data_mtu(&mtu, &maxfraglen,
1387 fragheaderlen, skb, rt);
1392 * If remaining data exceeds the mtu,
1393 * we know we need more fragment(s).
1395 datalen = length + fraggap;
1397 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1398 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1399 if ((flags & MSG_MORE) &&
1400 !(rt->dst.dev->features&NETIF_F_SG))
1403 alloclen = datalen + fragheaderlen;
1405 alloclen += dst_exthdrlen;
1407 if (datalen != length + fraggap) {
1409 * this is not the last fragment, the trailer
1410 * space is regarded as data space.
1412 datalen += rt->dst.trailer_len;
1415 alloclen += rt->dst.trailer_len;
1416 fraglen = datalen + fragheaderlen;
1419 * We just reserve space for fragment header.
1420 * Note: this may be overallocation if the message
1421 * (without MSG_MORE) fits into the MTU.
1423 alloclen += sizeof(struct frag_hdr);
1426 skb = sock_alloc_send_skb(sk,
1428 (flags & MSG_DONTWAIT), &err);
1431 if (atomic_read(&sk->sk_wmem_alloc) <=
1433 skb = sock_wmalloc(sk,
1434 alloclen + hh_len, 1,
1436 if (unlikely(skb == NULL))
1439 /* Only the initial fragment
1448 * Fill in the control structures
1450 skb->ip_summed = csummode;
1452 /* reserve for fragmentation and ipsec header */
1453 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1456 if (sk->sk_type == SOCK_DGRAM)
1457 skb_shinfo(skb)->tx_flags = tx_flags;
1460 * Find where to start putting bytes
1462 data = skb_put(skb, fraglen);
1463 skb_set_network_header(skb, exthdrlen);
1464 data += fragheaderlen;
1465 skb->transport_header = (skb->network_header +
1468 skb->csum = skb_copy_and_csum_bits(
1469 skb_prev, maxfraglen,
1470 data + transhdrlen, fraggap, 0);
1471 skb_prev->csum = csum_sub(skb_prev->csum,
1474 pskb_trim_unique(skb_prev, maxfraglen);
1476 copy = datalen - transhdrlen - fraggap;
1482 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1489 length -= datalen - fraggap;
1493 csummode = CHECKSUM_NONE;
1496 * Put the packet on the pending queue
1498 __skb_queue_tail(&sk->sk_write_queue, skb);
1505 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1509 if (getfrag(from, skb_put(skb, copy),
1510 offset, copy, off, skb) < 0) {
1511 __skb_trim(skb, off);
1516 int i = skb_shinfo(skb)->nr_frags;
1517 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1518 struct page *page = sk->sk_sndmsg_page;
1519 int off = sk->sk_sndmsg_off;
1522 if (page && (left = PAGE_SIZE - off) > 0) {
1525 if (page != skb_frag_page(frag)) {
1526 if (i == MAX_SKB_FRAGS) {
1530 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1531 skb_frag_ref(skb, i);
1532 frag = &skb_shinfo(skb)->frags[i];
1534 } else if(i < MAX_SKB_FRAGS) {
1535 if (copy > PAGE_SIZE)
1537 page = alloc_pages(sk->sk_allocation, 0);
1542 sk->sk_sndmsg_page = page;
1543 sk->sk_sndmsg_off = 0;
1545 skb_fill_page_desc(skb, i, page, 0, 0);
1546 frag = &skb_shinfo(skb)->frags[i];
1552 skb_frag_address(frag) + skb_frag_size(frag),
1553 offset, copy, skb->len, skb) < 0) {
1557 sk->sk_sndmsg_off += copy;
1558 skb_frag_size_add(frag, copy);
1560 skb->data_len += copy;
1561 skb->truesize += copy;
1562 atomic_add(copy, &sk->sk_wmem_alloc);
1569 cork->length -= length;
1570 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - tear down the per-socket cork state.
 * @inet: socket whose cork.base.dst, cork.base.flags and cork.fl are reset
 * @np:   IPv6 socket info whose cork.opt is freed and cleared
 *
 * NOTE(review): the numbers embedded at the start of each line skip
 * 1575-1576, 1583-1584, 1589 and 1591, so some lines (braces, and possibly
 * a NULL guard around the cork.opt frees) are not present in this view.
 * Confirm against the complete file before relying on this description.
 */
1574 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
/* Free the four option buffers hanging off cork.opt, then cork.opt itself. */
1577 kfree(np->cork.opt->dst0opt);
1578 kfree(np->cork.opt->dst1opt);
1579 kfree(np->cork.opt->hopopt);
1580 kfree(np->cork.opt->srcrt);
1581 kfree(np->cork.opt);
/* Clear the pointer so nothing is left referring to the freed options. */
1582 np->cork.opt = NULL;
/*
 * If a route is still attached to the cork, drop that reference, forget
 * the pointer and clear the IPCORK_ALLFRAG flag.
 */
1585 if (inet->cork.base.dst) {
1586 dst_release(inet->cork.base.dst);
1587 inet->cork.base.dst = NULL;
1588 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
/* Zero the whole stored flow description. */
1590 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - turn the skbs queued on sk->sk_write_queue into
 * one IPv6 packet and hand it to ip6_local_out().
 * @sk: socket whose write queue and cork state are consumed
 *
 * The first queued skb becomes the head; every further queued skb is linked
 * onto the head's frag_list. Extension headers from the corked options and
 * the IPv6 header are then pushed in front of the payload, the corked route
 * is attached, output statistics are updated and the packet is sent. The
 * cork state is released via ip6_cork_release().
 *
 * NOTE(review): the numbers embedded at the start of each line have gaps
 * (e.g. 1606-1607, 1609, 1623-1625, 1628, 1662-1663, 1665-1669, 1671-1672,
 * 1674), so the declaration of err, the early exit, the body of the
 * pmtudisc test, the error handling around ip6_local_out() and the return
 * statement are not visible here. Presumably err is the return value --
 * confirm against the complete file.
 */
1593 int ip6_push_pending_frames(struct sock *sk)
1595 struct sk_buff *skb, *tmp_skb;
1596 struct sk_buff **tail_skb;
/* final_dst starts out pointing at a local buffer; see the option push below. */
1597 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1598 struct inet_sock *inet = inet_sk(sk);
1599 struct ipv6_pinfo *np = inet6_sk(sk);
1600 struct net *net = sock_net(sk);
1601 struct ipv6hdr *hdr;
/* Options, route and flow all come from the socket's cork state. */
1602 struct ipv6_txoptions *opt = np->cork.opt;
1603 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1604 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
/* Next-header value; the option push helpers receive its address. */
1605 unsigned char proto = fl6->flowi6_proto;
/*
 * Take the first queued skb as the head of the packet.
 * NOTE(review): the statement executed when the queue is empty is not in
 * view (line 1609 is missing).
 */
1608 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
/* tail_skb always points at the slot where the next skb must be linked. */
1610 tail_skb = &(skb_shinfo(skb)->frag_list);
1612 /* move skb->data to ip header from ext header */
1613 if (skb->data < skb_network_header(skb))
1614 __skb_pull(skb, skb_network_offset(skb));
/*
 * Chain every remaining queued skb onto the head: strip its leading header
 * room (the length is taken from the head skb's network header length),
 * link it in, and fold its len and truesize into the head's accounting.
 */
1615 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1616 __skb_pull(tmp_skb, skb_network_header_len(skb));
1617 *tail_skb = tmp_skb;
1618 tail_skb = &(tmp_skb->next);
1619 skb->len += tmp_skb->len;
1620 skb->data_len += tmp_skb->len;
1621 skb->truesize += tmp_skb->truesize;
/* The chained skb's truesize was just added to the head, so drop its destructor. */
1622 tmp_skb->destructor = NULL;
/* NOTE(review): the statement controlled by this test (line 1628) is not in view. */
1626 /* Allow local fragmentation. */
1627 if (np->pmtudisc < IPV6_PMTUDISC_DO)
/* Start from the flow's destination; the nfrag option push below gets &final_dst. */
1630 ipv6_addr_copy(final_dst, &fl6->daddr);
/* Drop the head's network-header-length bytes before pushing the real headers. */
1631 __skb_pull(skb, skb_network_header_len(skb));
/* Push the opt_flen options first, then the opt_nflen options; both get &proto. */
1632 if (opt && opt->opt_flen)
1633 ipv6_push_frag_opts(skb, opt, &proto);
1634 if (opt && opt->opt_nflen)
1635 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
/* Make room for the fixed IPv6 header and mark it as the network header. */
1637 skb_push(skb, sizeof(struct ipv6hdr));
1638 skb_reset_network_header(skb);
1639 hdr = ipv6_hdr(skb);
/*
 * First 32-bit word of the header: 0x6 in the top nibble, the corked
 * traffic class shifted to bit 20, OR-ed with the flow's flowlabel.
 */
1641 *(__be32*)hdr = fl6->flowlabel |
1642 htonl(0x60000000 | ((int)np->cork.tclass << 20));
/* Remaining header fields come from the cork state and the flow. */
1644 hdr->hop_limit = np->cork.hop_limit;
1645 hdr->nexthdr = proto;
1646 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1647 ipv6_addr_copy(&hdr->daddr, final_dst);
/* Carry the socket's priority and mark on the packet. */
1649 skb->priority = sk->sk_priority;
1650 skb->mark = sk->sk_mark;
/* Attach the corked route via dst_clone(). */
1652 skb_dst_set(skb, dst_clone(&rt->dst));
/* Account the packet and its length under IPSTATS_MIB_OUT. */
1653 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 packets additionally bump the per-type and OUTMSGS counters. */
1654 if (proto == IPPROTO_ICMPV6) {
1655 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1657 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1658 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
/* Send the assembled packet. */
1661 err = ip6_local_out(skb);
/*
 * NOTE(review): the conditions guarding this conversion (lines 1662-1663)
 * are not in view; presumably it applies only to some return values of
 * ip6_local_out() -- confirm.
 */
1664 err = net_xmit_errno(err);
/* Release the cork state now that the queue has been consumed. */
1670 ip6_cork_release(inet, np);
/*
 * NOTE(review): presumably reached only on the error path (the label and
 * jump around it are not in view); counts the packet as an output discard.
 */
1673 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_flush_pending_frames - empty sk->sk_write_queue without sending and
 * release the socket's cork state.
 * @sk: socket whose pending queue is drained
 *
 * NOTE(review): the numbers embedded at the start of each line skip 1682,
 * 1685-1687 and everything after 1688, so the guard in front of the
 * statistics update, the disposal of each dequeued skb and the end of the
 * function are not visible here -- confirm against the complete file.
 */
1677 void ip6_flush_pending_frames(struct sock *sk)
1679 struct sk_buff *skb;
/* Remove skbs from the tail of the write queue until it is empty. */
1681 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
/* Count an output discard against the inet6_dev of the skb's dst. */
1683 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1684 IPSTATS_MIB_OUTDISCARDS);
/* Finally drop the cork state (options, route reference, stored flow). */
1688 ip6_cork_release(inet_sk(sk), inet6_sk(sk));