2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : airthmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * __ip6_local_out - stamp the IPv6 payload length and traverse the
 * NF_INET_LOCAL_OUT netfilter hook; dst_output continues transmission.
 * NOTE(review): view is fragmentary — the 'len' declaration and the
 * body of the oversize-length branch are on lines not visible here.
 */
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
/* payload_len is 16 bits; the > IPV6_MAXPLEN case is handled off-view
 * (presumably len is forced to 0 for jumbogram handling — confirm) */
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * ip6_local_out - run __ip6_local_out, then dst_output.
 * NOTE(review): the condition guarding the dst_output call (presumably
 * err == 1, i.e. netfilter passed the skb through) is not visible here.
 */
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
/* rewind to the network header and mark the clone as looped-back */
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
/* locally generated data needs no checksum verification on receive */
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
/*
 * ip6_finish_output2 - last step of output: loop multicast copies back
 * to the local stack when required, update multicast stats, then hand
 * the skb to the neighbour layer for L2 resolution and transmission.
 * NOTE(review): view is fragmentary — closing braces, the netif_rx path
 * of the loopback clone and the kfree_skb on failure are off-view.
 */
99 static int ip6_finish_output2(struct sk_buff *skb)
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
105 skb->protocol = htons(ETH_P_IPV6);
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
/* loop a copy back if the socket asked for mcast loopback and we are
 * either the mroute source (not already forwarded) or a group member */
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
/* multicast with hop_limit 0 must never reach the wire */
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
/* hand off via the neighbour cached on the route, if any */
139 neigh = dst_get_neighbour(dst);
141 int res = neigh_output(neigh, skb);
/* no neighbour: account the drop as OUTNOROUTES */
147 IP6_INC_STATS(dev_net(dst->dev),
148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_finish_output - fragment when the non-GSO packet exceeds the path
 * MTU, or when the route demands fragmentation of everything
 * (dst_allfrag); otherwise transmit directly.
 */
153 static int ip6_finish_output(struct sk_buff *skb)
155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 dst_allfrag(skb_dst(skb)))
157 return ip6_fragment(skb, ip6_finish_output2);
159 return ip6_finish_output2(skb);
/*
 * ip6_output - dst_output entry point.  Drops the packet when IPv6 is
 * administratively disabled on the egress device; otherwise runs the
 * POST_ROUTING hook and finishes (possibly fragmenting).
 * NOTE(review): kfree_skb on the disable_ipv6 path is off-view.
 */
162 int ip6_output(struct sk_buff *skb)
164 struct net_device *dev = skb_dst(skb)->dev;
165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 if (unlikely(idev->cnf.disable_ipv6)) {
167 IP6_INC_STATS(dev_net(dev), idev,
168 IPSTATS_MIB_OUTDISCARDS);
/* the hook is skipped when the packet was already rerouted */
173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
/*
 * ip6_xmit - build the IPv6 header (and extension headers from @opt) on
 * an skb and send it via NF_INET_LOCAL_OUT.  Used by stream protocols.
 * NOTE(review): view is fragmentary — 'hdr', 'mtu', 'hlimit'
 * declarations, several closing braces and the kfree_skb/error return
 * at the end are on lines not visible here.
 */
179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 struct ipv6_txoptions *opt, int tclass)
185 struct net *net = sock_net(sk);
186 struct ipv6_pinfo *np = inet6_sk(sk);
187 struct in6_addr *first_hop = &fl6->daddr;
188 struct dst_entry *dst = skb_dst(skb);
190 u8 proto = fl6->flowi6_proto;
191 int seg_len = skb->len;
196 unsigned int head_room;
198 /* First: exthdrs may take lots of space (~8K for now)
199 MAX_HEADER is not enough.
201 head_room = opt->opt_nflen + opt->opt_flen;
202 seg_len += head_room;
203 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
/* not enough headroom for the headers: reallocate or drop */
205 if (skb_headroom(skb) < head_room) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
215 skb_set_owner_w(skb, sk);
/* push fragmentable then non-fragmentable extension headers; the
 * latter may rewrite first_hop (routing header) */
218 ipv6_push_frag_opts(skb, opt, &proto);
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
223 skb_push(skb, sizeof(struct ipv6hdr));
224 skb_reset_network_header(skb);
228 * Fill in the IPv6 header
/* hop limit from the socket, else derived from the route */
231 hlimit = np->hop_limit;
233 hlimit = ip6_dst_hoplimit(dst);
/* version 6 + traffic class + flow label packed into the first word */
235 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
237 hdr->payload_len = htons(seg_len);
238 hdr->nexthdr = proto;
239 hdr->hop_limit = hlimit;
241 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
242 ipv6_addr_copy(&hdr->daddr, first_hop);
244 skb->priority = sk->sk_priority;
245 skb->mark = sk->sk_mark;
/* fits the MTU (or caller allows local fragmentation / GSO): send */
248 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 IPSTATS_MIB_OUT, skb->len);
251 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 dst->dev, dst_output);
/* oversized locally-built packet: tell ourselves via ICMPv6 */
256 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
258 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
264 EXPORT_SYMBOL(ip6_xmit);
/*
 * ip6_nd_hdr - build a bare IPv6 header for neighbour-discovery packets.
 * Deliberate code duplication to keep the TCP fast path lean (see the
 * original comment below).  NOTE(review): 'hdr' assignment, the 'len'
 * and 'proto' parameters and the totlen computation are off-view.
 */
267 * To avoid extra problems ND packets are send through this
268 * routine. It's code duplication but I really want to avoid
269 * extra checks since ipv6_build_header is used by TCP (which
270 * is for us performance critical)
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274 const struct in6_addr *saddr, const struct in6_addr *daddr,
277 struct ipv6_pinfo *np = inet6_sk(sk);
280 skb->protocol = htons(ETH_P_IPV6);
283 skb_reset_network_header(skb);
284 skb_put(skb, sizeof(struct ipv6hdr));
/* version 6, zero traffic class and flow label */
287 *(__be32*)hdr = htonl(0x60000000);
289 hdr->payload_len = htons(len);
290 hdr->nexthdr = proto;
291 hdr->hop_limit = np->hop_limit;
293 ipv6_addr_copy(&hdr->saddr, saddr);
294 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to every raw socket
 * registered for @sel on ip6_ra_chain; clones for all but the last
 * matching socket, which receives the original skb.
 * NOTE(review): the 'if (last)' guards, clone-NULL check and the
 * return values are on lines not visible here.
 */
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
301 struct ip6_ra_chain *ra;
302 struct sock *last = NULL;
304 read_lock(&ip6_ra_lock);
305 for (ra = ip6_ra_chain; ra; ra = ra->next) {
306 struct sock *sk = ra->sk;
/* match the RA selector, honouring any device binding on the socket */
307 if (sk && ra->sel == sel &&
308 (!sk->sk_bound_dev_if ||
309 sk->sk_bound_dev_if == skb->dev->ifindex)) {
311 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
313 rawv6_rcv(last, skb2);
/* last matching socket consumes the original skb */
320 rawv6_rcv(last, skb);
321 read_unlock(&ip6_ra_lock);
324 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - decide how a packet to a proxied address is
 * handled: ND messages are passed to the local input path, link-local
 * destinations are rejected, the rest may be forwarded.
 * NOTE(review): return statements and several branch bodies are on
 * lines not visible here, so exact return codes cannot be confirmed.
 */
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
330 struct ipv6hdr *hdr = ipv6_hdr(skb);
331 u8 nexthdr = hdr->nexthdr;
/* find the transport header, skipping any extension headers */
334 if (ipv6_ext_hdr(nexthdr)) {
335 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
339 offset = sizeof(struct ipv6hdr);
341 if (nexthdr == IPPROTO_ICMPV6) {
342 struct icmp6hdr *icmp6;
/* make sure at least the ICMPv6 type octet is in linear data */
344 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 offset + 1 - skb->data)))
348 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350 switch (icmp6->icmp6_type) {
351 case NDISC_ROUTER_SOLICITATION:
352 case NDISC_ROUTER_ADVERTISEMENT:
353 case NDISC_NEIGHBOUR_SOLICITATION:
354 case NDISC_NEIGHBOUR_ADVERTISEMENT:
356 /* For reaction involving unicast neighbor discovery
357 * message destined to the proxied address, pass it to
367 * The proxying router can't forward traffic sent to a link-local
368 * address, so signal the sender and discard the packet. This
369 * behavior is clarified by the MIPv6 specification.
371 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 dst_link_failure(skb);
/* final step of ip6_forward(): hand the skb to the output path */
379 static inline int ip6_forward_finish(struct sk_buff *skb)
381 return dst_output(skb);
/*
 * ip6_forward - forward a received IPv6 packet: policy and sanity
 * checks, RA delivery, hop-limit handling, ND proxying, redirect
 * generation, source-address validation, MTU check, then the
 * NF_INET_FORWARD hook.  NOTE(review): view is fragmentary — 'n', 'rt',
 * 'mtu' declarations, goto labels ('drop', 'error'), kfree_skb calls
 * and the hop_limit decrement itself are on lines not visible here.
 */
384 int ip6_forward(struct sk_buff *skb)
386 struct dst_entry *dst = skb_dst(skb);
387 struct ipv6hdr *hdr = ipv6_hdr(skb);
388 struct inet6_skb_parm *opt = IP6CB(skb);
389 struct net *net = dev_net(dst->dev);
/* forwarding disabled system-wide: drop (branch body off-view) */
393 if (net->ipv6.devconf_all->forwarding == 0)
396 if (skb_warn_if_lro(skb))
399 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
404 if (skb->pkt_type != PACKET_HOST)
407 skb_forward_csum(skb);
410 * We DO NOT make any processing on
411 * RA packets, pushing them to user level AS IS
412 * without ane WARRANTY that application will be able
413 * to interpret them. The reason is that we
414 * cannot make anything clever here.
416 * We are not end-node, so that if packet contains
417 * AH/ESP, we cannot make anything.
418 * Defragmentation also would be mistake, RA packets
419 * cannot be fragmented, because there is no warranty
420 * that different fragments will go along one path. --ANK
/* opt->ra points at the Router Alert option; bytes 2-3 carry the value */
423 u8 *ptr = skb_network_header(skb) + opt->ra;
424 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
429 * check and decrement ttl
431 if (hdr->hop_limit <= 1) {
432 /* Force OUTPUT device used as source address */
434 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435 IP6_INC_STATS_BH(net,
436 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
442 /* XXX: idev->cnf.proxy_ndp? */
443 if (net->ipv6.devconf_all->proxy_ndp &&
444 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445 int proxied = ip6_forward_proxy_check(skb);
/* proxied > 0 presumably means "handle locally" — condition off-view */
447 return ip6_input(skb);
448 else if (proxied < 0) {
449 IP6_INC_STATS(net, ip6_dst_idev(dst),
450 IPSTATS_MIB_INDISCARDS);
455 if (!xfrm6_route_forward(skb)) {
456 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
461 /* IPv6 specs say nothing about it, but it is clear that we cannot
462 send redirects to source routed frames.
463 We don't send redirects to frames decapsulated from IPsec.
465 n = dst_get_neighbour(dst);
466 if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
467 struct in6_addr *target = NULL;
471 * incoming and outgoing devices are the same
475 rt = (struct rt6_info *) dst;
/* redirect target: the gateway for indirect routes, else the final dst */
476 if ((rt->rt6i_flags & RTF_GATEWAY))
477 target = (struct in6_addr*)&n->primary_key;
479 target = &hdr->daddr;
482 rt6_bind_peer(rt, 1);
484 /* Limit redirects both by destination (here)
485 and by source (inside ndisc_send_redirect)
487 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
488 ndisc_send_redirect(skb, n, target);
490 int addrtype = ipv6_addr_type(&hdr->saddr);
492 /* This check is security critical. */
493 if (addrtype == IPV6_ADDR_ANY ||
494 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
496 if (addrtype & IPV6_ADDR_LINKLOCAL) {
497 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
498 ICMPV6_NOT_NEIGHBOUR, 0);
/* clamp the effective MTU to the IPv6 minimum (assignment off-view) */
504 if (mtu < IPV6_MIN_MTU)
507 if (skb->len > mtu && !skb_is_gso(skb)) {
508 /* Again, force OUTPUT device used as source address */
510 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513 IP6_INC_STATS_BH(net,
514 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
519 if (skb_cow(skb, dst->dev->hard_header_len)) {
520 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
526 /* Mangling hops number delayed to point after skb COW */
530 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
535 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata (type, priority,
 * protocol, dst reference, mark, tc/nf/secmark state) from @from to a
 * freshly built fragment @to.
 */
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 to->pkt_type = from->pkt_type;
544 to->priority = from->priority;
545 to->protocol = from->protocol;
/* each fragment holds its own reference on the route */
547 skb_dst_set(to, dst_clone(skb_dst(from)));
549 to->mark = from->mark;
551 #ifdef CONFIG_NET_SCHED
552 to->tc_index = from->tc_index;
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557 to->nf_trace = from->nf_trace;
559 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension-header chain and return the
 * offset at which a Fragment header must be inserted; *nexthdr is left
 * pointing at the nexthdr byte to patch.
 * NOTE(review): the switch's other cases, loop exit and return are on
 * lines not visible here.
 */
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
564 u16 offset = sizeof(struct ipv6hdr);
565 struct ipv6_opt_hdr *exthdr =
566 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567 unsigned int packet_len = skb->tail - skb->network_header;
569 *nexthdr = &ipv6_hdr(skb)->nexthdr;
/* need at least the nexthdr+hdrlen bytes of each extension header */
571 while (offset + 1 <= packet_len) {
577 case NEXTHDR_ROUTING:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
/* a Home Address option forces the fragment header after it */
582 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
592 offset += ipv6_optlen(exthdr);
593 *nexthdr = &exthdr->nexthdr;
594 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ipv6_select_ident - choose a fragment identification value: from the
 * route's inet_peer when available, otherwise from a global atomic
 * counter.  NOTE(review): the 'ident' declaration and the early-return
 * after the peer branch are on lines not visible here.
 */
601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
603 static atomic_t ipv6_fragmentation_id;
/* per-destination ids via the bound peer, unless the dst opts out */
606 if (rt && !(rt->dst.flags & DST_NOPEER)) {
607 struct inet_peer *peer;
610 rt6_bind_peer(rt, 1);
611 peer = rt->rt6i_peer;
613 fhdr->identification = htonl(inet_getid(peer, 0));
/* fallback: global counter shared by all flows */
617 ident = atomic_inc_return(&ipv6_fragmentation_id);
618 fhdr->identification = htonl(ident);
/*
 * ip6_fragment - split an oversized skb into MTU-sized fragments and
 * pass each to @output.  Two strategies:
 *   fast path: the skb has a well-formed frag_list whose geometry
 *     already matches the MTU — re-use the existing buffers;
 *   slow path: allocate fresh skbs and copy the payload piecewise.
 * NOTE(review): view is fragmentary — 'fh', 'frag_id', several loop
 * headers, error labels and returns are on lines not visible here.
 */
621 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
623 struct sk_buff *frag;
624 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
625 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
626 struct ipv6hdr *tmp_hdr;
628 unsigned int mtu, hlen, left, len;
630 int ptr, offset = 0, err=0;
631 u8 *prevhdr, nexthdr = 0;
632 struct net *net = dev_net(skb_dst(skb)->dev);
634 hlen = ip6_find_1stfragopt(skb, &prevhdr);
637 mtu = ip6_skb_dst_mtu(skb);
639 /* We must not fragment if the socket is set to force MTU discovery
640 * or if the skb it not generated by a local socket.
642 if (!skb->local_df && skb->len > mtu) {
643 skb->dev = skb_dst(skb)->dev;
644 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
645 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
646 IPSTATS_MIB_FRAGFAILS);
/* honour a socket-requested fragment size below the path MTU */
651 if (np && np->frag_size < mtu) {
/* space actually available for payload in each fragment */
655 mtu -= hlen + sizeof(struct frag_hdr);
657 if (skb_has_frag_list(skb)) {
658 int first_len = skb_pagelen(skb);
659 struct sk_buff *frag2;
661 if (first_len - hlen > mtu ||
662 ((first_len - hlen) & 7) ||
/* every list member must fit the MTU, be 8-byte aligned (except the
 * last) and have headroom for the headers we will push */
666 skb_walk_frags(skb, frag) {
667 /* Correct geometry. */
668 if (frag->len > mtu ||
669 ((frag->len & 7) && frag->next) ||
670 skb_headroom(frag) < hlen)
671 goto slow_path_clean;
673 /* Partially cloned skb? */
674 if (skb_shared(frag))
675 goto slow_path_clean;
/* take over truesize accounting from the head skb */
680 frag->destructor = sock_wfree;
682 skb->truesize -= frag->truesize;
687 frag = skb_shinfo(skb)->frag_list;
688 skb_frag_list_init(skb);
/* patch the previous nexthdr and keep a copy of the network headers */
691 *prevhdr = NEXTHDR_FRAGMENT;
692 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
694 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
695 IPSTATS_MIB_FRAGFAILS);
/* open room for the fragment header in the first fragment */
699 __skb_pull(skb, hlen);
700 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
701 __skb_push(skb, hlen);
702 skb_reset_network_header(skb);
703 memcpy(skb_network_header(skb), tmp_hdr, hlen);
705 ipv6_select_ident(fh, rt);
706 fh->nexthdr = nexthdr;
/* first fragment: offset 0, More Fragments set */
708 fh->frag_off = htons(IP6_MF);
709 frag_id = fh->identification;
711 first_len = skb_pagelen(skb);
712 skb->data_len = first_len - skb_headlen(skb);
713 skb->len = first_len;
714 ipv6_hdr(skb)->payload_len = htons(first_len -
715 sizeof(struct ipv6hdr));
720 /* Prepare header of the next frame,
721 * before previous one went down. */
723 frag->ip_summed = CHECKSUM_NONE;
724 skb_reset_transport_header(frag);
725 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
726 __skb_push(frag, hlen);
727 skb_reset_network_header(frag);
728 memcpy(skb_network_header(frag), tmp_hdr,
/* running payload offset for this fragment's frag_off field */
730 offset += skb->len - hlen - sizeof(struct frag_hdr);
731 fh->nexthdr = nexthdr;
733 fh->frag_off = htons(offset);
734 if (frag->next != NULL)
735 fh->frag_off |= htons(IP6_MF);
736 fh->identification = frag_id;
737 ipv6_hdr(frag)->payload_len =
739 sizeof(struct ipv6hdr));
740 ip6_copy_metadata(frag, skb);
745 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
746 IPSTATS_MIB_FRAGCREATES);
759 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
760 IPSTATS_MIB_FRAGOKS);
761 dst_release(&rt->dst);
771 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
772 IPSTATS_MIB_FRAGFAILS);
773 dst_release(&rt->dst);
/* slow_path_clean: undo the destructor/truesize takeover done above */
777 skb_walk_frags(skb, frag2) {
781 frag2->destructor = NULL;
782 skb->truesize += frag2->truesize;
787 left = skb->len - hlen; /* Space per frame */
788 ptr = hlen; /* Where to start from */
791 * Fragment the datagram.
794 *prevhdr = NEXTHDR_FRAGMENT;
797 * Keep copying data until we run out.
801 /* IF: it doesn't fit, use 'mtu' - the data space left */
804 /* IF: we are not sending up to and including the packet end
805 then align the next start on an eight byte boundary */
813 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
814 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
815 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
816 IPSTATS_MIB_FRAGFAILS);
822 * Set up data on packet
825 ip6_copy_metadata(frag, skb);
826 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
827 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
828 skb_reset_network_header(frag);
829 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
830 frag->transport_header = (frag->network_header + hlen +
831 sizeof(struct frag_hdr));
834 * Charge the memory for the fragment to any owner
838 skb_set_owner_w(frag, skb->sk);
841 * Copy the packet header into the new buffer.
843 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
846 * Build fragment header.
848 fh->nexthdr = nexthdr;
/* first iteration picks the id; later ones reuse it (guard off-view) */
851 ipv6_select_ident(fh, rt);
852 frag_id = fh->identification;
854 fh->identification = frag_id;
857 * Copy a block of the IP datagram.
859 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
863 fh->frag_off = htons(offset);
865 fh->frag_off |= htons(IP6_MF);
866 ipv6_hdr(frag)->payload_len = htons(frag->len -
867 sizeof(struct ipv6hdr));
873 * Put this fragment into the sending queue.
879 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
880 IPSTATS_MIB_FRAGCREATES);
882 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
883 IPSTATS_MIB_FRAGOKS);
888 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
889 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - return non-zero when a cached route may be stale for
 * @fl_addr: true unless the key is an exact /128 match for the flow
 * address, or the cached address equals the flow address.
 */
894 static inline int ip6_rt_check(const struct rt6key *rt_key,
895 const struct in6_addr *fl_addr,
896 const struct in6_addr *addr_cache)
898 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
899 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * ip6_sk_dst_check - validate a socket-cached dst against a flow.
 * Returns the dst when still usable; releases it and (presumably)
 * returns NULL when the family, destination, source-subtree key, or
 * output interface no longer match.  NOTE(review): the release/return
 * statements themselves are on lines not visible here.
 */
902 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
903 struct dst_entry *dst,
904 const struct flowi6 *fl6)
906 struct ipv6_pinfo *np = inet6_sk(sk);
/* a non-IPv6 dst (e.g. mapped) can never satisfy an IPv6 flow */
912 if (dst->ops->family != AF_INET6) {
917 rt = (struct rt6_info *)dst;
918 /* Yes, checking route validity in not connected
919 * case is not very simple. Take into account,
920 * that we do not support routing by source, TOS,
921 * and MSG_DONTROUTE --ANK (980726)
923 * 1. ip6_rt_check(): If route was host route,
924 * check that cached destination is current.
925 * If it is network route, we still may
926 * check its validity using saved pointer
927 * to the last used address: daddr_cache.
928 * We do not want to save whole address now,
929 * (because main consumer of this service
930 * is tcp, which has not this problem),
931 * so that the last trick works only on connected
933 * 2. oif also should be the same.
935 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
936 #ifdef CONFIG_IPV6_SUBTREES
937 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
939 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - core route lookup: resolve *dst for @fl6,
 * select a source address if the flow has none, and (optimistic DAD)
 * fall back to the default-router dst when the next hop is unresolved
 * and our source address is still optimistic.
 * NOTE(review): 'n', 'redirect', success return and dst_release on the
 * error path are on lines not visible here.
 */
948 static int ip6_dst_lookup_tail(struct sock *sk,
949 struct dst_entry **dst, struct flowi6 *fl6)
951 struct net *net = sock_net(sk);
952 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
958 *dst = ip6_route_output(net, sk, fl6);
960 if ((err = (*dst)->error))
961 goto out_err_release;
/* flow has no source address yet: pick one for this route */
963 if (ipv6_addr_any(&fl6->saddr)) {
964 struct rt6_info *rt = (struct rt6_info *) *dst;
965 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
966 sk ? inet6_sk(sk)->srcprefs : 0,
969 goto out_err_release;
972 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
974 * Here if the dst entry we've looked up
975 * has a neighbour entry that is in the INCOMPLETE
976 * state and the src address from the flow is
977 * marked as OPTIMISTIC, we release the found
978 * dst entry and replace it instead with the
979 * dst entry of the nexthop router
982 n = dst_get_neighbour(*dst);
983 if (n && !(n->nud_state & NUD_VALID)) {
984 struct inet6_ifaddr *ifp;
985 struct flowi6 fl_gw6;
989 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
992 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
998 * We need to get the dst entry for the
999 * default router instead
/* zero daddr => lookup yields the default route's dst */
1002 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1003 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1004 *dst = ip6_route_output(net, sk, &fl_gw6);
1005 if ((err = (*dst)->error))
1006 goto out_err_release;
1016 if (err == -ENETUNREACH)
1017 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1024 * ip6_dst_lookup - perform route lookup on flow
1025 * @sk: socket which provides route info
1026 * @dst: pointer to dst_entry * for result
1027 * @fl6: flow to lookup
1029 * This function performs a route lookup on the given flow.
1031 * It returns zero on success, or a standard errno code on error.
/* thin exported wrapper; NOTE(review): *dst initialization (if any)
 * before the tail call is on a line not visible here */
1033 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1036 return ip6_dst_lookup_tail(sk, dst, fl6);
1038 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1041 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1042 * @sk: socket which provides route info
1043 * @fl6: flow to lookup
1044 * @final_dst: final destination address for ipsec lookup
1045 * @can_sleep: we are in a sleepable context
1047 * This function performs a route lookup on the given flow.
1049 * It returns a valid dst pointer on success, or a pointer encoded
/* NOTE(review): the can_sleep parameter line and the guards on
 * final_dst/can_sleep are on lines not visible here */
1052 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1053 const struct in6_addr *final_dst,
1056 struct dst_entry *dst = NULL;
1059 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1061 return ERR_PTR(err);
1063 ipv6_addr_copy(&fl6->daddr, final_dst);
1065 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
/* let xfrm apply IPsec transformations on top of the plain route */
1067 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1069 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1072 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1073 * @sk: socket which provides the dst cache and route info
1074 * @fl6: flow to lookup
1075 * @final_dst: final destination address for ipsec lookup
1076 * @can_sleep: we are in a sleepable context
1078 * This function performs a route lookup on the given flow with the
1079 * possibility of using the cached route in the socket if it is valid.
1080 * It will take the socket dst lock when operating on the dst cache.
1081 * As a result, this function can only be used in process context.
1083 * It returns a valid dst pointer on success, or a pointer encoded
/* NOTE(review): can_sleep parameter line and several guards are on
 * lines not visible here */
1086 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1087 const struct in6_addr *final_dst,
/* try the socket's cached dst first, revalidating it for this flow */
1090 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1093 dst = ip6_sk_dst_check(sk, dst, fl6);
1095 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1097 return ERR_PTR(err);
1099 ipv6_addr_copy(&fl6->daddr, final_dst);
1101 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1103 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1105 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * ip6_ufo_append_data - append data for UDP fragmentation offload:
 * build (or extend) one large GSO skb on sk_write_queue and let the
 * device segment it.  NOTE(review): 'err' declaration and the error
 * return after sock_alloc_send_skb are on lines not visible here.
 */
1107 static inline int ip6_ufo_append_data(struct sock *sk,
1108 int getfrag(void *from, char *to, int offset, int len,
1109 int odd, struct sk_buff *skb),
1110 void *from, int length, int hh_len, int fragheaderlen,
1111 int transhdrlen, int mtu,unsigned int flags,
1112 struct rt6_info *rt)
1115 struct sk_buff *skb;
1118 /* There is support for UDP large send offload by network
1119 * device, so create one single skb packet containing complete
/* first call: allocate the head skb that carries the headers */
1122 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1123 struct frag_hdr fhdr;
1125 skb = sock_alloc_send_skb(sk,
1126 hh_len + fragheaderlen + transhdrlen + 20,
1127 (flags & MSG_DONTWAIT), &err);
1131 /* reserve space for Hardware header */
1132 skb_reserve(skb, hh_len);
1134 /* create space for UDP/IP header */
1135 skb_put(skb,fragheaderlen + transhdrlen);
1137 /* initialize network header pointer */
1138 skb_reset_network_header(skb);
1140 /* initialize protocol header pointer */
1141 skb->transport_header = skb->network_header + fragheaderlen;
1143 skb->ip_summed = CHECKSUM_PARTIAL;
1146 /* Specify the length of each IPv6 datagram fragment.
1147 * It has to be a multiple of 8.
1149 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1150 sizeof(struct frag_hdr)) & ~7;
1151 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
/* pick the fragment id up front; the device reuses it per segment */
1152 ipv6_select_ident(&fhdr, rt);
1153 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1154 __skb_queue_tail(&sk->sk_write_queue, skb);
/* payload (minus transport header already counted) goes into frags */
1157 return skb_append_datato_frags(sk, skb, getfrag, from,
1158 (length - transhdrlen));
/* duplicate an extension-header option block (hdrlen is in 8-octet
 * units, excluding the first); NULL-safe.  NOTE(review): the gfp_t
 * parameter line is not visible here. */
1161 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1164 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* duplicate a routing header, same length convention as ip6_opt_dup;
 * NULL-safe.  NOTE(review): the gfp_t parameter line is not visible. */
1167 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1170 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data_mtu - recompute *mtu and *maxfraglen when starting a
 * new fragment in ip6_append_data(): the first fragment reserves the
 * route's header_len, later ones treat that space as payload.
 * NOTE(review): the 'maxfraglen' parameter line and the else branch's
 * mtu assignment are on lines not visible here.
 */
1173 static void ip6_append_data_mtu(unsigned int *mtu,
1175 unsigned int fragheaderlen,
1176 struct sk_buff *skb,
1177 struct rt6_info *rt,
1178 unsigned int orig_mtu)
1180 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1182 /* first fragment, reserve header_len */
1183 *mtu = orig_mtu - rt->dst.header_len;
1187 * this fragment is not first, the headers
1188 * space is regarded as data space.
/* payload per fragment must stay a multiple of 8 octets */
1192 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1193 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * ip6_append_data - corked-send engine: append @length bytes (fetched
 * via @getfrag) to the socket's pending queue, carving the data into
 * fragment-sized skbs as it goes.  First call sets up the cork state
 * (options copy, route, mtu); later calls reuse it.  ip6_push_pending
 * frames() (not in view) finally builds headers and transmits.
 * NOTE(review): view is fragmentary — many declarations ('copy',
 * 'offset', 'hh_len', 'exthdrlen', 'dst_exthdrlen', 'data', 'off',
 * 'left'), error labels and closing braces are on lines not visible.
 */
1197 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1198 int offset, int len, int odd, struct sk_buff *skb),
1199 void *from, int length, int transhdrlen,
1200 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1201 struct rt6_info *rt, unsigned int flags, int dontfrag)
1203 struct inet_sock *inet = inet_sk(sk);
1204 struct ipv6_pinfo *np = inet6_sk(sk);
1205 struct inet_cork *cork;
1206 struct sk_buff *skb, *skb_prev = NULL;
1207 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1214 int csummode = CHECKSUM_NONE;
1217 if (flags&MSG_PROBE)
1219 cork = &inet->cork.base;
/* empty write queue => first call: set up the cork */
1220 if (skb_queue_empty(&sk->sk_write_queue)) {
1225 if (WARN_ON(np->cork.opt))
/* deep-copy the tx options so they survive until flush */
1228 np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1229 if (unlikely(np->cork.opt == NULL))
1232 np->cork.opt->tot_len = opt->tot_len;
1233 np->cork.opt->opt_flen = opt->opt_flen;
1234 np->cork.opt->opt_nflen = opt->opt_nflen;
1236 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1238 if (opt->dst0opt && !np->cork.opt->dst0opt)
1241 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1243 if (opt->dst1opt && !np->cork.opt->dst1opt)
1246 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1248 if (opt->hopopt && !np->cork.opt->hopopt)
1251 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1253 if (opt->srcrt && !np->cork.opt->srcrt)
1256 /* need source address above miyazawa*/
1259 cork->dst = &rt->dst;
1260 inet->cork.fl.u.ip6 = *fl6;
1261 np->cork.hop_limit = hlimit;
1262 np->cork.tclass = tclass;
/* mtu: device mtu for PMTUDISC_PROBE, otherwise from the route (or
 * its untransformed path for xfrm tunnels) */
1263 if (rt->dst.flags & DST_XFRM_TUNNEL)
1264 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1265 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1267 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1268 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1269 if (np->frag_size < mtu) {
1271 mtu = np->frag_size;
1273 cork->fragsize = mtu;
1274 if (dst_allfrag(rt->dst.path))
1275 cork->flags |= IPCORK_ALLFRAG;
1277 sk->sk_sndmsg_page = NULL;
1278 sk->sk_sndmsg_off = 0;
/* first chunk also carries the fragmentable ext headers */
1279 exthdrlen = (opt ? opt->opt_flen : 0);
1280 length += exthdrlen;
1281 transhdrlen += exthdrlen;
1282 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
/* subsequent call: restore route/flow/mtu from the cork */
1284 rt = (struct rt6_info *)cork->dst;
1285 fl6 = &inet->cork.fl.u.ip6;
1290 mtu = cork->fragsize;
1294 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1296 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1297 (opt ? opt->opt_nflen : 0);
1298 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
/* refuse to exceed the 64K payload_len limit */
1300 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1301 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1302 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1307 /* For UDP, check if TX timestamp is enabled */
1308 if (sk->sk_type == SOCK_DGRAM) {
1309 err = sock_tx_timestamp(sk, &tx_flags);
1315 * Let's try using as much space as possible.
1316 * Use MTU if total length of the message fits into the MTU.
1317 * Otherwise, we need to reserve fragment header and
1318 * fragment alignment (= 8-15 octects, in total).
1320 * Note that we may need to "move" the data from the tail of
1321 * of the buffer to the new fragment when we split
1324 * FIXME: It may be fragmented into multiple chunks
1325 * at once if non-fragmentable extension headers
/* IPV6_DONTFRAG: report the mtu to the app instead of fragmenting */
1330 if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
1331 sk->sk_protocol == IPPROTO_RAW)) {
1332 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1336 skb = skb_peek_tail(&sk->sk_write_queue);
1337 cork->length += length;
/* oversized UDP on a UFO-capable device: offload segmentation */
1338 if (((length > mtu) ||
1339 (skb && skb_has_frags(skb))) &&
1340 (sk->sk_protocol == IPPROTO_UDP) &&
1341 (rt->dst.dev->features & NETIF_F_UFO)) {
1342 err = ip6_ufo_append_data(sk, getfrag, from, length,
1343 hh_len, fragheaderlen,
1344 transhdrlen, mtu, flags, rt);
1353 while (length > 0) {
1354 /* Check if the remaining data fits into current packet. */
1355 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1357 copy = maxfraglen - skb->len;
1361 unsigned int datalen;
1362 unsigned int fraglen;
1363 unsigned int fraggap;
1364 unsigned int alloclen;
1366 /* There's no room in the current skb */
/* bytes past the 8-byte boundary that must move to the new skb */
1368 fraggap = skb->len - maxfraglen;
1371 /* update mtu and maxfraglen if necessary */
1372 if (skb == NULL || skb_prev == NULL)
1373 ip6_append_data_mtu(&mtu, &maxfraglen,
1374 fragheaderlen, skb, rt,
1380 * If remaining data exceeds the mtu,
1381 * we know we need more fragment(s).
1383 datalen = length + fraggap;
1385 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1386 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1387 if ((flags & MSG_MORE) &&
1388 !(rt->dst.dev->features&NETIF_F_SG))
1391 alloclen = datalen + fragheaderlen;
1393 alloclen += dst_exthdrlen;
1395 if (datalen != length + fraggap) {
1397 * this is not the last fragment, the trailer
1398 * space is regarded as data space.
1400 datalen += rt->dst.trailer_len;
1403 alloclen += rt->dst.trailer_len;
1404 fraglen = datalen + fragheaderlen;
1407 * We just reserve space for fragment header.
1408 * Note: this may be overallocation if the message
1409 * (without MSG_MORE) fits into the MTU.
1411 alloclen += sizeof(struct frag_hdr);
/* blocking alloc when transhdrlen (first chunk), else best-effort
 * within the send buffer limit */
1414 skb = sock_alloc_send_skb(sk,
1416 (flags & MSG_DONTWAIT), &err);
1419 if (atomic_read(&sk->sk_wmem_alloc) <=
1421 skb = sock_wmalloc(sk,
1422 alloclen + hh_len, 1,
1424 if (unlikely(skb == NULL))
1427 /* Only the initial fragment
1436 * Fill in the control structures
1438 skb->ip_summed = csummode;
1440 /* reserve for fragmentation and ipsec header */
1441 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1444 if (sk->sk_type == SOCK_DGRAM)
1445 skb_shinfo(skb)->tx_flags = tx_flags;
1448 * Find where to start putting bytes
1450 data = skb_put(skb, fraglen);
1451 skb_set_network_header(skb, exthdrlen);
1452 data += fragheaderlen;
1453 skb->transport_header = (skb->network_header +
/* move the fraggap tail bytes (and their checksum) from the
 * previous skb into this one */
1456 skb->csum = skb_copy_and_csum_bits(
1457 skb_prev, maxfraglen,
1458 data + transhdrlen, fraggap, 0);
1459 skb_prev->csum = csum_sub(skb_prev->csum,
1462 pskb_trim_unique(skb_prev, maxfraglen);
1464 copy = datalen - transhdrlen - fraggap;
1470 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1477 length -= datalen - fraggap;
/* transport header and checksum offload apply to the first skb only */
1481 csummode = CHECKSUM_NONE;
1484 * Put the packet on the pending queue
1486 __skb_queue_tail(&sk->sk_write_queue, skb);
/* non-SG device: copy into the linear area of the current skb */
1493 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1497 if (getfrag(from, skb_put(skb, copy),
1498 offset, copy, off, skb) < 0) {
1499 __skb_trim(skb, off);
/* SG device: append into page fragments, reusing the socket's
 * current send page when possible */
1504 int i = skb_shinfo(skb)->nr_frags;
1505 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1506 struct page *page = sk->sk_sndmsg_page;
1507 int off = sk->sk_sndmsg_off;
1510 if (page && (left = PAGE_SIZE - off) > 0) {
1513 if (page != skb_frag_page(frag)) {
1514 if (i == MAX_SKB_FRAGS) {
1518 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1519 skb_frag_ref(skb, i);
1520 frag = &skb_shinfo(skb)->frags[i];
1522 } else if(i < MAX_SKB_FRAGS) {
1523 if (copy > PAGE_SIZE)
1525 page = alloc_pages(sk->sk_allocation, 0);
1530 sk->sk_sndmsg_page = page;
1531 sk->sk_sndmsg_off = 0;
1533 skb_fill_page_desc(skb, i, page, 0, 0);
1534 frag = &skb_shinfo(skb)->frags[i];
1540 skb_frag_address(frag) + skb_frag_size(frag),
1541 offset, copy, skb->len, skb) < 0) {
1545 sk->sk_sndmsg_off += copy;
1546 skb_frag_size_add(frag, copy);
1548 skb->data_len += copy;
1549 skb->truesize += copy;
1550 atomic_add(copy, &sk->sk_wmem_alloc);
/* error path: roll back the optimistic cork->length bump */
1557 cork->length -= length;
1558 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - free per-socket cork state after a corked send
 * completes or is aborted.
 *
 * Frees the cached IPv6 tx options (each extension-header buffer, then the
 * option block itself), drops the cached route, clears IPCORK_ALLFRAG and
 * wipes the cached flow.  NOTE(review): the surrounding braces and the
 * `np->cork.opt != NULL` guard presumably present before the kfree() calls
 * are elided from this extract — confirm against the full file.
 */
1562 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1565 kfree(np->cork.opt->dst0opt);
1566 kfree(np->cork.opt->dst1opt);
1567 kfree(np->cork.opt->hopopt);
1568 kfree(np->cork.opt->srcrt);
1569 kfree(np->cork.opt);
1570 np->cork.opt = NULL;
/* release the route cached for the corked transmission, if any */
1573 if (inet->cork.base.dst) {
1574 dst_release(inet->cork.base.dst);
1575 inet->cork.base.dst = NULL;
1576 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
/* forget the flow the cork was keyed on */
1578 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - send everything queued by ip6_append_data().
 *
 * Coalesces all skbs on sk->sk_write_queue into one packet (first skb plus
 * a frag_list chain), pushes the corked extension headers, builds the IPv6
 * header from the corked flow, and hands the packet to ip6_local_out().
 * Releases the cork state when done.  NOTE(review): braces, the local
 * `err` declaration and the out/error labels are elided from this extract.
 */
1581 int ip6_push_pending_frames(struct sock *sk)
1583 struct sk_buff *skb, *tmp_skb;
1584 struct sk_buff **tail_skb;
1585 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1586 struct inet_sock *inet = inet_sk(sk);
1587 struct ipv6_pinfo *np = inet6_sk(sk);
1588 struct net *net = sock_net(sk);
1589 struct ipv6hdr *hdr;
1590 struct ipv6_txoptions *opt = np->cork.opt;
1591 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1592 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1593 unsigned char proto = fl6->flowi6_proto;
/* nothing corked: exit path elided */
1596 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1598 tail_skb = &(skb_shinfo(skb)->frag_list);
1600 /* move skb->data to ip header from ext header */
1601 if (skb->data < skb_network_header(skb))
1602 __skb_pull(skb, skb_network_offset(skb));
/* chain every remaining queued skb onto the head skb's frag_list */
1603 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1604 __skb_pull(tmp_skb, skb_network_header_len(skb));
1605 *tail_skb = tmp_skb;
1606 tail_skb = &(tmp_skb->next);
1607 skb->len += tmp_skb->len;
1608 skb->data_len += tmp_skb->len;
1609 skb->truesize += tmp_skb->truesize;
/* the head skb now owns this memory; drop the child's destructor */
1610 tmp_skb->destructor = NULL;
1614 /* Allow local fragmentation. */
1615 if (np->pmtudisc < IPV6_PMTUDISC_DO)
/* keep the original daddr: routing-header push below may rewrite it */
1618 ipv6_addr_copy(final_dst, &fl6->daddr);
1619 __skb_pull(skb, skb_network_header_len(skb));
/* push corked extension headers in front of the payload */
1620 if (opt && opt->opt_flen)
1621 ipv6_push_frag_opts(skb, opt, &proto);
1622 if (opt && opt->opt_nflen)
1623 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
/* build the fixed IPv6 header */
1625 skb_push(skb, sizeof(struct ipv6hdr));
1626 skb_reset_network_header(skb);
1627 hdr = ipv6_hdr(skb);
/* first 32 bits: version 6, corked traffic class, flow label */
1629 *(__be32*)hdr = fl6->flowlabel |
1630 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1632 hdr->hop_limit = np->cork.hop_limit;
1633 hdr->nexthdr = proto;
1634 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1635 ipv6_addr_copy(&hdr->daddr, final_dst);
1637 skb->priority = sk->sk_priority;
1638 skb->mark = sk->sk_mark;
1640 skb_dst_set(skb, dst_clone(&rt->dst));
1641 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 keeps its own per-message-type output counters */
1642 if (proto == IPPROTO_ICMPV6) {
1643 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1645 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1646 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1649 err = ip6_local_out(skb);
/* positive NET_XMIT codes are mapped to errno-style returns */
1652 err = net_xmit_errno(err);
1658 ip6_cork_release(inet, np);
/* error path (label elided): count the discarded packet */
1661 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_flush_pending_frames - drop everything queued by ip6_append_data()
 * without sending it (e.g. on error or socket teardown).
 *
 * Dequeues and discards each pending skb, counting each as an output
 * discard, then releases the cork state.  NOTE(review): the loop body's
 * braces and the skb-freeing call are elided from this extract.
 */
1665 void ip6_flush_pending_frames(struct sock *sk)
1667 struct sk_buff *skb;
1669 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1671 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1672 IPSTATS_MIB_OUTDISCARDS);
1676 ip6_cork_release(inet_sk(sk), inet6_sk(sk));