2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
/*
 * __ip6_local_out - finalize a locally generated IPv6 packet and run it
 * through the NF_INET_LOCAL_OUT netfilter hook towards dst_output().
 * Fills in the IPv6 payload_len from skb->len minus the fixed header.
 * NOTE(review): this excerpt is elided — braces/declarations and the
 * over-IPV6_MAXPLEN branch body are missing from view; confirm against
 * the full source before relying on this text.
 */
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
/* payload_len is network byte order per RFC 2460 */
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
/*
 * ip6_local_out - public wrapper: run __ip6_local_out() and, on its
 * success path, hand the skb to dst_output(). Exported for modules.
 * NOTE(review): the condition guarding the dst_output() call is elided
 * here (presumably err == 1, i.e. netfilter okfn semantics — verify).
 */
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
86 /* dev_loopback_xmit for use with netfilter. */
/*
 * Loop a cloned multicast skb back to the local stack: rewind to the
 * network header, mark it PACKET_LOOPBACK and skip checksum validation.
 * NOTE(review): the tail of the function (the actual netif_rx/return)
 * is elided from this excerpt.
 */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
/* a loopback-delivered skb must still carry a dst */
93 WARN_ON(!skb_dst(newskb));
/*
 * ip6_finish_output2 - last step of output: handle multicast loopback /
 * hop-limit-0 discard, update multicast counters, then transmit via the
 * dst's neighbour entry. Counts OUTNOROUTES when no neighbour is found.
 * NOTE(review): excerpt is elided (clone-NULL check, stats args, the
 * neigh NULL branch and return paths are missing from view).
 */
99 static int ip6_finish_output2(struct sk_buff *skb)
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
103 struct neighbour *neigh;
105 skb->protocol = htons(ETH_P_IPV6);
108 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
/* loop a copy back locally when the sender requested mcast loop and
 * either the mroute socket wants it or we are a group member */
111 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 ((mroute6_socket(dev_net(dev), skb) &&
113 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 &ipv6_hdr(skb)->saddr))) {
116 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
118 /* Do not check for IFF_ALLMULTI; multicast routing
119 is not supported in any case.
122 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 newskb, NULL, newskb->dev,
124 ip6_dev_loopback_xmit);
/* hop_limit 0 means the packet must not leave the node */
126 if (ipv6_hdr(skb)->hop_limit == 0) {
127 IP6_INC_STATS(dev_net(dev), idev,
128 IPSTATS_MIB_OUTDISCARDS);
134 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
139 neigh = dst_get_neighbour(dst);
141 int res = neigh_output(neigh, skb);
/* no neighbour: account the drop as "no route" */
147 IP6_INC_STATS(dev_net(dst->dev),
148 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
/*
 * ip6_finish_output - fragment the packet if it exceeds the path MTU
 * (and is not GSO) or if the dst demands fragmentation of everything
 * (dst_allfrag); otherwise transmit directly via ip6_finish_output2().
 */
153 static int ip6_finish_output(struct sk_buff *skb)
155 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 dst_allfrag(skb_dst(skb)))
157 return ip6_fragment(skb, ip6_finish_output2);
159 return ip6_finish_output2(skb);
/*
 * ip6_output - IPv6 dst->output() entry point. Drops (and counts
 * OUTDISCARDS) when IPv6 is administratively disabled on the device,
 * otherwise runs the POST_ROUTING hook conditionally (skipped when the
 * skb was already rerouted) on the way to ip6_finish_output().
 * NOTE(review): the discard branch body and the hook's okfn argument
 * are elided from this excerpt.
 */
162 int ip6_output(struct sk_buff *skb)
164 struct net_device *dev = skb_dst(skb)->dev;
165 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 if (unlikely(idev->cnf.disable_ipv6)) {
167 IP6_INC_STATS(dev_net(dev), idev,
168 IPSTATS_MIB_OUTDISCARDS);
173 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
175 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
179 * xmit an sk_buff (used by TCP, SCTP and DCCP)
/*
 * ip6_xmit - build the IPv6 header (flow label, traffic class, hop
 * limit, addresses) plus any extension headers from @opt in front of
 * the transport payload and send it through NF_INET_LOCAL_OUT.
 * Reallocates headroom when the exthdrs + link-layer space do not fit.
 * If the final packet exceeds the MTU and may not be sent as-is, a
 * PKT_TOOBIG ICMP is generated back to the sender and FRAGFAILS is
 * counted. NOTE(review): excerpt is elided (mtu assignment, kfree_skb
 * calls, return statements and some conditionals are missing).
 */
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 struct ipv6_txoptions *opt, int tclass)
185 struct net *net = sock_net(sk);
186 struct ipv6_pinfo *np = inet6_sk(sk);
187 struct in6_addr *first_hop = &fl6->daddr;
188 struct dst_entry *dst = skb_dst(skb);
190 u8 proto = fl6->flowi6_proto;
191 int seg_len = skb->len;
196 unsigned int head_room;
198 /* First: exthdrs may take lots of space (~8K for now)
199 MAX_HEADER is not enough.
201 head_room = opt->opt_nflen + opt->opt_flen;
202 seg_len += head_room;
203 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
205 if (skb_headroom(skb) < head_room) {
206 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
/* allocation failure: count the discard */
208 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 IPSTATS_MIB_OUTDISCARDS);
215 skb_set_owner_w(skb, sk);
/* fragmentable options go after the (future) fragment header,
 * non-fragmentable ones (incl. routing header that may rewrite
 * first_hop) go before it */
218 ipv6_push_frag_opts(skb, opt, &proto);
220 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
223 skb_push(skb, sizeof(struct ipv6hdr));
224 skb_reset_network_header(skb);
228 * Fill in the IPv6 header
/* per-socket hop limit wins; fall back to the route's default */
231 hlimit = np->hop_limit;
233 hlimit = ip6_dst_hoplimit(dst);
/* version 6 | traffic class | flow label in one 32-bit word */
235 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
237 hdr->payload_len = htons(seg_len);
238 hdr->nexthdr = proto;
239 hdr->hop_limit = hlimit;
241 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
242 ipv6_addr_copy(&hdr->daddr, first_hop);
244 skb->priority = sk->sk_priority;
245 skb->mark = sk->sk_mark;
248 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 IPSTATS_MIB_OUT, skb->len);
251 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 dst->dev, dst_output);
/* oversized and not allowed to fragment locally: tell ourselves */
256 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
258 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
264 EXPORT_SYMBOL(ip6_xmit);
267 * To avoid extra problems ND packets are send through this
268 * routine. It's code duplication but I really want to avoid
269 * extra checks since ipv6_build_header is used by TCP (which
270 * is for us performance critical)
/*
 * ip6_nd_hdr - build a minimal IPv6 header for neighbour-discovery
 * packets: version 6, zero traffic class/flow label, socket hop limit,
 * given source/destination addresses.
 * NOTE(review): the parameter list tail (len/proto) and the return are
 * elided from this excerpt.
 */
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274 const struct in6_addr *saddr, const struct in6_addr *daddr,
277 struct ipv6_pinfo *np = inet6_sk(sk);
280 skb->protocol = htons(ETH_P_IPV6);
283 skb_reset_network_header(skb);
284 skb_put(skb, sizeof(struct ipv6hdr));
/* version 6, tclass 0, flow label 0 */
287 *(__be32*)hdr = htonl(0x60000000);
289 hdr->payload_len = htons(len);
290 hdr->nexthdr = proto;
291 hdr->hop_limit = np->hop_limit;
293 ipv6_addr_copy(&hdr->saddr, saddr);
294 ipv6_addr_copy(&hdr->daddr, daddr);
/*
 * ip6_call_ra_chain - deliver a Router-Alert packet to every raw socket
 * registered on ip6_ra_chain with a matching selector @sel, cloning the
 * skb for all but the last matching socket (which consumes the
 * original). Returns whether the packet was consumed.
 * NOTE(review): the clone-NULL handling, `last = sk` assignment and the
 * return statements are elided from this excerpt.
 */
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
301 struct ip6_ra_chain *ra;
302 struct sock *last = NULL;
304 read_lock(&ip6_ra_lock);
305 for (ra = ip6_ra_chain; ra; ra = ra->next) {
306 struct sock *sk = ra->sk;
/* match selector and (if bound) the incoming interface */
307 if (sk && ra->sel == sel &&
308 (!sk->sk_bound_dev_if ||
309 sk->sk_bound_dev_if == skb->dev->ifindex)) {
311 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
313 rawv6_rcv(last, skb2);
/* last matching socket gets the original skb */
320 rawv6_rcv(last, skb);
321 read_unlock(&ip6_ra_lock);
324 read_unlock(&ip6_ra_lock);
/*
 * ip6_forward_proxy_check - decide how to treat a packet destined to a
 * proxied (NDP-proxy) address: ND ICMPv6 messages are passed up for
 * local handling, link-local destinations are rejected with a link
 * failure, others may be forwarded.
 * NOTE(review): excerpt is elided — the offset<0 check, switch fall
 * bodies and the final return values are missing from view.
 */
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
330 struct ipv6hdr *hdr = ipv6_hdr(skb);
331 u8 nexthdr = hdr->nexthdr;
/* skip any extension headers to find the transport protocol */
334 if (ipv6_ext_hdr(nexthdr)) {
335 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
339 offset = sizeof(struct ipv6hdr);
341 if (nexthdr == IPPROTO_ICMPV6) {
342 struct icmp6hdr *icmp6;
/* make sure at least the ICMPv6 type octet is in linear data */
344 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 offset + 1 - skb->data)))
348 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350 switch (icmp6->icmp6_type) {
351 case NDISC_ROUTER_SOLICITATION:
352 case NDISC_ROUTER_ADVERTISEMENT:
353 case NDISC_NEIGHBOUR_SOLICITATION:
354 case NDISC_NEIGHBOUR_ADVERTISEMENT:
356 /* For reaction involving unicast neighbor discovery
357 * message destined to the proxied address, pass it to
367 * The proxying router can't forward traffic sent to a link-local
368 * address, so signal the sender and discard the packet. This
369 * behavior is clarified by the MIPv6 specification.
371 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 dst_link_failure(skb);
/* Final step of forwarding: hand the skb to the route's output path. */
379 static inline int ip6_forward_finish(struct sk_buff *skb)
381 return dst_output(skb);
/*
 * ip6_forward - forward an IPv6 packet: policy and sanity checks
 * (forwarding enabled, no LRO skb, XFRM FWD policy, unicast pkt_type),
 * Router-Alert delivery, hop-limit decrement with TIME_EXCEED on
 * expiry, NDP-proxy handling, redirect generation toward on-link
 * sources, source-address sanity, MTU check with PKT_TOOBIG, then
 * NF_INET_FORWARD towards ip6_forward_finish().
 * NOTE(review): heavily elided — goto labels/targets, several drop
 * paths and the mtu assignment are missing from this excerpt.
 */
384 int ip6_forward(struct sk_buff *skb)
386 struct dst_entry *dst = skb_dst(skb);
387 struct ipv6hdr *hdr = ipv6_hdr(skb);
388 struct inet6_skb_parm *opt = IP6CB(skb);
389 struct net *net = dev_net(dst->dev);
393 if (net->ipv6.devconf_all->forwarding == 0)
/* forwarding an LRO-aggregated skb would corrupt it */
396 if (skb_warn_if_lro(skb))
399 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
404 if (skb->pkt_type != PACKET_HOST)
407 skb_forward_csum(skb);
410 * We DO NOT make any processing on
411 * RA packets, pushing them to user level AS IS
412 * without ane WARRANTY that application will be able
413 * to interpret them. The reason is that we
414 * cannot make anything clever here.
416 * We are not end-node, so that if packet contains
417 * AH/ESP, we cannot make anything.
418 * Defragmentation also would be mistake, RA packets
419 * cannot be fragmented, because there is no warranty
420 * that different fragments will go along one path. --ANK
423 u8 *ptr = skb_network_header(skb) + opt->ra;
/* router-alert value is 16 bits, big-endian, at ptr[2..3] */
424 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
429 * check and decrement ttl
431 if (hdr->hop_limit <= 1) {
432 /* Force OUTPUT device used as source address */
434 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435 IP6_INC_STATS_BH(net,
436 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
442 /* XXX: idev->cnf.proxy_ndp? */
443 if (net->ipv6.devconf_all->proxy_ndp &&
444 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445 int proxied = ip6_forward_proxy_check(skb);
447 return ip6_input(skb);
448 else if (proxied < 0) {
449 IP6_INC_STATS(net, ip6_dst_idev(dst),
450 IPSTATS_MIB_INDISCARDS);
455 if (!xfrm6_route_forward(skb)) {
456 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
461 /* IPv6 specs say nothing about it, but it is clear that we cannot
462 send redirects to source routed frames.
463 We don't send redirects to frames decapsulated from IPsec.
465 n = dst_get_neighbour(dst);
466 if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
467 struct in6_addr *target = NULL;
471 * incoming and outgoing devices are the same
475 rt = (struct rt6_info *) dst;
/* redirect target: the gateway's address for indirect routes,
 * otherwise the final destination itself */
476 if ((rt->rt6i_flags & RTF_GATEWAY))
477 target = (struct in6_addr*)&n->primary_key;
479 target = &hdr->daddr;
482 rt6_bind_peer(rt, 1);
484 /* Limit redirects both by destination (here)
485 and by source (inside ndisc_send_redirect)
487 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
488 ndisc_send_redirect(skb, n, target);
490 int addrtype = ipv6_addr_type(&hdr->saddr);
492 /* This check is security critical. */
493 if (addrtype == IPV6_ADDR_ANY ||
494 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
496 if (addrtype & IPV6_ADDR_LINKLOCAL) {
497 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
498 ICMPV6_NOT_NEIGHBOUR, 0);
/* never advertise an MTU below the IPv6 minimum (1280) */
504 if (mtu < IPV6_MIN_MTU)
507 if (skb->len > mtu && !skb_is_gso(skb)) {
508 /* Again, force OUTPUT device used as source address */
510 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513 IP6_INC_STATS_BH(net,
514 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
/* we are about to modify hop_limit: must own the header */
519 if (skb_cow(skb, dst->dev->hard_header_len)) {
520 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
526 /* Mangling hops number delayed to point after skb COW */
530 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
535 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
/*
 * ip6_copy_metadata - copy per-packet metadata (type, priority,
 * protocol, dst reference, mark, tc index, nf trace flag, secmark)
 * from @from to @to; used when building fragments so each fragment
 * carries the original packet's bookkeeping.
 */
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 to->pkt_type = from->pkt_type;
544 to->priority = from->priority;
545 to->protocol = from->protocol;
/* each fragment holds its own reference on the dst */
547 skb_dst_set(to, dst_clone(skb_dst(from)));
549 to->mark = from->mark;
551 #ifdef CONFIG_NET_SCHED
552 to->tc_index = from->tc_index;
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557 to->nf_trace = from->nf_trace;
559 skb_copy_secmark(to, from);
/*
 * ip6_find_1stfragopt - walk the extension-header chain to find where
 * the Fragment header must be inserted (after per-fragment headers:
 * hop-by-hop, routing, and a dest-opts header carrying a HAO option).
 * On return, @nexthdr points at the "next header" field that should be
 * rewritten to NEXTHDR_FRAGMENT; returns the byte offset.
 * NOTE(review): the switch's other cases and the terminating return
 * are elided from this excerpt.
 */
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
564 u16 offset = sizeof(struct ipv6hdr);
565 struct ipv6_opt_hdr *exthdr =
566 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567 unsigned int packet_len = skb->tail - skb->network_header;
569 *nexthdr = &ipv6_hdr(skb)->nexthdr;
571 while (offset + 1 <= packet_len) {
577 case NEXTHDR_ROUTING:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
/* a Home-Address option forces the dest-opts header to stay
 * in the per-fragment (unfragmentable) part */
582 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
592 offset += ipv6_optlen(exthdr);
593 *nexthdr = &exthdr->nexthdr;
594 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
/*
 * ipv6_select_ident - choose the 32-bit Identification value for a
 * Fragment header. Hashes the route's destination and source prefixes
 * with a lazily initialised random key, then reserves an id via
 * ip_idents_reserve() and stores it in network byte order.
 * NOTE(review): the lazy init here is not race-free in itself; whether
 * a caller-side lock makes it safe is not visible in this excerpt.
 */
601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
603 static u32 ip6_idents_hashrnd __read_mostly;
604 static bool hashrnd_initialized = false;
607 if (unlikely(!hashrnd_initialized)) {
608 hashrnd_initialized = true;
609 get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
611 hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
612 hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);
614 id = ip_idents_reserve(hash, 1);
615 fhdr->identification = htonl(id);
/*
 * ip6_fragment - split an IPv6 packet into MTU-sized fragments and emit
 * each one through @output. Two strategies:
 *  - fast path: when the skb has a well-formed frag list (each piece
 *    fits the MTU, 8-byte aligned, enough headroom, not shared), the
 *    existing pieces are converted in place into fragments;
 *  - slow path: otherwise each fragment is a freshly allocated skb and
 *    the payload is copied with skb_copy_bits().
 * Sends PKT_TOOBIG instead when the packet may not be fragmented
 * locally (no local_df) and counts FRAGFAILS/FRAGOKS/FRAGCREATES.
 * NOTE(review): heavily elided — error gotos, loop headers, several
 * length computations and returns are missing from this excerpt.
 */
618 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
620 struct sk_buff *frag;
621 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
622 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
623 struct ipv6hdr *tmp_hdr;
625 unsigned int mtu, hlen, left, len;
627 int ptr, offset = 0, err=0;
628 u8 *prevhdr, nexthdr = 0;
629 struct net *net = dev_net(skb_dst(skb)->dev);
/* hlen = length of the unfragmentable (per-fragment) part */
631 hlen = ip6_find_1stfragopt(skb, &prevhdr);
634 mtu = ip6_skb_dst_mtu(skb);
636 /* We must not fragment if the socket is set to force MTU discovery
637 * or if the skb it not generated by a local socket.
639 if (!skb->local_df && skb->len > mtu) {
640 skb->dev = skb_dst(skb)->dev;
641 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
642 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
643 IPSTATS_MIB_FRAGFAILS);
/* honour a smaller per-socket fragment size if configured */
648 if (np && np->frag_size < mtu) {
/* mtu now means: payload bytes available per fragment */
652 mtu -= hlen + sizeof(struct frag_hdr);
654 if (skb_has_frag_list(skb)) {
655 int first_len = skb_pagelen(skb);
656 struct sk_buff *frag2;
658 if (first_len - hlen > mtu ||
659 ((first_len - hlen) & 7) ||
663 skb_walk_frags(skb, frag) {
664 /* Correct geometry. */
665 if (frag->len > mtu ||
666 ((frag->len & 7) && frag->next) ||
667 skb_headroom(frag) < hlen)
668 goto slow_path_clean;
670 /* Partially cloned skb? */
671 if (skb_shared(frag))
672 goto slow_path_clean;
/* take over write accounting from the head skb */
677 frag->destructor = sock_wfree;
679 skb->truesize -= frag->truesize;
684 frag = skb_shinfo(skb)->frag_list;
685 skb_frag_list_init(skb);
/* keep a copy of the unfragmentable part to replicate into
 * every fragment */
688 *prevhdr = NEXTHDR_FRAGMENT;
689 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
692 IPSTATS_MIB_FRAGFAILS);
696 __skb_pull(skb, hlen);
697 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
698 __skb_push(skb, hlen);
699 skb_reset_network_header(skb);
700 memcpy(skb_network_header(skb), tmp_hdr, hlen);
702 ipv6_select_ident(fh, rt);
703 fh->nexthdr = nexthdr;
/* first fragment: offset 0, More-Fragments set */
705 fh->frag_off = htons(IP6_MF);
706 frag_id = fh->identification;
708 first_len = skb_pagelen(skb);
709 skb->data_len = first_len - skb_headlen(skb);
710 skb->len = first_len;
711 ipv6_hdr(skb)->payload_len = htons(first_len -
712 sizeof(struct ipv6hdr));
717 /* Prepare header of the next frame,
718 * before previous one went down. */
720 frag->ip_summed = CHECKSUM_NONE;
721 skb_reset_transport_header(frag);
722 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
723 __skb_push(frag, hlen);
724 skb_reset_network_header(frag);
725 memcpy(skb_network_header(frag), tmp_hdr,
727 offset += skb->len - hlen - sizeof(struct frag_hdr);
728 fh->nexthdr = nexthdr;
730 fh->frag_off = htons(offset);
731 if (frag->next != NULL)
732 fh->frag_off |= htons(IP6_MF);
733 fh->identification = frag_id;
734 ipv6_hdr(frag)->payload_len =
736 sizeof(struct ipv6hdr));
737 ip6_copy_metadata(frag, skb);
742 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743 IPSTATS_MIB_FRAGCREATES);
756 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757 IPSTATS_MIB_FRAGOKS);
758 dst_release(&rt->dst);
768 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
769 IPSTATS_MIB_FRAGFAILS);
770 dst_release(&rt->dst);
/* slow_path_clean: undo the ownership changes made above */
774 skb_walk_frags(skb, frag2) {
778 frag2->destructor = NULL;
779 skb->truesize += frag2->truesize;
/* ---- slow path: allocate and copy each fragment ---- */
784 left = skb->len - hlen; /* Space per frame */
785 ptr = hlen; /* Where to start from */
788 * Fragment the datagram.
791 *prevhdr = NEXTHDR_FRAGMENT;
794 * Keep copying data until we run out.
798 /* IF: it doesn't fit, use 'mtu' - the data space left */
801 /* IF: we are not sending up to and including the packet end
802 then align the next start on an eight byte boundary */
810 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
811 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
812 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
813 IPSTATS_MIB_FRAGFAILS);
819 * Set up data on packet
822 ip6_copy_metadata(frag, skb);
823 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
824 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
825 skb_reset_network_header(frag);
826 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
827 frag->transport_header = (frag->network_header + hlen +
828 sizeof(struct frag_hdr));
831 * Charge the memory for the fragment to any owner
835 skb_set_owner_w(frag, skb->sk);
838 * Copy the packet header into the new buffer.
840 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
843 * Build fragment header.
845 fh->nexthdr = nexthdr;
/* pick an id on the first fragment, reuse it afterwards */
848 ipv6_select_ident(fh, rt);
849 frag_id = fh->identification;
851 fh->identification = frag_id;
854 * Copy a block of the IP datagram.
856 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
860 fh->frag_off = htons(offset);
862 fh->frag_off |= htons(IP6_MF);
863 ipv6_hdr(frag)->payload_len = htons(frag->len -
864 sizeof(struct ipv6hdr));
870 * Put this fragment into the sending queue.
876 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
877 IPSTATS_MIB_FRAGCREATES);
879 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
880 IPSTATS_MIB_FRAGOKS);
885 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - return non-zero when a cached route key no longer
 * matches the flow address: true unless the key is a /128 host route
 * for @fl_addr, or @fl_addr equals the cached last-used address.
 */
891 static inline int ip6_rt_check(const struct rt6key *rt_key,
892 const struct in6_addr *fl_addr,
893 const struct in6_addr *addr_cache)
895 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
896 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
/*
 * ip6_sk_dst_check - validate a socket-cached dst against the flow:
 * reject non-IPv6 dsts, stale destination/source routes (via
 * ip6_rt_check) and oif mismatches. Returns the dst when still usable.
 * NOTE(review): the release-and-return-NULL path and final return are
 * elided from this excerpt.
 */
899 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
900 struct dst_entry *dst,
901 const struct flowi6 *fl6)
903 struct ipv6_pinfo *np = inet6_sk(sk);
909 if (dst->ops->family != AF_INET6) {
914 rt = (struct rt6_info *)dst;
915 /* Yes, checking route validity in not connected
916 * case is not very simple. Take into account,
917 * that we do not support routing by source, TOS,
918 * and MSG_DONTROUTE --ANK (980726)
920 * 1. ip6_rt_check(): If route was host route,
921 * check that cached destination is current.
922 * If it is network route, we still may
923 * check its validity using saved pointer
924 * to the last used address: daddr_cache.
925 * We do not want to save whole address now,
926 * (because main consumer of this service
927 * is tcp, which has not this problem),
928 * so that the last trick works only on connected
930 * 2. oif also should be the same.
932 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES
934 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
936 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
/*
 * ip6_dst_lookup_tail - core route lookup: resolve *dst via
 * ip6_route_output(), pick a source address when the flow has none,
 * and (with optimistic DAD) retry through the default router when the
 * chosen source is OPTIMISTIC and the neighbour is not yet VALID.
 * Counts OUTNOROUTES on ENETUNREACH. Returns 0 or -errno and releases
 * *dst on failure.
 * NOTE(review): variable declarations, success return and the release
 * label body are elided from this excerpt.
 */
945 static int ip6_dst_lookup_tail(struct sock *sk,
946 struct dst_entry **dst, struct flowi6 *fl6)
948 struct net *net = sock_net(sk);
949 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
955 *dst = ip6_route_output(net, sk, fl6);
957 if ((err = (*dst)->error))
958 goto out_err_release;
960 if (ipv6_addr_any(&fl6->saddr)) {
961 struct rt6_info *rt = (struct rt6_info *) *dst;
962 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
963 sk ? inet6_sk(sk)->srcprefs : 0,
966 goto out_err_release;
969 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
971 * Here if the dst entry we've looked up
972 * has a neighbour entry that is in the INCOMPLETE
973 * state and the src address from the flow is
974 * marked as OPTIMISTIC, we release the found
975 * dst entry and replace it instead with the
976 * dst entry of the nexthop router
979 n = dst_get_neighbour(*dst);
980 if (n && !(n->nud_state & NUD_VALID)) {
981 struct inet6_ifaddr *ifp;
982 struct flowi6 fl_gw6;
986 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
989 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
995 * We need to get the dst entry for the
996 * default router instead
/* zero daddr => lookup resolves to the default route */
999 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1000 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1001 *dst = ip6_route_output(net, sk, &fl_gw6);
1002 if ((err = (*dst)->error))
1003 goto out_err_release;
1013 if (err == -ENETUNREACH)
1014 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1021 * ip6_dst_lookup - perform route lookup on flow
1022 * @sk: socket which provides route info
1023 * @dst: pointer to dst_entry * for result
1024 * @fl6: flow to lookup
1026 * This function performs a route lookup on the given flow.
1028 * It returns zero on success, or a standard errno code on error.
/* Thin exported wrapper around ip6_dst_lookup_tail(). */
1030 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1033 return ip6_dst_lookup_tail(sk, dst, fl6);
1035 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1038 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1039 * @sk: socket which provides route info
1040 * @fl6: flow to lookup
1041 * @final_dst: final destination address for ipsec lookup
1042 * @can_sleep: we are in a sleepable context
1044 * This function performs a route lookup on the given flow.
1046 * It returns a valid dst pointer on success, or a pointer encoded
/* Resolve the route, optionally rewrite the flow's daddr to the final
 * (post-IPsec) destination and allow sleeping, then let the XFRM layer
 * wrap the dst. Errors come back ERR_PTR-encoded. */
1049 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1050 const struct in6_addr *final_dst,
1053 struct dst_entry *dst = NULL;
1056 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1058 return ERR_PTR(err);
1060 ipv6_addr_copy(&fl6->daddr, final_dst);
1062 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1064 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1066 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1069 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1070 * @sk: socket which provides the dst cache and route info
1071 * @fl6: flow to lookup
1072 * @final_dst: final destination address for ipsec lookup
1073 * @can_sleep: we are in a sleepable context
1075 * This function performs a route lookup on the given flow with the
1076 * possibility of using the cached route in the socket if it is valid.
1077 * It will take the socket dst lock when operating on the dst cache.
1078 * As a result, this function can only be used in process context.
1080 * It returns a valid dst pointer on success, or a pointer encoded
/* Like ip6_dst_lookup_flow(), but first tries the socket's cached dst
 * (validated by ip6_sk_dst_check()) before doing a full lookup. */
1083 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1084 const struct in6_addr *final_dst,
1087 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1090 dst = ip6_sk_dst_check(sk, dst, fl6);
1092 err = ip6_dst_lookup_tail(sk, &dst, fl6);
1094 return ERR_PTR(err);
1096 ipv6_addr_copy(&fl6->daddr, final_dst);
1098 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1100 return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1102 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
/*
 * ip6_ufo_append_data - UFO (UDP fragmentation offload) variant of
 * append_data: build one large skb on the write queue with GSO
 * parameters (gso_size multiple of 8, SKB_GSO_UDP, a pre-selected
 * fragment id) and let the device split it; payload is appended as
 * page frags via skb_append_datato_frags().
 * NOTE(review): the error-return after sock_alloc_send_skb and parts
 * of the queue handling are elided from this excerpt.
 */
1104 static inline int ip6_ufo_append_data(struct sock *sk,
1105 int getfrag(void *from, char *to, int offset, int len,
1106 int odd, struct sk_buff *skb),
1107 void *from, int length, int hh_len, int fragheaderlen,
1108 int transhdrlen, int mtu,unsigned int flags,
1109 struct rt6_info *rt)
1112 struct sk_buff *skb;
1115 /* There is support for UDP large send offload by network
1116 * device, so create one single skb packet containing complete
1119 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1120 struct frag_hdr fhdr;
1122 skb = sock_alloc_send_skb(sk,
1123 hh_len + fragheaderlen + transhdrlen + 20,
1124 (flags & MSG_DONTWAIT), &err);
1128 /* reserve space for Hardware header */
1129 skb_reserve(skb, hh_len);
1131 /* create space for UDP/IP header */
1132 skb_put(skb,fragheaderlen + transhdrlen);
1134 /* initialize network header pointer */
1135 skb_reset_network_header(skb);
1137 /* initialize protocol header pointer */
1138 skb->transport_header = skb->network_header + fragheaderlen;
1140 skb->ip_summed = CHECKSUM_PARTIAL;
1143 /* Specify the length of each IPv6 datagram fragment.
1144 * It has to be a multiple of 8.
1146 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1147 sizeof(struct frag_hdr)) & ~7;
1148 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
/* one id shared by all device-generated fragments */
1149 ipv6_select_ident(&fhdr, rt);
1150 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1151 __skb_queue_tail(&sk->sk_write_queue, skb);
1154 return skb_append_datato_frags(sk, skb, getfrag, from,
1155 (length - transhdrlen));
/* Duplicate an IPv6 option header (length is (hdrlen+1)*8 bytes per
 * RFC 2460 encoding); NULL-safe, returns NULL on NULL input or OOM. */
1158 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1161 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/* Duplicate an IPv6 routing header; same size encoding and NULL/OOM
 * behavior as ip6_opt_dup(). */
1164 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1167 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
/*
 * ip6_append_data_mtu - recompute *mtu and *maxfraglen while appending
 * data: outside an XFRM tunnel, the first fragment reserves the dst
 * header_len out of the original MTU, later fragments treat that space
 * as payload. maxfraglen is the largest 8-byte-aligned fragment end.
 * NOTE(review): the maxfraglen parameter declaration and the
 * first/non-first branch structure are partially elided here.
 */
1170 static void ip6_append_data_mtu(unsigned int *mtu,
1172 unsigned int fragheaderlen,
1173 struct sk_buff *skb,
1174 struct rt6_info *rt,
1175 unsigned int orig_mtu)
1177 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1179 /* first fragment, reserve header_len */
1180 *mtu = orig_mtu - rt->dst.header_len;
1184 * this fragment is not first, the headers
1185 * space is regarded as data space.
1189 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1190 + fragheaderlen - sizeof(struct frag_hdr);
/*
 * ip6_append_data - corked-send workhorse: append @length bytes pulled
 * via @getfrag to the socket write queue, creating correctly sized
 * skbs so ip6_push_pending_frames() can later emit them as one packet
 * or a fragment chain. On the first call it sets up the cork state
 * (duplicated txoptions, dst, flow, hop limit, tclass, fragsize);
 * later calls reuse it. Oversized totals raise EMSGSIZE, dontfrag UDP/
 * RAW sends get RXPMTU feedback, and UFO-capable devices take the
 * ip6_ufo_append_data() path. Payload goes into linear data, then
 * into page frags (sk_sndmsg_page) when the device supports SG.
 * NOTE(review): very heavily elided — error labels, several
 * conditionals, the no-skb bootstrap path and the success return are
 * missing from this excerpt; treat line-level claims accordingly.
 */
1194 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1195 int offset, int len, int odd, struct sk_buff *skb),
1196 void *from, int length, int transhdrlen,
1197 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1198 struct rt6_info *rt, unsigned int flags, int dontfrag)
1200 struct inet_sock *inet = inet_sk(sk);
1201 struct ipv6_pinfo *np = inet6_sk(sk);
1202 struct inet_cork *cork;
1203 struct sk_buff *skb, *skb_prev = NULL;
1204 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1211 int csummode = CHECKSUM_NONE;
1214 if (flags&MSG_PROBE)
1216 cork = &inet->cork.base;
/* empty write queue => first append: set up the cork */
1217 if (skb_queue_empty(&sk->sk_write_queue)) {
1222 if (WARN_ON(np->cork.opt))
/* deep-copy the tx options so they outlive the caller */
1225 np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1226 if (unlikely(np->cork.opt == NULL))
1229 np->cork.opt->tot_len = opt->tot_len;
1230 np->cork.opt->opt_flen = opt->opt_flen;
1231 np->cork.opt->opt_nflen = opt->opt_nflen;
1233 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1235 if (opt->dst0opt && !np->cork.opt->dst0opt)
1238 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1240 if (opt->dst1opt && !np->cork.opt->dst1opt)
1243 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1245 if (opt->hopopt && !np->cork.opt->hopopt)
1248 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1250 if (opt->srcrt && !np->cork.opt->srcrt)
1253 /* need source address above miyazawa*/
1256 cork->dst = &rt->dst;
1257 inet->cork.fl.u.ip6 = *fl6;
1258 np->cork.hop_limit = hlimit;
1259 np->cork.tclass = tclass;
/* MTU source depends on XFRM tunneling and PMTU-probe mode */
1260 if (rt->dst.flags & DST_XFRM_TUNNEL)
1261 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1262 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1264 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1265 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1266 if (np->frag_size < mtu) {
1268 mtu = np->frag_size;
1270 cork->fragsize = mtu;
1271 if (dst_allfrag(rt->dst.path))
1272 cork->flags |= IPCORK_ALLFRAG;
1274 sk->sk_sndmsg_page = NULL;
1275 sk->sk_sndmsg_off = 0;
1276 exthdrlen = (opt ? opt->opt_flen : 0);
1277 length += exthdrlen;
1278 transhdrlen += exthdrlen;
1279 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
/* subsequent append: pull everything back from the cork */
1281 rt = (struct rt6_info *)cork->dst;
1282 fl6 = &inet->cork.fl.u.ip6;
1287 mtu = cork->fragsize;
1291 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1293 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1294 (opt ? opt->opt_nflen : 0);
1295 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1297 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1298 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1299 ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1304 /* For UDP, check if TX timestamp is enabled */
1305 if (sk->sk_type == SOCK_DGRAM) {
1306 err = sock_tx_timestamp(sk, &tx_flags);
1312 * Let's try using as much space as possible.
1313 * Use MTU if total length of the message fits into the MTU.
1314 * Otherwise, we need to reserve fragment header and
1315 * fragment alignment (= 8-15 octects, in total).
1317 * Note that we may need to "move" the data from the tail of
1318 * of the buffer to the new fragment when we split
1321 * FIXME: It may be fragmented into multiple chunks
1322 * at once if non-fragmentable extension headers
1327 if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
1328 sk->sk_protocol == IPPROTO_RAW)) {
1329 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1333 skb = skb_peek_tail(&sk->sk_write_queue);
1334 cork->length += length;
1335 if (((length > mtu) ||
1336 (skb && skb_has_frags(skb))) &&
1337 (sk->sk_protocol == IPPROTO_UDP) &&
1338 (rt->dst.dev->features & NETIF_F_UFO)) {
1339 err = ip6_ufo_append_data(sk, getfrag, from, length,
1340 hh_len, fragheaderlen,
1341 transhdrlen, mtu, flags, rt);
1350 while (length > 0) {
1351 /* Check if the remaining data fits into current packet. */
1352 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1354 copy = maxfraglen - skb->len;
/* current skb is full: allocate the next fragment skb */
1358 unsigned int datalen;
1359 unsigned int fraglen;
1360 unsigned int fraggap;
1361 unsigned int alloclen;
1363 /* There's no room in the current skb */
1365 fraggap = skb->len - maxfraglen;
1368 /* update mtu and maxfraglen if necessary */
1369 if (skb == NULL || skb_prev == NULL)
1370 ip6_append_data_mtu(&mtu, &maxfraglen,
1371 fragheaderlen, skb, rt,
1377 * If remaining data exceeds the mtu,
1378 * we know we need more fragment(s).
1380 datalen = length + fraggap;
1382 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1383 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1384 if ((flags & MSG_MORE) &&
1385 !(rt->dst.dev->features&NETIF_F_SG))
1388 alloclen = datalen + fragheaderlen;
1390 alloclen += dst_exthdrlen;
1392 if (datalen != length + fraggap) {
1394 * this is not the last fragment, the trailer
1395 * space is regarded as data space.
1397 datalen += rt->dst.trailer_len;
1400 alloclen += rt->dst.trailer_len;
1401 fraglen = datalen + fragheaderlen;
1404 * We just reserve space for fragment header.
1405 * Note: this may be overallocation if the message
1406 * (without MSG_MORE) fits into the MTU.
1408 alloclen += sizeof(struct frag_hdr);
1411 skb = sock_alloc_send_skb(sk,
1413 (flags & MSG_DONTWAIT), &err);
/* non-blocking fallback when under the wmem limit */
1416 if (atomic_read(&sk->sk_wmem_alloc) <=
1418 skb = sock_wmalloc(sk,
1419 alloclen + hh_len, 1,
1421 if (unlikely(skb == NULL))
1424 /* Only the initial fragment
1433 * Fill in the control structures
1435 skb->ip_summed = csummode;
1437 /* reserve for fragmentation and ipsec header */
1438 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1441 if (sk->sk_type == SOCK_DGRAM)
1442 skb_shinfo(skb)->tx_flags = tx_flags;
1445 * Find where to start putting bytes
1447 data = skb_put(skb, fraglen);
1448 skb_set_network_header(skb, exthdrlen);
1449 data += fragheaderlen;
1450 skb->transport_header = (skb->network_header +
/* move the 8-byte-alignment spill from the previous skb
 * into this one, keeping checksums consistent */
1453 skb->csum = skb_copy_and_csum_bits(
1454 skb_prev, maxfraglen,
1455 data + transhdrlen, fraggap, 0);
1456 skb_prev->csum = csum_sub(skb_prev->csum,
1459 pskb_trim_unique(skb_prev, maxfraglen);
1461 copy = datalen - transhdrlen - fraggap;
1467 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1474 length -= datalen - fraggap;
/* transport header only exists in the first fragment */
1478 csummode = CHECKSUM_NONE;
1481 * Put the packet on the pending queue
1483 __skb_queue_tail(&sk->sk_write_queue, skb);
/* room in current skb: copy into linear data (no SG) ... */
1490 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1494 if (getfrag(from, skb_put(skb, copy),
1495 offset, copy, off, skb) < 0) {
1496 __skb_trim(skb, off);
/* ... or append into page fragments (SG-capable device) */
1501 int i = skb_shinfo(skb)->nr_frags;
1502 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1503 struct page *page = sk->sk_sndmsg_page;
1504 int off = sk->sk_sndmsg_off;
1507 if (page && (left = PAGE_SIZE - off) > 0) {
1510 if (page != skb_frag_page(frag)) {
1511 if (i == MAX_SKB_FRAGS) {
1515 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1516 skb_frag_ref(skb, i);
1517 frag = &skb_shinfo(skb)->frags[i];
1519 } else if(i < MAX_SKB_FRAGS) {
1520 if (copy > PAGE_SIZE)
1522 page = alloc_pages(sk->sk_allocation, 0);
1527 sk->sk_sndmsg_page = page;
1528 sk->sk_sndmsg_off = 0;
1530 skb_fill_page_desc(skb, i, page, 0, 0);
1531 frag = &skb_shinfo(skb)->frags[i];
1537 skb_frag_address(frag) + skb_frag_size(frag),
1538 offset, copy, skb->len, skb) < 0) {
1542 sk->sk_sndmsg_off += copy;
1543 skb_frag_size_add(frag, copy);
1545 skb->data_len += copy;
1546 skb->truesize += copy;
1547 atomic_add(copy, &sk->sk_wmem_alloc);
/* error path: roll back the cork length and count the drop */
1554 cork->length -= length;
1555 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_cork_release - free the per-socket cork state after the pending
 * write queue has been pushed (ip6_push_pending_frames) or flushed
 * (ip6_flush_pending_frames).
 *
 * NOTE(review): this excerpt appears to omit the enclosing
 * "if (np->cork.opt)" guard and braces from the original file; the
 * kfree() calls below are only valid when np->cork.opt != NULL —
 * confirm against the full source.
 */
1559 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
/* Free each cached extension-header option block, then the container
 * itself, and mark the cork as holding no options. */
1562 kfree(np->cork.opt->dst0opt);
1563 kfree(np->cork.opt->dst1opt);
1564 kfree(np->cork.opt->hopopt);
1565 kfree(np->cork.opt->srcrt);
1566 kfree(np->cork.opt);
1567 np->cork.opt = NULL;
/* Drop the cached route reference and clear the all-fragment flag
 * that may have been set while corking. */
1570 if (inet->cork.base.dst) {
1571 dst_release(inet->cork.base.dst);
1572 inet->cork.base.dst = NULL;
1573 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
/* Wipe the cached flow key used while appending data. */
1575 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
/*
 * ip6_push_pending_frames - transmit everything queued on
 * sk->sk_write_queue as one IPv6 datagram.
 *
 * The skbs queued by ip6_append_data() are coalesced onto the first
 * skb's frag_list, the cached extension headers and the IPv6 header
 * are prepended, and the result is handed to ip6_local_out().
 * Finishes by releasing the cork state via ip6_cork_release().
 *
 * Returns 0 on success or a negative errno (transmit-path NET_XMIT_*
 * codes are translated through net_xmit_errno()).
 */
1578 int ip6_push_pending_frames(struct sock *sk)
1580 struct sk_buff *skb, *tmp_skb;
1581 struct sk_buff **tail_skb;
1582 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1583 struct inet_sock *inet = inet_sk(sk);
1584 struct ipv6_pinfo *np = inet6_sk(sk);
1585 struct net *net = sock_net(sk);
1586 struct ipv6hdr *hdr;
1587 struct ipv6_txoptions *opt = np->cork.opt;
1588 struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1589 struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1590 unsigned char proto = fl6->flowi6_proto;
/* Nothing queued: nothing to send. */
1593 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1595 tail_skb = &(skb_shinfo(skb)->frag_list);
1597 /* move skb->data to ip header from ext header */
1598 if (skb->data < skb_network_header(skb))
1599 __skb_pull(skb, skb_network_offset(skb));
/* Chain every remaining queued skb onto the head skb's frag_list,
 * folding its len/truesize into the head and stealing the destructor
 * so only the head carries the memory accounting. */
1600 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1601 __skb_pull(tmp_skb, skb_network_header_len(skb));
1602 *tail_skb = tmp_skb;
1603 tail_skb = &(tmp_skb->next);
1604 skb->len += tmp_skb->len;
1605 skb->data_len += tmp_skb->len;
1606 skb->truesize += tmp_skb->truesize;
1607 tmp_skb->destructor = NULL;
1611 /* Allow local fragmentation. */
1612 if (np->pmtudisc < IPV6_PMTUDISC_DO)
/* Capture the final destination before pushing options:
 * ipv6_push_nfrag_opts() takes &final_dst and may substitute an
 * intermediate address (e.g. a routing header hop) into it. */
1615 ipv6_addr_copy(final_dst, &fl6->daddr);
1616 __skb_pull(skb, skb_network_header_len(skb));
/* Prepend cached extension headers: fragmentable part first, then
 * the non-fragmentable part, chaining 'proto' through nexthdr. */
1617 if (opt && opt->opt_flen)
1618 ipv6_push_frag_opts(skb, opt, &proto);
1619 if (opt && opt->opt_nflen)
1620 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
/* Build the IPv6 header in front of the (now complete) payload. */
1622 skb_push(skb, sizeof(struct ipv6hdr));
1623 skb_reset_network_header(skb);
1624 hdr = ipv6_hdr(skb);
/* First 32-bit word: version 6 (0x6 in the top nibble), traffic
 * class from the cork, flow label from the cached flow. */
1626 *(__be32*)hdr = fl6->flowlabel |
1627 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1629 hdr->hop_limit = np->cork.hop_limit;
1630 hdr->nexthdr = proto;
1631 ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1632 ipv6_addr_copy(&hdr->daddr, final_dst);
1634 skb->priority = sk->sk_priority;
1635 skb->mark = sk->sk_mark;
/* Attach the cached route (extra ref via dst_clone; the cork keeps
 * its own ref until ip6_cork_release) and bump output counters. */
1637 skb_dst_set(skb, dst_clone(&rt->dst));
1638 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
/* ICMPv6 gets additional per-message-type accounting. */
1639 if (proto == IPPROTO_ICMPV6) {
1640 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1642 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1643 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
/* Hand off to the local output path (netfilter LOCAL_OUT hook, then
 * dst_output). */
1646 err = ip6_local_out(skb);
1649 err = net_xmit_errno(err);
/* Both success and error paths release the cork state; the error
 * path additionally counts an output discard.
 * NOTE(review): the 'int err' declaration and the out:/error: labels
 * from the original file are not visible in this excerpt — confirm
 * against the full source. */
1655 ip6_cork_release(inet, np);
1658 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
/*
 * ip6_flush_pending_frames - drop every skb still queued on
 * sk->sk_write_queue without transmitting, counting each as an output
 * discard, then release the cork state.
 *
 * NOTE(review): the per-skb kfree_skb() call (and the guard that the
 * skb has a dst attached before ip6_dst_idev(skb_dst(skb)) is used)
 * from the original file is not visible in this excerpt — confirm
 * against the full source.
 */
1662 void ip6_flush_pending_frames(struct sock *sk)
1664 struct sk_buff *skb;
/* Drain the queue from the tail until empty. */
1666 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1668 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1669 IPSTATS_MIB_OUTDISCARDS);
/* Free cached options, route reference and flow information. */
1673 ip6_cork_release(inet_sk(sk), inet6_sk(sk));