2 * IPv6 output functions
3 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * Based on linux/net/ipv4/ip_output.c
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
61 int __ip6_local_out(struct sk_buff *skb)
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
68 ipv6_hdr(skb)->payload_len = htons(len);
70 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 skb_dst(skb)->dev, dst_output);
74 int ip6_local_out(struct sk_buff *skb)
78 err = __ip6_local_out(skb);
80 err = dst_output(skb);
84 EXPORT_SYMBOL_GPL(ip6_local_out);
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
89 skb_reset_mac_header(newskb);
90 __skb_pull(newskb, skb_network_offset(newskb));
91 newskb->pkt_type = PACKET_LOOPBACK;
92 newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 WARN_ON(!skb_dst(newskb));
99 static int ip6_finish_output2(struct sk_buff *skb)
101 struct dst_entry *dst = skb_dst(skb);
102 struct net_device *dev = dst->dev;
104 skb->protocol = htons(ETH_P_IPV6);
107 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
108 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
111 ((mroute6_socket(dev_net(dev), skb) &&
112 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
113 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
114 &ipv6_hdr(skb)->saddr))) {
115 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 /* Do not check for IFF_ALLMULTI; multicast routing
118 is not supported in any case.
121 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
122 newskb, NULL, newskb->dev,
123 ip6_dev_loopback_xmit);
125 if (ipv6_hdr(skb)->hop_limit == 0) {
126 IP6_INC_STATS(dev_net(dev), idev,
127 IPSTATS_MIB_OUTDISCARDS);
133 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
138 return neigh_hh_output(dst->hh, skb);
139 else if (dst->neighbour)
140 return dst->neighbour->output(skb);
142 IP6_INC_STATS_BH(dev_net(dst->dev),
143 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
148 static int ip6_finish_output(struct sk_buff *skb)
150 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151 dst_allfrag(skb_dst(skb)))
152 return ip6_fragment(skb, ip6_finish_output2);
154 return ip6_finish_output2(skb);
157 int ip6_output(struct sk_buff *skb)
159 struct net_device *dev = skb_dst(skb)->dev;
160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161 if (unlikely(idev->cnf.disable_ipv6)) {
162 IP6_INC_STATS(dev_net(dev), idev,
163 IPSTATS_MIB_OUTDISCARDS);
168 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
170 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
174 * xmit an sk_buff (used by TCP, SCTP and DCCP)
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
178 struct ipv6_txoptions *opt)
180 struct net *net = sock_net(sk);
181 struct ipv6_pinfo *np = inet6_sk(sk);
182 struct in6_addr *first_hop = &fl->fl6_dst;
183 struct dst_entry *dst = skb_dst(skb);
185 u8 proto = fl->proto;
186 int seg_len = skb->len;
192 unsigned int head_room;
194 /* First: exthdrs may take lots of space (~8K for now)
195 MAX_HEADER is not enough.
197 head_room = opt->opt_nflen + opt->opt_flen;
198 seg_len += head_room;
199 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
201 if (skb_headroom(skb) < head_room) {
202 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
204 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205 IPSTATS_MIB_OUTDISCARDS);
211 skb_set_owner_w(skb, sk);
214 ipv6_push_frag_opts(skb, opt, &proto);
216 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
219 skb_push(skb, sizeof(struct ipv6hdr));
220 skb_reset_network_header(skb);
224 * Fill in the IPv6 header
228 hlimit = np->hop_limit;
231 hlimit = ip6_dst_hoplimit(dst);
233 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
235 hdr->payload_len = htons(seg_len);
236 hdr->nexthdr = proto;
237 hdr->hop_limit = hlimit;
239 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
240 ipv6_addr_copy(&hdr->daddr, first_hop);
242 skb->priority = sk->sk_priority;
243 skb->mark = sk->sk_mark;
246 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248 IPSTATS_MIB_OUT, skb->len);
249 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250 dst->dev, dst_output);
254 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
256 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
262 EXPORT_SYMBOL(ip6_xmit);
265 * To avoid extra problems ND packets are sent through this
266 * routine. It's code duplication but I really want to avoid
267 * extra checks since ipv6_build_header is used by TCP (which
268 * is for us performance critical)
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272 const struct in6_addr *saddr, const struct in6_addr *daddr,
275 struct ipv6_pinfo *np = inet6_sk(sk);
278 skb->protocol = htons(ETH_P_IPV6);
281 skb_reset_network_header(skb);
282 skb_put(skb, sizeof(struct ipv6hdr));
285 *(__be32*)hdr = htonl(0x60000000);
287 hdr->payload_len = htons(len);
288 hdr->nexthdr = proto;
289 hdr->hop_limit = np->hop_limit;
291 ipv6_addr_copy(&hdr->saddr, saddr);
292 ipv6_addr_copy(&hdr->daddr, daddr);
297 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
299 struct ip6_ra_chain *ra;
300 struct sock *last = NULL;
302 read_lock(&ip6_ra_lock);
303 for (ra = ip6_ra_chain; ra; ra = ra->next) {
304 struct sock *sk = ra->sk;
305 if (sk && ra->sel == sel &&
306 (!sk->sk_bound_dev_if ||
307 sk->sk_bound_dev_if == skb->dev->ifindex)) {
309 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
311 rawv6_rcv(last, skb2);
318 rawv6_rcv(last, skb);
319 read_unlock(&ip6_ra_lock);
322 read_unlock(&ip6_ra_lock);
326 static int ip6_forward_proxy_check(struct sk_buff *skb)
328 struct ipv6hdr *hdr = ipv6_hdr(skb);
329 u8 nexthdr = hdr->nexthdr;
332 if (ipv6_ext_hdr(nexthdr)) {
333 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
337 offset = sizeof(struct ipv6hdr);
339 if (nexthdr == IPPROTO_ICMPV6) {
340 struct icmp6hdr *icmp6;
342 if (!pskb_may_pull(skb, (skb_network_header(skb) +
343 offset + 1 - skb->data)))
346 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
348 switch (icmp6->icmp6_type) {
349 case NDISC_ROUTER_SOLICITATION:
350 case NDISC_ROUTER_ADVERTISEMENT:
351 case NDISC_NEIGHBOUR_SOLICITATION:
352 case NDISC_NEIGHBOUR_ADVERTISEMENT:
354 /* For reaction involving unicast neighbor discovery
355 * message destined to the proxied address, pass it to
365 * The proxying router can't forward traffic sent to a link-local
366 * address, so signal the sender and discard the packet. This
367 * behavior is clarified by the MIPv6 specification.
369 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
370 dst_link_failure(skb);
377 static inline int ip6_forward_finish(struct sk_buff *skb)
379 return dst_output(skb);
382 int ip6_forward(struct sk_buff *skb)
384 struct dst_entry *dst = skb_dst(skb);
385 struct ipv6hdr *hdr = ipv6_hdr(skb);
386 struct inet6_skb_parm *opt = IP6CB(skb);
387 struct net *net = dev_net(dst->dev);
390 if (net->ipv6.devconf_all->forwarding == 0)
393 if (skb_warn_if_lro(skb))
396 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
397 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401 if (skb->pkt_type != PACKET_HOST)
404 skb_forward_csum(skb);
407 * We DO NOT make any processing on
408 * RA packets, pushing them to user level AS IS
409 * without any WARRANTY that application will be able
410 * to interpret them. The reason is that we
411 * cannot make anything clever here.
413 * We are not end-node, so that if packet contains
414 * AH/ESP, we cannot make anything.
415 * Defragmentation also would be a mistake, RA packets
416 * cannot be fragmented, because there is no warranty
417 * that different fragments will go along one path. --ANK
420 u8 *ptr = skb_network_header(skb) + opt->ra;
421 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
426 * check and decrement ttl
428 if (hdr->hop_limit <= 1) {
429 /* Force OUTPUT device used as source address */
431 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
432 IP6_INC_STATS_BH(net,
433 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
439 /* XXX: idev->cnf.proxy_ndp? */
440 if (net->ipv6.devconf_all->proxy_ndp &&
441 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
442 int proxied = ip6_forward_proxy_check(skb);
444 return ip6_input(skb);
445 else if (proxied < 0) {
446 IP6_INC_STATS(net, ip6_dst_idev(dst),
447 IPSTATS_MIB_INDISCARDS);
452 if (!xfrm6_route_forward(skb)) {
453 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
458 /* IPv6 specs say nothing about it, but it is clear that we cannot
459 send redirects to source routed frames.
460 We don't send redirects to frames decapsulated from IPsec.
462 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
463 !skb_sec_path(skb)) {
464 struct in6_addr *target = NULL;
466 struct neighbour *n = dst->neighbour;
469 * incoming and outgoing devices are the same
473 rt = (struct rt6_info *) dst;
474 if ((rt->rt6i_flags & RTF_GATEWAY))
475 target = (struct in6_addr*)&n->primary_key;
477 target = &hdr->daddr;
480 rt6_bind_peer(rt, 1);
482 /* Limit redirects both by destination (here)
483 and by source (inside ndisc_send_redirect)
485 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486 ndisc_send_redirect(skb, n, target);
488 int addrtype = ipv6_addr_type(&hdr->saddr);
490 /* This check is security critical. */
491 if (addrtype == IPV6_ADDR_ANY ||
492 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
494 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 ICMPV6_NOT_NEIGHBOUR, 0);
502 if (mtu < IPV6_MIN_MTU)
505 if (skb->len > mtu && !skb_is_gso(skb)) {
506 /* Again, force OUTPUT device used as source address */
508 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 IP6_INC_STATS_BH(net,
510 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 IP6_INC_STATS_BH(net,
512 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
517 if (skb_cow(skb, dst->dev->hard_header_len)) {
518 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
524 /* Mangling hops number delayed to point after skb COW */
528 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
533 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
541 to->pkt_type = from->pkt_type;
542 to->priority = from->priority;
543 to->protocol = from->protocol;
545 skb_dst_set(to, dst_clone(skb_dst(from)));
547 to->mark = from->mark;
549 #ifdef CONFIG_NET_SCHED
550 to->tc_index = from->tc_index;
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 to->nf_trace = from->nf_trace;
557 skb_copy_secmark(to, from);
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
562 u16 offset = sizeof(struct ipv6hdr);
563 struct ipv6_opt_hdr *exthdr =
564 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 unsigned int packet_len = skb->tail - skb->network_header;
567 *nexthdr = &ipv6_hdr(skb)->nexthdr;
569 while (offset + 1 <= packet_len) {
575 case NEXTHDR_ROUTING:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
590 offset += ipv6_optlen(exthdr);
591 *nexthdr = &exthdr->nexthdr;
592 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
599 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
601 struct sk_buff *frag;
602 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
603 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
604 struct ipv6hdr *tmp_hdr;
606 unsigned int mtu, hlen, left, len;
608 int ptr, offset = 0, err=0;
609 u8 *prevhdr, nexthdr = 0;
610 struct net *net = dev_net(skb_dst(skb)->dev);
612 hlen = ip6_find_1stfragopt(skb, &prevhdr);
615 mtu = ip6_skb_dst_mtu(skb);
617 /* We must not fragment if the socket is set to force MTU discovery
618 * or if the skb is not generated by a local socket.
620 if (!skb->local_df && skb->len > mtu) {
621 skb->dev = skb_dst(skb)->dev;
622 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
623 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
624 IPSTATS_MIB_FRAGFAILS);
629 if (np && np->frag_size < mtu) {
633 mtu -= hlen + sizeof(struct frag_hdr);
635 if (skb_has_frag_list(skb)) {
636 int first_len = skb_pagelen(skb);
637 struct sk_buff *frag2;
639 if (first_len - hlen > mtu ||
640 ((first_len - hlen) & 7) ||
644 skb_walk_frags(skb, frag) {
645 /* Correct geometry. */
646 if (frag->len > mtu ||
647 ((frag->len & 7) && frag->next) ||
648 skb_headroom(frag) < hlen)
649 goto slow_path_clean;
651 /* Partially cloned skb? */
652 if (skb_shared(frag))
653 goto slow_path_clean;
658 frag->destructor = sock_wfree;
660 skb->truesize -= frag->truesize;
665 frag = skb_shinfo(skb)->frag_list;
666 skb_frag_list_init(skb);
669 *prevhdr = NEXTHDR_FRAGMENT;
670 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
672 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
673 IPSTATS_MIB_FRAGFAILS);
677 __skb_pull(skb, hlen);
678 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
679 __skb_push(skb, hlen);
680 skb_reset_network_header(skb);
681 memcpy(skb_network_header(skb), tmp_hdr, hlen);
683 ipv6_select_ident(fh);
684 fh->nexthdr = nexthdr;
686 fh->frag_off = htons(IP6_MF);
687 frag_id = fh->identification;
689 first_len = skb_pagelen(skb);
690 skb->data_len = first_len - skb_headlen(skb);
691 skb->len = first_len;
692 ipv6_hdr(skb)->payload_len = htons(first_len -
693 sizeof(struct ipv6hdr));
698 /* Prepare header of the next frame,
699 * before previous one went down. */
701 frag->ip_summed = CHECKSUM_NONE;
702 skb_reset_transport_header(frag);
703 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
704 __skb_push(frag, hlen);
705 skb_reset_network_header(frag);
706 memcpy(skb_network_header(frag), tmp_hdr,
708 offset += skb->len - hlen - sizeof(struct frag_hdr);
709 fh->nexthdr = nexthdr;
711 fh->frag_off = htons(offset);
712 if (frag->next != NULL)
713 fh->frag_off |= htons(IP6_MF);
714 fh->identification = frag_id;
715 ipv6_hdr(frag)->payload_len =
717 sizeof(struct ipv6hdr));
718 ip6_copy_metadata(frag, skb);
723 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
724 IPSTATS_MIB_FRAGCREATES);
737 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
738 IPSTATS_MIB_FRAGOKS);
739 dst_release(&rt->dst);
749 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
750 IPSTATS_MIB_FRAGFAILS);
751 dst_release(&rt->dst);
755 skb_walk_frags(skb, frag2) {
759 frag2->destructor = NULL;
760 skb->truesize += frag2->truesize;
765 left = skb->len - hlen; /* Space per frame */
766 ptr = hlen; /* Where to start from */
769 * Fragment the datagram.
772 *prevhdr = NEXTHDR_FRAGMENT;
775 * Keep copying data until we run out.
779 /* IF: it doesn't fit, use 'mtu' - the data space left */
782 /* IF: we are not sending up to and including the packet end
783 then align the next start on an eight byte boundary */
791 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
792 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
793 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
794 IPSTATS_MIB_FRAGFAILS);
800 * Set up data on packet
803 ip6_copy_metadata(frag, skb);
804 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
805 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
806 skb_reset_network_header(frag);
807 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
808 frag->transport_header = (frag->network_header + hlen +
809 sizeof(struct frag_hdr));
812 * Charge the memory for the fragment to any owner
816 skb_set_owner_w(frag, skb->sk);
819 * Copy the packet header into the new buffer.
821 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
824 * Build fragment header.
826 fh->nexthdr = nexthdr;
829 ipv6_select_ident(fh);
830 frag_id = fh->identification;
832 fh->identification = frag_id;
835 * Copy a block of the IP datagram.
837 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
841 fh->frag_off = htons(offset);
843 fh->frag_off |= htons(IP6_MF);
844 ipv6_hdr(frag)->payload_len = htons(frag->len -
845 sizeof(struct ipv6hdr));
851 * Put this fragment into the sending queue.
857 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
858 IPSTATS_MIB_FRAGCREATES);
860 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
861 IPSTATS_MIB_FRAGOKS);
866 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
867 IPSTATS_MIB_FRAGFAILS);
/*
 * ip6_rt_check - test whether a flow address fails to match a route key
 * @rt_key: route key whose prefix length (plen) and address (addr) are compared
 * @fl_addr: address taken from the flow
 * @addr_cache: cached address to compare against; NULL means no cached address
 *
 * Returns non-zero only when BOTH of the following hold:
 *  - rt_key is not a 128-bit (host) entry whose address equals fl_addr, and
 *  - addr_cache is NULL or differs from fl_addr.
 * Returns zero as soon as either the host-route key or the cached address
 * equals fl_addr.
 */
872 static inline int ip6_rt_check(struct rt6key *rt_key,
873 struct in6_addr *fl_addr,
874 struct in6_addr *addr_cache)
876 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
877 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
880 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
881 struct dst_entry *dst,
884 struct ipv6_pinfo *np = inet6_sk(sk);
885 struct rt6_info *rt = (struct rt6_info *)dst;
890 /* Yes, checking route validity in not connected
891 * case is not very simple. Take into account,
892 * that we do not support routing by source, TOS,
893 * and MSG_DONTROUTE --ANK (980726)
895 * 1. ip6_rt_check(): If route was host route,
896 * check that cached destination is current.
897 * If it is network route, we still may
898 * check its validity using saved pointer
899 * to the last used address: daddr_cache.
900 * We do not want to save whole address now,
901 * (because main consumer of this service
902 * is tcp, which has not this problem),
903 * so that the last trick works only on connected
905 * 2. oif also should be the same.
907 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
908 #ifdef CONFIG_IPV6_SUBTREES
909 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
911 (fl->oif && fl->oif != dst->dev->ifindex)) {
920 static int ip6_dst_lookup_tail(struct sock *sk,
921 struct dst_entry **dst, struct flowi *fl)
924 struct net *net = sock_net(sk);
927 *dst = ip6_route_output(net, sk, fl);
929 if ((err = (*dst)->error))
930 goto out_err_release;
932 if (ipv6_addr_any(&fl->fl6_src)) {
933 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
935 sk ? inet6_sk(sk)->srcprefs : 0,
938 goto out_err_release;
941 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
943 * Here if the dst entry we've looked up
944 * has a neighbour entry that is in the INCOMPLETE
945 * state and the src address from the flow is
946 * marked as OPTIMISTIC, we release the found
947 * dst entry and replace it instead with the
948 * dst entry of the nexthop router
950 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
951 struct inet6_ifaddr *ifp;
955 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
958 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
964 * We need to get the dst entry for the
965 * default router instead
968 memcpy(&fl_gw, fl, sizeof(struct flowi));
969 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
970 *dst = ip6_route_output(net, sk, &fl_gw);
971 if ((err = (*dst)->error))
972 goto out_err_release;
980 if (err == -ENETUNREACH)
981 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
988 * ip6_dst_lookup - perform route lookup on flow
989 * @sk: socket which provides route info
990 * @dst: pointer to dst_entry * for result
991 * @fl: flow to lookup
993 * This function performs a route lookup on the given flow.
995 * It returns zero on success, or a standard errno code on error.
997 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1000 return ip6_dst_lookup_tail(sk, dst, fl);
1002 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1005 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1006 * @sk: socket which provides route info
1007 * @fl: flow to lookup
1008 * @final_dst: final destination address for ipsec lookup
1009 * @can_sleep: we are in a sleepable context
1011 * This function performs a route lookup on the given flow.
1013 * It returns a valid dst pointer on success, or a pointer encoded
1016 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi *fl,
1017 const struct in6_addr *final_dst,
1020 struct dst_entry *dst = NULL;
1023 err = ip6_dst_lookup_tail(sk, &dst, fl);
1025 return ERR_PTR(err);
1027 ipv6_addr_copy(&fl->fl6_dst, final_dst);
1029 fl->flags |= FLOWI_FLAG_CAN_SLEEP;
1030 err = __xfrm_lookup(sock_net(sk), &dst, fl, sk, 0);
1031 if (err == -EREMOTE)
1032 return ip6_dst_blackhole(sock_net(sk), dst);
1034 return ERR_PTR(err);
1036 err = xfrm_lookup(sock_net(sk), &dst, fl, sk, 0);
1038 return ERR_PTR(err);
1042 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1045 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1046 * @sk: socket which provides the dst cache and route info
1047 * @fl: flow to lookup
1048 * @final_dst: final destination address for ipsec lookup
1049 * @can_sleep: we are in a sleepable context
1051 * This function performs a route lookup on the given flow with the
1052 * possibility of using the cached route in the socket if it is valid.
1053 * It will take the socket dst lock when operating on the dst cache.
1054 * As a result, this function can only be used in process context.
1056 * It returns a valid dst pointer on success, or a pointer encoded
1059 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi *fl,
1060 const struct in6_addr *final_dst,
1063 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1066 dst = ip6_sk_dst_check(sk, dst, fl);
1068 err = ip6_dst_lookup_tail(sk, &dst, fl);
1070 return ERR_PTR(err);
1072 ipv6_addr_copy(&fl->fl6_dst, final_dst);
1074 fl->flags |= FLOWI_FLAG_CAN_SLEEP;
1075 err = __xfrm_lookup(sock_net(sk), &dst, fl, sk, 0);
1076 if (err == -EREMOTE)
1077 return ip6_dst_blackhole(sock_net(sk), dst);
1079 return ERR_PTR(err);
1081 err = xfrm_lookup(sock_net(sk), &dst, fl, sk, 0);
1083 return ERR_PTR(err);
1087 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1089 static inline int ip6_ufo_append_data(struct sock *sk,
1090 int getfrag(void *from, char *to, int offset, int len,
1091 int odd, struct sk_buff *skb),
1092 void *from, int length, int hh_len, int fragheaderlen,
1093 int transhdrlen, int mtu,unsigned int flags)
1096 struct sk_buff *skb;
1099 /* There is support for UDP large send offload by network
1100 * device, so create one single skb packet containing complete
1103 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1104 skb = sock_alloc_send_skb(sk,
1105 hh_len + fragheaderlen + transhdrlen + 20,
1106 (flags & MSG_DONTWAIT), &err);
1110 /* reserve space for Hardware header */
1111 skb_reserve(skb, hh_len);
1113 /* create space for UDP/IP header */
1114 skb_put(skb,fragheaderlen + transhdrlen);
1116 /* initialize network header pointer */
1117 skb_reset_network_header(skb);
1119 /* initialize protocol header pointer */
1120 skb->transport_header = skb->network_header + fragheaderlen;
1122 skb->ip_summed = CHECKSUM_PARTIAL;
1126 err = skb_append_datato_frags(sk,skb, getfrag, from,
1127 (length - transhdrlen));
1129 struct frag_hdr fhdr;
1131 /* Specify the length of each IPv6 datagram fragment.
1132 * It has to be a multiple of 8.
1134 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1135 sizeof(struct frag_hdr)) & ~7;
1136 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1137 ipv6_select_ident(&fhdr);
1138 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1139 __skb_queue_tail(&sk->sk_write_queue, skb);
1143 /* There is not enough support to do UDP LSO,
1144 * so follow normal path
1151 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1154 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1157 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1160 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1163 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1164 int offset, int len, int odd, struct sk_buff *skb),
1165 void *from, int length, int transhdrlen,
1166 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1167 struct rt6_info *rt, unsigned int flags, int dontfrag)
1169 struct inet_sock *inet = inet_sk(sk);
1170 struct ipv6_pinfo *np = inet6_sk(sk);
1171 struct sk_buff *skb;
1172 unsigned int maxfraglen, fragheaderlen;
1179 int csummode = CHECKSUM_NONE;
1182 if (flags&MSG_PROBE)
1184 if (skb_queue_empty(&sk->sk_write_queue)) {
1189 if (WARN_ON(np->cork.opt))
1192 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1193 if (unlikely(np->cork.opt == NULL))
1196 np->cork.opt->tot_len = opt->tot_len;
1197 np->cork.opt->opt_flen = opt->opt_flen;
1198 np->cork.opt->opt_nflen = opt->opt_nflen;
1200 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1202 if (opt->dst0opt && !np->cork.opt->dst0opt)
1205 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1207 if (opt->dst1opt && !np->cork.opt->dst1opt)
1210 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1212 if (opt->hopopt && !np->cork.opt->hopopt)
1215 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1217 if (opt->srcrt && !np->cork.opt->srcrt)
1220 /* need source address above miyazawa*/
1223 inet->cork.dst = &rt->dst;
1224 inet->cork.fl = *fl;
1225 np->cork.hop_limit = hlimit;
1226 np->cork.tclass = tclass;
1227 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1228 rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1229 if (np->frag_size < mtu) {
1231 mtu = np->frag_size;
1233 inet->cork.fragsize = mtu;
1234 if (dst_allfrag(rt->dst.path))
1235 inet->cork.flags |= IPCORK_ALLFRAG;
1236 inet->cork.length = 0;
1237 sk->sk_sndmsg_page = NULL;
1238 sk->sk_sndmsg_off = 0;
1239 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1240 rt->rt6i_nfheader_len;
1241 length += exthdrlen;
1242 transhdrlen += exthdrlen;
1244 rt = (struct rt6_info *)inet->cork.dst;
1245 fl = &inet->cork.fl;
1249 mtu = inet->cork.fragsize;
1252 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1254 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1255 (opt ? opt->opt_nflen : 0);
1256 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1258 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1259 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1260 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1265 /* For UDP, check if TX timestamp is enabled */
1266 if (sk->sk_type == SOCK_DGRAM) {
1267 err = sock_tx_timestamp(sk, &tx_flags);
1273 * Let's try using as much space as possible.
1274 * Use MTU if total length of the message fits into the MTU.
1275 * Otherwise, we need to reserve fragment header and
1276 * fragment alignment (= 8-15 octets, in total).
1278 * Note that we may need to "move" the data from the tail of
1279 * the buffer to the new fragment when we split
1282 * FIXME: It may be fragmented into multiple chunks
1283 * at once if non-fragmentable extension headers
1288 inet->cork.length += length;
1290 int proto = sk->sk_protocol;
1291 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1292 ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1296 if (proto == IPPROTO_UDP &&
1297 (rt->dst.dev->features & NETIF_F_UFO)) {
1299 err = ip6_ufo_append_data(sk, getfrag, from, length,
1300 hh_len, fragheaderlen,
1301 transhdrlen, mtu, flags);
1308 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1311 while (length > 0) {
1312 /* Check if the remaining data fits into current packet. */
1313 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1315 copy = maxfraglen - skb->len;
1319 unsigned int datalen;
1320 unsigned int fraglen;
1321 unsigned int fraggap;
1322 unsigned int alloclen;
1323 struct sk_buff *skb_prev;
1327 /* There's no room in the current skb */
1329 fraggap = skb_prev->len - maxfraglen;
1334 * If remaining data exceeds the mtu,
1335 * we know we need more fragment(s).
1337 datalen = length + fraggap;
1338 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1339 datalen = maxfraglen - fragheaderlen;
1341 fraglen = datalen + fragheaderlen;
1342 if ((flags & MSG_MORE) &&
1343 !(rt->dst.dev->features&NETIF_F_SG))
1346 alloclen = datalen + fragheaderlen;
1349 * The last fragment gets additional space at tail.
1350 * Note: we overallocate on fragments with MSG_MORE
1351 * because we have no idea if we're the last one.
1353 if (datalen == length + fraggap)
1354 alloclen += rt->dst.trailer_len;
1357 * We just reserve space for fragment header.
1358 * Note: this may be overallocation if the message
1359 * (without MSG_MORE) fits into the MTU.
1361 alloclen += sizeof(struct frag_hdr);
1364 skb = sock_alloc_send_skb(sk,
1366 (flags & MSG_DONTWAIT), &err);
1369 if (atomic_read(&sk->sk_wmem_alloc) <=
1371 skb = sock_wmalloc(sk,
1372 alloclen + hh_len, 1,
1374 if (unlikely(skb == NULL))
1377 /* Only the initial fragment
1386 * Fill in the control structures
1388 skb->ip_summed = csummode;
1390 /* reserve for fragmentation */
1391 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1393 if (sk->sk_type == SOCK_DGRAM)
1394 skb_shinfo(skb)->tx_flags = tx_flags;
1397 * Find where to start putting bytes
1399 data = skb_put(skb, fraglen);
1400 skb_set_network_header(skb, exthdrlen);
1401 data += fragheaderlen;
1402 skb->transport_header = (skb->network_header +
1405 skb->csum = skb_copy_and_csum_bits(
1406 skb_prev, maxfraglen,
1407 data + transhdrlen, fraggap, 0);
1408 skb_prev->csum = csum_sub(skb_prev->csum,
1411 pskb_trim_unique(skb_prev, maxfraglen);
1413 copy = datalen - transhdrlen - fraggap;
1418 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1425 length -= datalen - fraggap;
1428 csummode = CHECKSUM_NONE;
1431 * Put the packet on the pending queue
1433 __skb_queue_tail(&sk->sk_write_queue, skb);
1440 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1444 if (getfrag(from, skb_put(skb, copy),
1445 offset, copy, off, skb) < 0) {
1446 __skb_trim(skb, off);
1451 int i = skb_shinfo(skb)->nr_frags;
1452 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1453 struct page *page = sk->sk_sndmsg_page;
1454 int off = sk->sk_sndmsg_off;
1457 if (page && (left = PAGE_SIZE - off) > 0) {
1460 if (page != frag->page) {
1461 if (i == MAX_SKB_FRAGS) {
1466 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1467 frag = &skb_shinfo(skb)->frags[i];
1469 } else if(i < MAX_SKB_FRAGS) {
1470 if (copy > PAGE_SIZE)
1472 page = alloc_pages(sk->sk_allocation, 0);
1477 sk->sk_sndmsg_page = page;
1478 sk->sk_sndmsg_off = 0;
1480 skb_fill_page_desc(skb, i, page, 0, 0);
1481 frag = &skb_shinfo(skb)->frags[i];
1486 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1490 sk->sk_sndmsg_off += copy;
1493 skb->data_len += copy;
1494 skb->truesize += copy;
1495 atomic_add(copy, &sk->sk_wmem_alloc);
1502 inet->cork.length -= length;
1503 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1507 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1510 kfree(np->cork.opt->dst0opt);
1511 kfree(np->cork.opt->dst1opt);
1512 kfree(np->cork.opt->hopopt);
1513 kfree(np->cork.opt->srcrt);
1514 kfree(np->cork.opt);
1515 np->cork.opt = NULL;
1518 if (inet->cork.dst) {
1519 dst_release(inet->cork.dst);
1520 inet->cork.dst = NULL;
1521 inet->cork.flags &= ~IPCORK_ALLFRAG;
1523 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1526 int ip6_push_pending_frames(struct sock *sk)
1528 struct sk_buff *skb, *tmp_skb;
1529 struct sk_buff **tail_skb;
1530 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1531 struct inet_sock *inet = inet_sk(sk);
1532 struct ipv6_pinfo *np = inet6_sk(sk);
1533 struct net *net = sock_net(sk);
1534 struct ipv6hdr *hdr;
1535 struct ipv6_txoptions *opt = np->cork.opt;
1536 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1537 struct flowi *fl = &inet->cork.fl;
1538 unsigned char proto = fl->proto;
1541 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1543 tail_skb = &(skb_shinfo(skb)->frag_list);
1545 /* move skb->data to ip header from ext header */
1546 if (skb->data < skb_network_header(skb))
1547 __skb_pull(skb, skb_network_offset(skb));
1548 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1549 __skb_pull(tmp_skb, skb_network_header_len(skb));
1550 *tail_skb = tmp_skb;
1551 tail_skb = &(tmp_skb->next);
1552 skb->len += tmp_skb->len;
1553 skb->data_len += tmp_skb->len;
1554 skb->truesize += tmp_skb->truesize;
1555 tmp_skb->destructor = NULL;
1559 /* Allow local fragmentation. */
1560 if (np->pmtudisc < IPV6_PMTUDISC_DO)
1563 ipv6_addr_copy(final_dst, &fl->fl6_dst);
1564 __skb_pull(skb, skb_network_header_len(skb));
1565 if (opt && opt->opt_flen)
1566 ipv6_push_frag_opts(skb, opt, &proto);
1567 if (opt && opt->opt_nflen)
1568 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1570 skb_push(skb, sizeof(struct ipv6hdr));
1571 skb_reset_network_header(skb);
1572 hdr = ipv6_hdr(skb);
1574 *(__be32*)hdr = fl->fl6_flowlabel |
1575 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1577 hdr->hop_limit = np->cork.hop_limit;
1578 hdr->nexthdr = proto;
1579 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1580 ipv6_addr_copy(&hdr->daddr, final_dst);
1582 skb->priority = sk->sk_priority;
1583 skb->mark = sk->sk_mark;
1585 skb_dst_set(skb, dst_clone(&rt->dst));
1586 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1587 if (proto == IPPROTO_ICMPV6) {
1588 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1590 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1591 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1594 err = ip6_local_out(skb);
1597 err = net_xmit_errno(err);
1603 ip6_cork_release(inet, np);
1606 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1610 void ip6_flush_pending_frames(struct sock *sk)
1612 struct sk_buff *skb;
1614 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1616 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1617 IPSTATS_MIB_OUTDISCARDS);
1621 ip6_cork_release(inet_sk(sk), inet6_sk(sk));