net/ipv6/ip6_output.c

   1 /*
   2  *      IPv6 output functions
   3  *      Linux INET6 implementation
   4  *
   5  *      Authors:
   6  *      Pedro Roque             <roque@di.fc.ul.pt>
   7  *
   8  *      Based on linux/net/ipv4/ip_output.c
   9  *
  10  *      This program is free software; you can redistribute it and/or
  11  *      modify it under the terms of the GNU General Public License
  12  *      as published by the Free Software Foundation; either version
  13  *      2 of the License, or (at your option) any later version.
  14  *
  15  *      Changes:
  16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
  17  *                              extension headers are implemented.
  18  *                              route changes now work.
  19  *                              ip6_forward does not confuse sniffers.
  20  *                              etc.
  21  *
  22  *      H. von Brand    :       Added missing #include <linux/string.h>
  23  *      Imran Patel     :       frag id should be in NBO
  24  *      Kazunori MIYAZAWA @USAGI
  25  *                      :       add ip6_append_data and related functions
  26  *                              for datagram xmit
  27  */
  28
  29 #include <linux/errno.h>
  30 #include <linux/kernel.h>
  31 #include <linux/string.h>
  32 #include <linux/socket.h>
  33 #include <linux/net.h>
  34 #include <linux/netdevice.h>
  35 #include <linux/if_arp.h>
  36 #include <linux/in6.h>
  37 #include <linux/tcp.h>
  38 #include <linux/route.h>
  39 #include <linux/module.h>
  40 #include <linux/slab.h>
  41
  42 #include <linux/netfilter.h>
  43 #include <linux/netfilter_ipv6.h>
  44
  45 #include <net/sock.h>
  46 #include <net/snmp.h>
  47
  48 #include <net/ipv6.h>
  49 #include <net/ndisc.h>
  50 #include <net/protocol.h>
  51 #include <net/ip6_route.h>
  52 #include <net/addrconf.h>
  53 #include <net/rawv6.h>
  54 #include <net/icmp.h>
  55 #include <net/xfrm.h>
  56 #include <net/checksum.h>
  57 #include <linux/mroute6.h>
  58
  59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
  60
  61 int __ip6_local_out(struct sk_buff *skb)
  62 {
  63         int len;
  64
  65         len = skb->len - sizeof(struct ipv6hdr);
  66         if (len > IPV6_MAXPLEN)
  67                 len = 0;
  68         ipv6_hdr(skb)->payload_len = htons(len);
  69
  70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
  71                        skb_dst(skb)->dev, dst_output);
  72 }
  73
  /*
   *      Send a locally generated packet: run __ip6_local_out() and, when it
   *      returns 1, continue with dst_output().  Any other value returned by
   *      __ip6_local_out() is passed back to the caller unchanged.
   */
  int ip6_local_out(struct sk_buff *skb)
  {
          int rc = __ip6_local_out(skb);

          if (unlikely(rc != 1))
                  return rc;

          return dst_output(skb);
  }
  EXPORT_SYMBOL_GPL(ip6_local_out);
  85
  86 /* dev_loopback_xmit for use with netfilter. */
  87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
  88 {
  89         skb_reset_mac_header(newskb);
  90         __skb_pull(newskb, skb_network_offset(newskb));
  91         newskb->pkt_type = PACKET_LOOPBACK;
  92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
  93         WARN_ON(!skb_dst(newskb));
  94
  95         netif_rx_ni(newskb);
  96         return 0;
  97 }
  98
  99 static int ip6_finish_output2(struct sk_buff *skb)
 100 {
 101         struct dst_entry *dst = skb_dst(skb);
 102         struct net_device *dev = dst->dev;
 103         struct neighbour *neigh;
 104
 105         skb->protocol = htons(ETH_P_IPV6);
 106         skb->dev = dev;
 107
 108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
 109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 110
 111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
 112                     ((mroute6_socket(dev_net(dev), skb) &&
 113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 115                                          &ipv6_hdr(skb)->saddr))) {
 116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 117
 118                         /* Do not check for IFF_ALLMULTI; multicast routing
 119                            is not supported in any case.
 120                          */
 121                         if (newskb)
 122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 123                                         newskb, NULL, newskb->dev,
 124                                         ip6_dev_loopback_xmit);
 125
 126                         if (ipv6_hdr(skb)->hop_limit == 0) {
 127                                 IP6_INC_STATS(dev_net(dev), idev,
 128                                               IPSTATS_MIB_OUTDISCARDS);
 129                                 kfree_skb(skb);
 130                                 return 0;
 131                         }
 132                 }
 133
 134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
 135                                 skb->len);
 136         }
 137
 138         rcu_read_lock();
 139         neigh = dst_get_neighbour(dst);
 140         if (neigh) {
 141                 int res = neigh_output(neigh, skb);
 142
 143                 rcu_read_unlock();
 144                 return res;
 145         }
 146         rcu_read_unlock();
 147         IP6_INC_STATS_BH(dev_net(dst->dev),
 148                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 149         kfree_skb(skb);
 150         return -EINVAL;
 151 }
 152
 153 static int ip6_finish_output(struct sk_buff *skb)
 154 {
 155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 156             dst_allfrag(skb_dst(skb)))
 157                 return ip6_fragment(skb, ip6_finish_output2);
 158         else
 159                 return ip6_finish_output2(skb);
 160 }
 161
 162 int ip6_output(struct sk_buff *skb)
 163 {
 164         struct net_device *dev = skb_dst(skb)->dev;
 165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 166         if (unlikely(idev->cnf.disable_ipv6)) {
 167                 IP6_INC_STATS(dev_net(dev), idev,
 168                               IPSTATS_MIB_OUTDISCARDS);
 169                 kfree_skb(skb);
 170                 return 0;
 171         }
 172
 173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
 174                             ip6_finish_output,
 175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 176 }
 177
 178 /*
 179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 180  */
 181
 182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 183              struct ipv6_txoptions *opt, int tclass)
 184 {
 185         struct net *net = sock_net(sk);
 186         struct ipv6_pinfo *np = inet6_sk(sk);
 187         struct in6_addr *first_hop = &fl6->daddr;
 188         struct dst_entry *dst = skb_dst(skb);
 189         struct ipv6hdr *hdr;
 190         u8  proto = fl6->flowi6_proto;
 191         int seg_len = skb->len;
 192         int hlimit = -1;
 193         u32 mtu;
 194
 195         if (opt) {
 196                 unsigned int head_room;
 197
 198                 /* First: exthdrs may take lots of space (~8K for now)
 199                    MAX_HEADER is not enough.
 200                  */
 201                 head_room = opt->opt_nflen + opt->opt_flen;
 202                 seg_len += head_room;
 203                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 204
 205                 if (skb_headroom(skb) < head_room) {
 206                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 207                         if (skb2 == NULL) {
 208                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 209                                               IPSTATS_MIB_OUTDISCARDS);
 210                                 kfree_skb(skb);
 211                                 return -ENOBUFS;
 212                         }
 213                         kfree_skb(skb);
 214                         skb = skb2;
 215                         skb_set_owner_w(skb, sk);
 216                 }
 217                 if (opt->opt_flen)
 218                         ipv6_push_frag_opts(skb, opt, &proto);
 219                 if (opt->opt_nflen)
 220                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
 221         }
 222
 223         skb_push(skb, sizeof(struct ipv6hdr));
 224         skb_reset_network_header(skb);
 225         hdr = ipv6_hdr(skb);
 226
 227         /*
 228          *      Fill in the IPv6 header
 229          */
 230         if (np)
 231                 hlimit = np->hop_limit;
 232         if (hlimit < 0)
 233                 hlimit = ip6_dst_hoplimit(dst);
 234
 235         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
 236
 237         hdr->payload_len = htons(seg_len);
 238         hdr->nexthdr = proto;
 239         hdr->hop_limit = hlimit;
 240
 241         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
 242         ipv6_addr_copy(&hdr->daddr, first_hop);
 243
 244         skb->priority = sk->sk_priority;
 245         skb->mark = sk->sk_mark;
 246
 247         mtu = dst_mtu(dst);
 248         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
 249                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 250                               IPSTATS_MIB_OUT, skb->len);
 251                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
 252                                dst->dev, dst_output);
 253         }
 254
 255         if (net_ratelimit())
 256                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
 257         skb->dev = dst->dev;
 258         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 259         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 260         kfree_skb(skb);
 261         return -EMSGSIZE;
 262 }
 263
 264 EXPORT_SYMBOL(ip6_xmit);
 265
 266 /*
 267  *      To avoid extra problems ND packets are send through this
 268  *      routine. It's code duplication but I really want to avoid
 269  *      extra checks since ipv6_build_header is used by TCP (which
 270  *      is for us performance critical)
 271  */
 272
 273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
 274                const struct in6_addr *saddr, const struct in6_addr *daddr,
 275                int proto, int len)
 276 {
 277         struct ipv6_pinfo *np = inet6_sk(sk);
 278         struct ipv6hdr *hdr;
 279
 280         skb->protocol = htons(ETH_P_IPV6);
 281         skb->dev = dev;
 282
 283         skb_reset_network_header(skb);
 284         skb_put(skb, sizeof(struct ipv6hdr));
 285         hdr = ipv6_hdr(skb);
 286
 287         *(__be32*)hdr = htonl(0x60000000);
 288
 289         hdr->payload_len = htons(len);
 290         hdr->nexthdr = proto;
 291         hdr->hop_limit = np->hop_limit;
 292
 293         ipv6_addr_copy(&hdr->saddr, saddr);
 294         ipv6_addr_copy(&hdr->daddr, daddr);
 295
 296         return 0;
 297 }
 298
 299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 300 {
 301         struct ip6_ra_chain *ra;
 302         struct sock *last = NULL;
 303
 304         read_lock(&ip6_ra_lock);
 305         for (ra = ip6_ra_chain; ra; ra = ra->next) {
 306                 struct sock *sk = ra->sk;
 307                 if (sk && ra->sel == sel &&
 308                     (!sk->sk_bound_dev_if ||
 309                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
 310                         if (last) {
 311                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 312                                 if (skb2)
 313                                         rawv6_rcv(last, skb2);
 314                         }
 315                         last = sk;
 316                 }
 317         }
 318
 319         if (last) {
 320                 rawv6_rcv(last, skb);
 321                 read_unlock(&ip6_ra_lock);
 322                 return 1;
 323         }
 324         read_unlock(&ip6_ra_lock);
 325         return 0;
 326 }
 327
 328 static int ip6_forward_proxy_check(struct sk_buff *skb)
 329 {
 330         struct ipv6hdr *hdr = ipv6_hdr(skb);
 331         u8 nexthdr = hdr->nexthdr;
 332         int offset;
 333
 334         if (ipv6_ext_hdr(nexthdr)) {
 335                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
 336                 if (offset < 0)
 337                         return 0;
 338         } else
 339                 offset = sizeof(struct ipv6hdr);
 340
 341         if (nexthdr == IPPROTO_ICMPV6) {
 342                 struct icmp6hdr *icmp6;
 343
 344                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
 345                                          offset + 1 - skb->data)))
 346                         return 0;
 347
 348                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 349
 350                 switch (icmp6->icmp6_type) {
 351                 case NDISC_ROUTER_SOLICITATION:
 352                 case NDISC_ROUTER_ADVERTISEMENT:
 353                 case NDISC_NEIGHBOUR_SOLICITATION:
 354                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
 355                 case NDISC_REDIRECT:
 356                         /* For reaction involving unicast neighbor discovery
 357                          * message destined to the proxied address, pass it to
 358                          * input function.
 359                          */
 360                         return 1;
 361                 default:
 362                         break;
 363                 }
 364         }
 365
 366         /*
 367          * The proxying router can't forward traffic sent to a link-local
 368          * address, so signal the sender and discard the packet. This
 369          * behavior is clarified by the MIPv6 specification.
 370          */
 371         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 372                 dst_link_failure(skb);
 373                 return -1;
 374         }
 375
 376         return 0;
 377 }
 378
 /* Continuation of the NF_INET_FORWARD hook: send the packet out via its route. */
 static inline int ip6_forward_finish(struct sk_buff *skb)
 {
         int rc = dst_output(skb);

         return rc;
 }
 383
 384 int ip6_forward(struct sk_buff *skb)
 385 {
 386         struct dst_entry *dst = skb_dst(skb);
 387         struct ipv6hdr *hdr = ipv6_hdr(skb);
 388         struct inet6_skb_parm *opt = IP6CB(skb);
 389         struct net *net = dev_net(dst->dev);
 390         struct neighbour *n;
 391         u32 mtu;
 392
 393         if (net->ipv6.devconf_all->forwarding == 0)
 394                 goto error;
 395
 396         if (skb_warn_if_lro(skb))
 397                 goto drop;
 398
 399         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 400                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 401                 goto drop;
 402         }
 403
 404         if (skb->pkt_type != PACKET_HOST)
 405                 goto drop;
 406
 407         skb_forward_csum(skb);
 408
 409         /*
 410          *      We DO NOT make any processing on
 411          *      RA packets, pushing them to user level AS IS
 412          *      without ane WARRANTY that application will be able
 413          *      to interpret them. The reason is that we
 414          *      cannot make anything clever here.
 415          *
 416          *      We are not end-node, so that if packet contains
 417          *      AH/ESP, we cannot make anything.
 418          *      Defragmentation also would be mistake, RA packets
 419          *      cannot be fragmented, because there is no warranty
 420          *      that different fragments will go along one path. --ANK
 421          */
 422         if (opt->ra) {
 423                 u8 *ptr = skb_network_header(skb) + opt->ra;
 424                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
 425                         return 0;
 426         }
 427
 428         /*
 429          *      check and decrement ttl
 430          */
 431         if (hdr->hop_limit <= 1) {
 432                 /* Force OUTPUT device used as source address */
 433                 skb->dev = dst->dev;
 434                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 435                 IP6_INC_STATS_BH(net,
 436                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
 437
 438                 kfree_skb(skb);
 439                 return -ETIMEDOUT;
 440         }
 441
 442         /* XXX: idev->cnf.proxy_ndp? */
 443         if (net->ipv6.devconf_all->proxy_ndp &&
 444             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 445                 int proxied = ip6_forward_proxy_check(skb);
 446                 if (proxied > 0)
 447                         return ip6_input(skb);
 448                 else if (proxied < 0) {
 449                         IP6_INC_STATS(net, ip6_dst_idev(dst),
 450                                       IPSTATS_MIB_INDISCARDS);
 451                         goto drop;
 452                 }
 453         }
 454
 455         if (!xfrm6_route_forward(skb)) {
 456                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
 457                 goto drop;
 458         }
 459         dst = skb_dst(skb);
 460
 461         /* IPv6 specs say nothing about it, but it is clear that we cannot
 462            send redirects to source routed frames.
 463            We don't send redirects to frames decapsulated from IPsec.
 464          */
 465         n = dst_get_neighbour(dst);
 466         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
 467                 struct in6_addr *target = NULL;
 468                 struct rt6_info *rt;
 469
 470                 /*
 471                  *      incoming and outgoing devices are the same
 472                  *      send a redirect.
 473                  */
 474
 475                 rt = (struct rt6_info *) dst;
 476                 if ((rt->rt6i_flags & RTF_GATEWAY))
 477                         target = (struct in6_addr*)&n->primary_key;
 478                 else
 479                         target = &hdr->daddr;
 480
 481                 if (!rt->rt6i_peer)
 482                         rt6_bind_peer(rt, 1);
 483
 484                 /* Limit redirects both by destination (here)
 485                    and by source (inside ndisc_send_redirect)
 486                  */
 487                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
 488                         ndisc_send_redirect(skb, n, target);
 489         } else {
 490                 int addrtype = ipv6_addr_type(&hdr->saddr);
 491
 492                 /* This check is security critical. */
 493                 if (addrtype == IPV6_ADDR_ANY ||
 494                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 495                         goto error;
 496                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
 497                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 498                                     ICMPV6_NOT_NEIGHBOUR, 0);
 499                         goto error;
 500                 }
 501         }
 502
 503         mtu = dst_mtu(dst);
 504         if (mtu < IPV6_MIN_MTU)
 505                 mtu = IPV6_MIN_MTU;
 506
 507         if (skb->len > mtu && !skb_is_gso(skb)) {
 508                 /* Again, force OUTPUT device used as source address */
 509                 skb->dev = dst->dev;
 510                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 511                 IP6_INC_STATS_BH(net,
 512                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
 513                 IP6_INC_STATS_BH(net,
 514                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
 515                 kfree_skb(skb);
 516                 return -EMSGSIZE;
 517         }
 518
 519         if (skb_cow(skb, dst->dev->hard_header_len)) {
 520                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 521                 goto drop;
 522         }
 523
 524         hdr = ipv6_hdr(skb);
 525
 526         /* Mangling hops number delayed to point after skb COW */
 527
 528         hdr->hop_limit--;
 529
 530         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 531         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
 532                        ip6_forward_finish);
 533
 534 error:
 535         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 536 drop:
 537         kfree_skb(skb);
 538         return -EINVAL;
 539 }
 540
 541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 542 {
 543         to->pkt_type = from->pkt_type;
 544         to->priority = from->priority;
 545         to->protocol = from->protocol;
 546         skb_dst_drop(to);
 547         skb_dst_set(to, dst_clone(skb_dst(from)));
 548         to->dev = from->dev;
 549         to->mark = from->mark;
 550
 551 #ifdef CONFIG_NET_SCHED
 552         to->tc_index = from->tc_index;
 553 #endif
 554         nf_copy(to, from);
 555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
 556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
 557         to->nf_trace = from->nf_trace;
 558 #endif
 559         skb_copy_secmark(to, from);
 560 }
 561
 562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 563 {
 564         u16 offset = sizeof(struct ipv6hdr);
 565         struct ipv6_opt_hdr *exthdr =
 566                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 567         unsigned int packet_len = skb->tail - skb->network_header;
 568         int found_rhdr = 0;
 569         *nexthdr = &ipv6_hdr(skb)->nexthdr;
 570
 571         while (offset + 1 <= packet_len) {
 572
 573                 switch (**nexthdr) {
 574
 575                 case NEXTHDR_HOP:
 576                         break;
 577                 case NEXTHDR_ROUTING:
 578                         found_rhdr = 1;
 579                         break;
 580                 case NEXTHDR_DEST:
 581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 582                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
 583                                 break;
 584 #endif
 585                         if (found_rhdr)
 586                                 return offset;
 587                         break;
 588                 default :
 589                         return offset;
 590                 }
 591
 592                 offset += ipv6_optlen(exthdr);
 593                 *nexthdr = &exthdr->nexthdr;
 594                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 595                                                  offset);
 596         }
 597
 598         return offset;
 599 }
 600
 601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 602 {
 603         static atomic_t ipv6_fragmentation_id;
 604         int old, new;
 605
 606         if (rt && !(rt->dst.flags & DST_NOPEER)) {
 607                 struct inet_peer *peer;
 608
 609                 if (!rt->rt6i_peer)
 610                         rt6_bind_peer(rt, 1);
 611                 peer = rt->rt6i_peer;
 612                 if (peer) {
 613                         fhdr->identification = htonl(inet_getid(peer, 0));
 614                         return;
 615                 }
 616         }
 617         do {
 618                 old = atomic_read(&ipv6_fragmentation_id);
 619                 new = old + 1;
 620                 if (!new)
 621                         new = 1;
 622         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
 623         fhdr->identification = htonl(new);
 624 }
 625
 626 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 627 {
 628         struct sk_buff *frag;
 629         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
 630         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
 631         struct ipv6hdr *tmp_hdr;
 632         struct frag_hdr *fh;
 633         unsigned int mtu, hlen, left, len;
 634         __be32 frag_id = 0;
 635         int ptr, offset = 0, err=0;
 636         u8 *prevhdr, nexthdr = 0;
 637         struct net *net = dev_net(skb_dst(skb)->dev);
 638
 639         hlen = ip6_find_1stfragopt(skb, &prevhdr);
 640         nexthdr = *prevhdr;
 641
 642         mtu = ip6_skb_dst_mtu(skb);
 643
 644         /* We must not fragment if the socket is set to force MTU discovery
 645          * or if the skb is not generated by a local socket.
 646          */
 647         if (!skb->local_df && skb->len > mtu) {
 648                 skb->dev = skb_dst(skb)->dev;
 649                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 650                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 651                               IPSTATS_MIB_FRAGFAILS);
 652                 kfree_skb(skb);
 653                 return -EMSGSIZE;
 654         }
 655
 656         if (np && np->frag_size < mtu) {
 657                 if (np->frag_size)
 658                         mtu = np->frag_size;
 659         }
 660         mtu -= hlen + sizeof(struct frag_hdr);
 661
 662         if (skb_has_frag_list(skb)) {
 663                 int first_len = skb_pagelen(skb);
 664                 struct sk_buff *frag2;
 665
 666                 if (first_len - hlen > mtu ||
 667                     ((first_len - hlen) & 7) ||
 668                     skb_cloned(skb))
 669                         goto slow_path;
 670
 671                 skb_walk_frags(skb, frag) {
 672                         /* Correct geometry. */
 673                         if (frag->len > mtu ||
 674                             ((frag->len & 7) && frag->next) ||
 675                             skb_headroom(frag) < hlen)
 676                                 goto slow_path_clean;
 677
 678                         /* Partially cloned skb? */
 679                         if (skb_shared(frag))
 680                                 goto slow_path_clean;
 681
 682                         BUG_ON(frag->sk);
 683                         if (skb->sk) {
 684                                 frag->sk = skb->sk;
 685                                 frag->destructor = sock_wfree;
 686                         }
 687                         skb->truesize -= frag->truesize;
 688                 }
 689
 690                 err = 0;
 691                 offset = 0;
 692                 frag = skb_shinfo(skb)->frag_list;
 693                 skb_frag_list_init(skb);
 694                 /* BUILD HEADER */
 695
 696                 *prevhdr = NEXTHDR_FRAGMENT;
 697                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 698                 if (!tmp_hdr) {
 699                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 700                                       IPSTATS_MIB_FRAGFAILS);
 701                         return -ENOMEM;
 702                 }
 703
 704                 __skb_pull(skb, hlen);
 705                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
 706                 __skb_push(skb, hlen);
 707                 skb_reset_network_header(skb);
 708                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
 709
 710                 ipv6_select_ident(fh, rt);
 711                 fh->nexthdr = nexthdr;
 712                 fh->reserved = 0;
 713                 fh->frag_off = htons(IP6_MF);
 714                 frag_id = fh->identification;
 715
 716                 first_len = skb_pagelen(skb);
 717                 skb->data_len = first_len - skb_headlen(skb);
 718                 skb->len = first_len;
 719                 ipv6_hdr(skb)->payload_len = htons(first_len -
 720                                                    sizeof(struct ipv6hdr));
 721
 722                 dst_hold(&rt->dst);
 723
 724                 for (;;) {
 725                         /* Prepare header of the next frame,
 726                          * before previous one went down. */
 727                         if (frag) {
 728                                 frag->ip_summed = CHECKSUM_NONE;
 729                                 skb_reset_transport_header(frag);
 730                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
 731                                 __skb_push(frag, hlen);
 732                                 skb_reset_network_header(frag);
 733                                 memcpy(skb_network_header(frag), tmp_hdr,
 734                                        hlen);
 735                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
 736                                 fh->nexthdr = nexthdr;
 737                                 fh->reserved = 0;
 738                                 fh->frag_off = htons(offset);
 739                                 if (frag->next != NULL)
 740                                         fh->frag_off |= htons(IP6_MF);
 741                                 fh->identification = frag_id;
 742                                 ipv6_hdr(frag)->payload_len =
 743                                                 htons(frag->len -
 744                                                       sizeof(struct ipv6hdr));
 745                                 ip6_copy_metadata(frag, skb);
 746                         }
 747
 748                         err = output(skb);
 749                         if(!err)
 750                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 751                                               IPSTATS_MIB_FRAGCREATES);
 752
 753                         if (err || !frag)
 754                                 break;
 755
 756                         skb = frag;
 757                         frag = skb->next;
 758                         skb->next = NULL;
 759                 }
 760
 761                 kfree(tmp_hdr);
 762
 763                 if (err == 0) {
 764                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 765                                       IPSTATS_MIB_FRAGOKS);
 766                         dst_release(&rt->dst);
 767                         return 0;
 768                 }
 769
 770                 while (frag) {
 771                         skb = frag->next;
 772                         kfree_skb(frag);
 773                         frag = skb;
 774                 }
 775
 776                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 777                               IPSTATS_MIB_FRAGFAILS);
 778                 dst_release(&rt->dst);
 779                 return err;
 780
 781 slow_path_clean:
 782                 skb_walk_frags(skb, frag2) {
 783                         if (frag2 == frag)
 784                                 break;
 785                         frag2->sk = NULL;
 786                         frag2->destructor = NULL;
 787                         skb->truesize += frag2->truesize;
 788                 }
 789         }
 790
 791 slow_path:
 792         left = skb->len - hlen;         /* Space per frame */
 793         ptr = hlen;                     /* Where to start from */
 794
 795         /*
 796          *      Fragment the datagram.
 797          */
 798
 799         *prevhdr = NEXTHDR_FRAGMENT;
 800
 801         /*
 802          *      Keep copying data until we run out.
 803          */
 804         while(left > 0) {
 805                 len = left;
 806                 /* IF: it doesn't fit, use 'mtu' - the data space left */
 807                 if (len > mtu)
 808                         len = mtu;
 809                 /* IF: we are not sending up to and including the packet end
 810                    then align the next start on an eight byte boundary */
 811                 if (len < left) {
 812                         len &= ~7;
 813                 }
 814                 /*
 815                  *      Allocate buffer.
 816                  */
 817
 818                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
 819                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
 820                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 821                                       IPSTATS_MIB_FRAGFAILS);
 822                         err = -ENOMEM;
 823                         goto fail;
 824                 }
 825
 826                 /*
 827                  *      Set up data on packet
 828                  */
 829
 830                 ip6_copy_metadata(frag, skb);
 831                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
 832                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 833                 skb_reset_network_header(frag);
 834                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 835                 frag->transport_header = (frag->network_header + hlen +
 836                                           sizeof(struct frag_hdr));
 837
 838                 /*
 839                  *      Charge the memory for the fragment to any owner
 840                  *      it might possess
 841                  */
 842                 if (skb->sk)
 843                         skb_set_owner_w(frag, skb->sk);
 844
 845                 /*
 846                  *      Copy the packet header into the new buffer.
 847                  */
 848                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 849
 850                 /*
 851                  *      Build fragment header.
 852                  */
 853                 fh->nexthdr = nexthdr;
 854                 fh->reserved = 0;
 855                 if (!frag_id) {
 856                         ipv6_select_ident(fh, rt);
 857                         frag_id = fh->identification;
 858                 } else
 859                         fh->identification = frag_id;
 860
 861                 /*
 862                  *      Copy a block of the IP datagram.
 863                  */
 864                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
 865                         BUG();
 866                 left -= len;
 867
 868                 fh->frag_off = htons(offset);
 869                 if (left > 0)
 870                         fh->frag_off |= htons(IP6_MF);
 871                 ipv6_hdr(frag)->payload_len = htons(frag->len -
 872                                                     sizeof(struct ipv6hdr));
 873
 874                 ptr += len;
 875                 offset += len;
 876
 877                 /*
 878                  *      Put this fragment into the sending queue.
 879                  */
 880                 err = output(frag);
 881                 if (err)
 882                         goto fail;
 883
 884                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 885                               IPSTATS_MIB_FRAGCREATES);
 886         }
 887         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 888                       IPSTATS_MIB_FRAGOKS);
 889         kfree_skb(skb);
 890         return err;
 891
 892 fail:
 893         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 894                       IPSTATS_MIB_FRAGFAILS);
 895         kfree_skb(skb);
 896         return err;
 897 }
 898
 899 static inline int ip6_rt_check(const struct rt6key *rt_key,
 900                                const struct in6_addr *fl_addr,
 901                                const struct in6_addr *addr_cache)
 902 {
 903         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 904                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 905 }
 906
 907 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 908                                           struct dst_entry *dst,
 909                                           const struct flowi6 *fl6)
 910 {
 911         struct ipv6_pinfo *np = inet6_sk(sk);
 912         struct rt6_info *rt;
 913
 914         if (!dst)
 915                 goto out;
 916
 917         if (dst->ops->family != AF_INET6) {
 918                 dst_release(dst);
 919                 return NULL;
 920         }
 921
 922         rt = (struct rt6_info *)dst;
 923         /* Yes, checking route validity in not connected
 924          * case is not very simple. Take into account,
 925          * that we do not support routing by source, TOS,
 926          * and MSG_DONTROUTE            --ANK (980726)
 927          *
 928          * 1. ip6_rt_check(): If route was host route,
 929          *    check that cached destination is current.
 930          *    If it is network route, we still may
 931          *    check its validity using saved pointer
 932          *    to the last used address: daddr_cache.
 933          *    We do not want to save whole address now,
 934          *    (because main consumer of this service
 935          *    is tcp, which has not this problem),
 936          *    so that the last trick works only on connected
 937          *    sockets.
 938          * 2. oif also should be the same.
 939          */
 940         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 941 #ifdef CONFIG_IPV6_SUBTREES
 942             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 943 #endif
 944             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 945                 dst_release(dst);
 946                 dst = NULL;
 947         }
 948
 949 out:
 950         return dst;
 951 }
 952
 953 static int ip6_dst_lookup_tail(struct sock *sk,
 954                                struct dst_entry **dst, struct flowi6 *fl6)
 955 {
 956         struct net *net = sock_net(sk);
 957 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 958         struct neighbour *n;
 959 #endif
 960         int err;
 961
 962         if (*dst == NULL)
 963                 *dst = ip6_route_output(net, sk, fl6);
 964
 965         if ((err = (*dst)->error))
 966                 goto out_err_release;
 967
 968         if (ipv6_addr_any(&fl6->saddr)) {
 969                 struct rt6_info *rt = (struct rt6_info *) *dst;
 970                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 971                                           sk ? inet6_sk(sk)->srcprefs : 0,
 972                                           &fl6->saddr);
 973                 if (err)
 974                         goto out_err_release;
 975         }
 976
 977 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 978         /*
 979          * Here if the dst entry we've looked up
 980          * has a neighbour entry that is in the INCOMPLETE
 981          * state and the src address from the flow is
 982          * marked as OPTIMISTIC, we release the found
 983          * dst entry and replace it instead with the
 984          * dst entry of the nexthop router
 985          */
 986         rcu_read_lock();
 987         n = dst_get_neighbour(*dst);
 988         if (n && !(n->nud_state & NUD_VALID)) {
 989                 struct inet6_ifaddr *ifp;
 990                 struct flowi6 fl_gw6;
 991                 int redirect;
 992
 993                 rcu_read_unlock();
 994                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
 995                                       (*dst)->dev, 1);
 996
 997                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
 998                 if (ifp)
 999                         in6_ifa_put(ifp);
1000
1001                 if (redirect) {
1002                         /*
1003                          * We need to get the dst entry for the
1004                          * default router instead
1005                          */
1006                         dst_release(*dst);
1007                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1008                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1009                         *dst = ip6_route_output(net, sk, &fl_gw6);
1010                         if ((err = (*dst)->error))
1011                                 goto out_err_release;
1012                 }
1013         } else {
1014                 rcu_read_unlock();
1015         }
1016 #endif
1017
1018         return 0;
1019
1020 out_err_release:
1021         if (err == -ENETUNREACH)
1022                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1023         dst_release(*dst);
1024         *dst = NULL;
1025         return err;
1026 }
1027
/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      Looks up a route for @fl6 from scratch: whatever *@dst held on
 *      entry is overwritten, never reused.
 *
 *      Returns zero on success with the route stored in *@dst, or the
 *      error code reported by ip6_dst_lookup_tail().
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
        /* Force ip6_dst_lookup_tail() to do a fresh route lookup. */
        *dst = NULL;

        return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1044
1045 /**
1046  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1047  *      @sk: socket which provides route info
1048  *      @fl6: flow to lookup
1049  *      @final_dst: final destination address for ipsec lookup
1050  *      @can_sleep: we are in a sleepable context
1051  *
1052  *      This function performs a route lookup on the given flow.
1053  *
1054  *      It returns a valid dst pointer on success, or a pointer encoded
1055  *      error code.
1056  */
1057 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1058                                       const struct in6_addr *final_dst,
1059                                       bool can_sleep)
1060 {
1061         struct dst_entry *dst = NULL;
1062         int err;
1063
1064         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1065         if (err)
1066                 return ERR_PTR(err);
1067         if (final_dst)
1068                 ipv6_addr_copy(&fl6->daddr, final_dst);
1069         if (can_sleep)
1070                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1071
1072         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1073 }
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1075
1076 /**
1077  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1078  *      @sk: socket which provides the dst cache and route info
1079  *      @fl6: flow to lookup
1080  *      @final_dst: final destination address for ipsec lookup
1081  *      @can_sleep: we are in a sleepable context
1082  *
1083  *      This function performs a route lookup on the given flow with the
1084  *      possibility of using the cached route in the socket if it is valid.
1085  *      It will take the socket dst lock when operating on the dst cache.
1086  *      As a result, this function can only be used in process context.
1087  *
1088  *      It returns a valid dst pointer on success, or a pointer encoded
1089  *      error code.
1090  */
1091 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1092                                          const struct in6_addr *final_dst,
1093                                          bool can_sleep)
1094 {
1095         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1096         int err;
1097
1098         dst = ip6_sk_dst_check(sk, dst, fl6);
1099
1100         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1101         if (err)
1102                 return ERR_PTR(err);
1103         if (final_dst)
1104                 ipv6_addr_copy(&fl6->daddr, final_dst);
1105         if (can_sleep)
1106                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1107
1108         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1109 }
1110 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1111
1112 static inline int ip6_ufo_append_data(struct sock *sk,
1113                         int getfrag(void *from, char *to, int offset, int len,
1114                         int odd, struct sk_buff *skb),
1115                         void *from, int length, int hh_len, int fragheaderlen,
1116                         int transhdrlen, int mtu,unsigned int flags,
1117                         struct rt6_info *rt)
1118
1119 {
1120         struct sk_buff *skb;
1121         int err;
1122
1123         /* There is support for UDP large send offload by network
1124          * device, so create one single skb packet containing complete
1125          * udp datagram
1126          */
1127         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1128                 skb = sock_alloc_send_skb(sk,
1129                         hh_len + fragheaderlen + transhdrlen + 20,
1130                         (flags & MSG_DONTWAIT), &err);
1131                 if (skb == NULL)
1132                         return err;
1133
1134                 /* reserve space for Hardware header */
1135                 skb_reserve(skb, hh_len);
1136
1137                 /* create space for UDP/IP header */
1138                 skb_put(skb,fragheaderlen + transhdrlen);
1139
1140                 /* initialize network header pointer */
1141                 skb_reset_network_header(skb);
1142
1143                 /* initialize protocol header pointer */
1144                 skb->transport_header = skb->network_header + fragheaderlen;
1145
1146                 skb->ip_summed = CHECKSUM_PARTIAL;
1147                 skb->csum = 0;
1148         }
1149
1150         err = skb_append_datato_frags(sk,skb, getfrag, from,
1151                                       (length - transhdrlen));
1152         if (!err) {
1153                 struct frag_hdr fhdr;
1154
1155                 /* Specify the length of each IPv6 datagram fragment.
1156                  * It has to be a multiple of 8.
1157                  */
1158                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1159                                              sizeof(struct frag_hdr)) & ~7;
1160                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1161                 ipv6_select_ident(&fhdr, rt);
1162                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1163                 __skb_queue_tail(&sk->sk_write_queue, skb);
1164
1165                 return 0;
1166         }
1167         /* There is not enough support do UPD LSO,
1168          * so follow normal path
1169          */
1170         kfree_skb(skb);
1171
1172         return err;
1173 }
1174
1175 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1176                                                gfp_t gfp)
1177 {
1178         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1179 }
1180
1181 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1182                                                 gfp_t gfp)
1183 {
1184         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1185 }
1186
1187 static void ip6_append_data_mtu(int *mtu,
1188                                 int *maxfraglen,
1189                                 unsigned int fragheaderlen,
1190                                 struct sk_buff *skb,
1191                                 struct rt6_info *rt)
1192 {
1193         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1194                 if (skb == NULL) {
1195                         /* first fragment, reserve header_len */
1196                         *mtu = *mtu - rt->dst.header_len;
1197
1198                 } else {
1199                         /*
1200                          * this fragment is not first, the headers
1201                          * space is regarded as data space.
1202                          */
1203                         *mtu = dst_mtu(rt->dst.path);
1204                 }
1205                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1206                               + fragheaderlen - sizeof(struct frag_hdr);
1207         }
1208 }
1209
1210 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1211         int offset, int len, int odd, struct sk_buff *skb),
1212         void *from, int length, int transhdrlen,
1213         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1214         struct rt6_info *rt, unsigned int flags, int dontfrag)
1215 {
1216         struct inet_sock *inet = inet_sk(sk);
1217         struct ipv6_pinfo *np = inet6_sk(sk);
1218         struct inet_cork *cork;
1219         struct sk_buff *skb, *skb_prev = NULL;
1220         unsigned int maxfraglen, fragheaderlen;
1221         int exthdrlen;
1222         int dst_exthdrlen;
1223         int hh_len;
1224         int mtu;
1225         int copy;
1226         int err;
1227         int offset = 0;
1228         int csummode = CHECKSUM_NONE;
1229         __u8 tx_flags = 0;
1230
1231         if (flags&MSG_PROBE)
1232                 return 0;
1233         cork = &inet->cork.base;
1234         if (skb_queue_empty(&sk->sk_write_queue)) {
1235                 /*
1236                  * setup for corking
1237                  */
1238                 if (opt) {
1239                         if (WARN_ON(np->cork.opt))
1240                                 return -EINVAL;
1241
1242                         np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1243                         if (unlikely(np->cork.opt == NULL))
1244                                 return -ENOBUFS;
1245
1246                         np->cork.opt->tot_len = opt->tot_len;
1247                         np->cork.opt->opt_flen = opt->opt_flen;
1248                         np->cork.opt->opt_nflen = opt->opt_nflen;
1249
1250                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1251                                                             sk->sk_allocation);
1252                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1253                                 return -ENOBUFS;
1254
1255                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1256                                                             sk->sk_allocation);
1257                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1258                                 return -ENOBUFS;
1259
1260                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1261                                                            sk->sk_allocation);
1262                         if (opt->hopopt && !np->cork.opt->hopopt)
1263                                 return -ENOBUFS;
1264
1265                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1266                                                             sk->sk_allocation);
1267                         if (opt->srcrt && !np->cork.opt->srcrt)
1268                                 return -ENOBUFS;
1269
1270                         /* need source address above miyazawa*/
1271                 }
1272                 dst_hold(&rt->dst);
1273                 cork->dst = &rt->dst;
1274                 inet->cork.fl.u.ip6 = *fl6;
1275                 np->cork.hop_limit = hlimit;
1276                 np->cork.tclass = tclass;
1277                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1278                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1279                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1280                 else
1281                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1282                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1283                 if (np->frag_size < mtu) {
1284                         if (np->frag_size)
1285                                 mtu = np->frag_size;
1286                 }
1287                 cork->fragsize = mtu;
1288                 if (dst_allfrag(rt->dst.path))
1289                         cork->flags |= IPCORK_ALLFRAG;
1290                 cork->length = 0;
1291                 sk->sk_sndmsg_page = NULL;
1292                 sk->sk_sndmsg_off = 0;
1293                 exthdrlen = (opt ? opt->opt_flen : 0);
1294                 length += exthdrlen;
1295                 transhdrlen += exthdrlen;
1296                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1297         } else {
1298                 rt = (struct rt6_info *)cork->dst;
1299                 fl6 = &inet->cork.fl.u.ip6;
1300                 opt = np->cork.opt;
1301                 transhdrlen = 0;
1302                 exthdrlen = 0;
1303                 dst_exthdrlen = 0;
1304                 mtu = cork->fragsize;
1305         }
1306
1307         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1308
1309         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1310                         (opt ? opt->opt_nflen : 0);
1311         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1312
1313         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1314                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1315                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1316                         return -EMSGSIZE;
1317                 }
1318         }
1319
1320         /* For UDP, check if TX timestamp is enabled */
1321         if (sk->sk_type == SOCK_DGRAM) {
1322                 err = sock_tx_timestamp(sk, &tx_flags);
1323                 if (err)
1324                         goto error;
1325         }
1326
1327         /*
1328          * Let's try using as much space as possible.
1329          * Use MTU if total length of the message fits into the MTU.
1330          * Otherwise, we need to reserve fragment header and
1331          * fragment alignment (= 8-15 octects, in total).
1332          *
1333          * Note that we may need to "move" the data from the tail of
1334          * of the buffer to the new fragment when we split
1335          * the message.
1336          *
1337          * FIXME: It may be fragmented into multiple chunks
1338          *        at once if non-fragmentable extension headers
1339          *        are too large.
1340          * --yoshfuji
1341          */
1342
1343         cork->length += length;
1344         if (length > mtu) {
1345                 int proto = sk->sk_protocol;
1346                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1347                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1348                         return -EMSGSIZE;
1349                 }
1350
1351                 if (proto == IPPROTO_UDP &&
1352                     (rt->dst.dev->features & NETIF_F_UFO)) {
1353
1354                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1355                                                   hh_len, fragheaderlen,
1356                                                   transhdrlen, mtu, flags, rt);
1357                         if (err)
1358                                 goto error;
1359                         return 0;
1360                 }
1361         }
1362
1363         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1364                 goto alloc_new_skb;
1365
1366         while (length > 0) {
1367                 /* Check if the remaining data fits into current packet. */
1368                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1369                 if (copy < length)
1370                         copy = maxfraglen - skb->len;
1371
1372                 if (copy <= 0) {
1373                         char *data;
1374                         unsigned int datalen;
1375                         unsigned int fraglen;
1376                         unsigned int fraggap;
1377                         unsigned int alloclen;
1378 alloc_new_skb:
1379                         /* There's no room in the current skb */
1380                         if (skb)
1381                                 fraggap = skb->len - maxfraglen;
1382                         else
1383                                 fraggap = 0;
1384                         /* update mtu and maxfraglen if necessary */
1385                         if (skb == NULL || skb_prev == NULL)
1386                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1387                                                     fragheaderlen, skb, rt);
1388
1389                         skb_prev = skb;
1390
1391                         /*
1392                          * If remaining data exceeds the mtu,
1393                          * we know we need more fragment(s).
1394                          */
1395                         datalen = length + fraggap;
1396
1397                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1398                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1399                         if ((flags & MSG_MORE) &&
1400                             !(rt->dst.dev->features&NETIF_F_SG))
1401                                 alloclen = mtu;
1402                         else
1403                                 alloclen = datalen + fragheaderlen;
1404
1405                         alloclen += dst_exthdrlen;
1406
1407                         if (datalen != length + fraggap) {
1408                                 /*
1409                                  * this is not the last fragment, the trailer
1410                                  * space is regarded as data space.
1411                                  */
1412                                 datalen += rt->dst.trailer_len;
1413                         }
1414
1415                         alloclen += rt->dst.trailer_len;
1416                         fraglen = datalen + fragheaderlen;
1417
1418                         /*
1419                          * We just reserve space for fragment header.
1420                          * Note: this may be overallocation if the message
1421                          * (without MSG_MORE) fits into the MTU.
1422                          */
1423                         alloclen += sizeof(struct frag_hdr);
1424
1425                         if (transhdrlen) {
1426                                 skb = sock_alloc_send_skb(sk,
1427                                                 alloclen + hh_len,
1428                                                 (flags & MSG_DONTWAIT), &err);
1429                         } else {
1430                                 skb = NULL;
1431                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1432                                     2 * sk->sk_sndbuf)
1433                                         skb = sock_wmalloc(sk,
1434                                                            alloclen + hh_len, 1,
1435                                                            sk->sk_allocation);
1436                                 if (unlikely(skb == NULL))
1437                                         err = -ENOBUFS;
1438                                 else {
1439                                         /* Only the initial fragment
1440                                          * is time stamped.
1441                                          */
1442                                         tx_flags = 0;
1443                                 }
1444                         }
1445                         if (skb == NULL)
1446                                 goto error;
1447                         /*
1448                          *      Fill in the control structures
1449                          */
1450                         skb->ip_summed = csummode;
1451                         skb->csum = 0;
1452                         /* reserve for fragmentation and ipsec header */
1453                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1454                                     dst_exthdrlen);
1455
1456                         if (sk->sk_type == SOCK_DGRAM)
1457                                 skb_shinfo(skb)->tx_flags = tx_flags;
1458
1459                         /*
1460                          *      Find where to start putting bytes
1461                          */
1462                         data = skb_put(skb, fraglen);
1463                         skb_set_network_header(skb, exthdrlen);
1464                         data += fragheaderlen;
1465                         skb->transport_header = (skb->network_header +
1466                                                  fragheaderlen);
1467                         if (fraggap) {
1468                                 skb->csum = skb_copy_and_csum_bits(
1469                                         skb_prev, maxfraglen,
1470                                         data + transhdrlen, fraggap, 0);
1471                                 skb_prev->csum = csum_sub(skb_prev->csum,
1472                                                           skb->csum);
1473                                 data += fraggap;
1474                                 pskb_trim_unique(skb_prev, maxfraglen);
1475                         }
1476                         copy = datalen - transhdrlen - fraggap;
1477
1478                         if (copy < 0) {
1479                                 err = -EINVAL;
1480                                 kfree_skb(skb);
1481                                 goto error;
1482                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1483                                 err = -EFAULT;
1484                                 kfree_skb(skb);
1485                                 goto error;
1486                         }
1487
1488                         offset += copy;
1489                         length -= datalen - fraggap;
1490                         transhdrlen = 0;
1491                         exthdrlen = 0;
1492                         dst_exthdrlen = 0;
1493                         csummode = CHECKSUM_NONE;
1494
1495                         /*
1496                          * Put the packet on the pending queue
1497                          */
1498                         __skb_queue_tail(&sk->sk_write_queue, skb);
1499                         continue;
1500                 }
1501
1502                 if (copy > length)
1503                         copy = length;
1504
1505                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1506                         unsigned int off;
1507
1508                         off = skb->len;
1509                         if (getfrag(from, skb_put(skb, copy),
1510                                                 offset, copy, off, skb) < 0) {
1511                                 __skb_trim(skb, off);
1512                                 err = -EFAULT;
1513                                 goto error;
1514                         }
1515                 } else {
1516                         int i = skb_shinfo(skb)->nr_frags;
1517                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1518                         struct page *page = sk->sk_sndmsg_page;
1519                         int off = sk->sk_sndmsg_off;
1520                         unsigned int left;
1521
1522                         if (page && (left = PAGE_SIZE - off) > 0) {
1523                                 if (copy >= left)
1524                                         copy = left;
1525                                 if (page != skb_frag_page(frag)) {
1526                                         if (i == MAX_SKB_FRAGS) {
1527                                                 err = -EMSGSIZE;
1528                                                 goto error;
1529                                         }
1530                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1531                                         skb_frag_ref(skb, i);
1532                                         frag = &skb_shinfo(skb)->frags[i];
1533                                 }
1534                         } else if(i < MAX_SKB_FRAGS) {
1535                                 if (copy > PAGE_SIZE)
1536                                         copy = PAGE_SIZE;
1537                                 page = alloc_pages(sk->sk_allocation, 0);
1538                                 if (page == NULL) {
1539                                         err = -ENOMEM;
1540                                         goto error;
1541                                 }
1542                                 sk->sk_sndmsg_page = page;
1543                                 sk->sk_sndmsg_off = 0;
1544
1545                                 skb_fill_page_desc(skb, i, page, 0, 0);
1546                                 frag = &skb_shinfo(skb)->frags[i];
1547                         } else {
1548                                 err = -EMSGSIZE;
1549                                 goto error;
1550                         }
1551                         if (getfrag(from,
1552                                     skb_frag_address(frag) + skb_frag_size(frag),
1553                                     offset, copy, skb->len, skb) < 0) {
1554                                 err = -EFAULT;
1555                                 goto error;
1556                         }
1557                         sk->sk_sndmsg_off += copy;
1558                         skb_frag_size_add(frag, copy);
1559                         skb->len += copy;
1560                         skb->data_len += copy;
1561                         skb->truesize += copy;
1562                         atomic_add(copy, &sk->sk_wmem_alloc);
1563                 }
1564                 offset += copy;
1565                 length -= copy;
1566         }
1567         return 0;
1568 error:
1569         cork->length -= length;
1570         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1571         return err;
1572 }
1573
1574 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1575 {
1576         if (np->cork.opt) {
1577                 kfree(np->cork.opt->dst0opt);
1578                 kfree(np->cork.opt->dst1opt);
1579                 kfree(np->cork.opt->hopopt);
1580                 kfree(np->cork.opt->srcrt);
1581                 kfree(np->cork.opt);
1582                 np->cork.opt = NULL;
1583         }
1584
1585         if (inet->cork.base.dst) {
1586                 dst_release(inet->cork.base.dst);
1587                 inet->cork.base.dst = NULL;
1588                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1589         }
1590         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1591 }
1592
1593 int ip6_push_pending_frames(struct sock *sk)
1594 {
1595         struct sk_buff *skb, *tmp_skb;
1596         struct sk_buff **tail_skb;
1597         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1598         struct inet_sock *inet = inet_sk(sk);
1599         struct ipv6_pinfo *np = inet6_sk(sk);
1600         struct net *net = sock_net(sk);
1601         struct ipv6hdr *hdr;
1602         struct ipv6_txoptions *opt = np->cork.opt;
1603         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1604         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1605         unsigned char proto = fl6->flowi6_proto;
1606         int err = 0;
1607
1608         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1609                 goto out;
1610         tail_skb = &(skb_shinfo(skb)->frag_list);
1611
1612         /* move skb->data to ip header from ext header */
1613         if (skb->data < skb_network_header(skb))
1614                 __skb_pull(skb, skb_network_offset(skb));
1615         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1616                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1617                 *tail_skb = tmp_skb;
1618                 tail_skb = &(tmp_skb->next);
1619                 skb->len += tmp_skb->len;
1620                 skb->data_len += tmp_skb->len;
1621                 skb->truesize += tmp_skb->truesize;
1622                 tmp_skb->destructor = NULL;
1623                 tmp_skb->sk = NULL;
1624         }
1625
1626         /* Allow local fragmentation. */
1627         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1628                 skb->local_df = 1;
1629
1630         ipv6_addr_copy(final_dst, &fl6->daddr);
1631         __skb_pull(skb, skb_network_header_len(skb));
1632         if (opt && opt->opt_flen)
1633                 ipv6_push_frag_opts(skb, opt, &proto);
1634         if (opt && opt->opt_nflen)
1635                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1636
1637         skb_push(skb, sizeof(struct ipv6hdr));
1638         skb_reset_network_header(skb);
1639         hdr = ipv6_hdr(skb);
1640
1641         *(__be32*)hdr = fl6->flowlabel |
1642                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1643
1644         hdr->hop_limit = np->cork.hop_limit;
1645         hdr->nexthdr = proto;
1646         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1647         ipv6_addr_copy(&hdr->daddr, final_dst);
1648
1649         skb->priority = sk->sk_priority;
1650         skb->mark = sk->sk_mark;
1651
1652         skb_dst_set(skb, dst_clone(&rt->dst));
1653         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1654         if (proto == IPPROTO_ICMPV6) {
1655                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1656
1657                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1658                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1659         }
1660
1661         err = ip6_local_out(skb);
1662         if (err) {
1663                 if (err > 0)
1664                         err = net_xmit_errno(err);
1665                 if (err)
1666                         goto error;
1667         }
1668
1669 out:
1670         ip6_cork_release(inet, np);
1671         return err;
1672 error:
1673         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1674         goto out;
1675 }
1676
1677 void ip6_flush_pending_frames(struct sock *sk)
1678 {
1679         struct sk_buff *skb;
1680
1681         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1682                 if (skb_dst(skb))
1683                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1684                                       IPSTATS_MIB_OUTDISCARDS);
1685                 kfree_skb(skb);
1686         }
1687
1688         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1689 }