Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 * Send a locally generated packet: run __ip6_local_out() and, only when it
 * returns 1, continue with dst_output().  Any other value from
 * __ip6_local_out() is returned to the caller unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int ret = __ip6_local_out(skb);

	if (unlikely(ret != 1))
		return ret;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101         struct dst_entry *dst = skb_dst(skb);
102         struct net_device *dev = dst->dev;
103         struct neighbour *neigh;
104
105         skb->protocol = htons(ETH_P_IPV6);
106         skb->dev = dev;
107
108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112                     ((mroute6_socket(dev_net(dev), skb) &&
113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115                                          &ipv6_hdr(skb)->saddr))) {
116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118                         /* Do not check for IFF_ALLMULTI; multicast routing
119                            is not supported in any case.
120                          */
121                         if (newskb)
122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123                                         newskb, NULL, newskb->dev,
124                                         ip6_dev_loopback_xmit);
125
126                         if (ipv6_hdr(skb)->hop_limit == 0) {
127                                 IP6_INC_STATS(dev_net(dev), idev,
128                                               IPSTATS_MIB_OUTDISCARDS);
129                                 kfree_skb(skb);
130                                 return 0;
131                         }
132                 }
133
134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135                                 skb->len);
136         }
137
138         rcu_read_lock();
139         neigh = dst_get_neighbour(dst);
140         if (neigh) {
141                 int res = neigh_output(neigh, skb);
142
143                 rcu_read_unlock();
144                 return res;
145         }
146         rcu_read_unlock();
147         IP6_INC_STATS_BH(dev_net(dst->dev),
148                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149         kfree_skb(skb);
150         return -EINVAL;
151 }
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183              struct ipv6_txoptions *opt)
184 {
185         struct net *net = sock_net(sk);
186         struct ipv6_pinfo *np = inet6_sk(sk);
187         struct in6_addr *first_hop = &fl6->daddr;
188         struct dst_entry *dst = skb_dst(skb);
189         struct ipv6hdr *hdr;
190         u8  proto = fl6->flowi6_proto;
191         int seg_len = skb->len;
192         int hlimit = -1;
193         int tclass = 0;
194         u32 mtu;
195
196         if (opt) {
197                 unsigned int head_room;
198
199                 /* First: exthdrs may take lots of space (~8K for now)
200                    MAX_HEADER is not enough.
201                  */
202                 head_room = opt->opt_nflen + opt->opt_flen;
203                 seg_len += head_room;
204                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
205
206                 if (skb_headroom(skb) < head_room) {
207                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
208                         if (skb2 == NULL) {
209                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
210                                               IPSTATS_MIB_OUTDISCARDS);
211                                 kfree_skb(skb);
212                                 return -ENOBUFS;
213                         }
214                         kfree_skb(skb);
215                         skb = skb2;
216                         skb_set_owner_w(skb, sk);
217                 }
218                 if (opt->opt_flen)
219                         ipv6_push_frag_opts(skb, opt, &proto);
220                 if (opt->opt_nflen)
221                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
222         }
223
224         skb_push(skb, sizeof(struct ipv6hdr));
225         skb_reset_network_header(skb);
226         hdr = ipv6_hdr(skb);
227
228         /*
229          *      Fill in the IPv6 header
230          */
231         if (np) {
232                 tclass = np->tclass;
233                 hlimit = np->hop_limit;
234         }
235         if (hlimit < 0)
236                 hlimit = ip6_dst_hoplimit(dst);
237
238         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
239
240         hdr->payload_len = htons(seg_len);
241         hdr->nexthdr = proto;
242         hdr->hop_limit = hlimit;
243
244         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
245         ipv6_addr_copy(&hdr->daddr, first_hop);
246
247         skb->priority = sk->sk_priority;
248         skb->mark = sk->sk_mark;
249
250         mtu = dst_mtu(dst);
251         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
252                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
253                               IPSTATS_MIB_OUT, skb->len);
254                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
255                                dst->dev, dst_output);
256         }
257
258         if (net_ratelimit())
259                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
260         skb->dev = dst->dev;
261         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
262         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
263         kfree_skb(skb);
264         return -EMSGSIZE;
265 }
266
267 EXPORT_SYMBOL(ip6_xmit);
268
269 /*
270  *      To avoid extra problems ND packets are send through this
271  *      routine. It's code duplication but I really want to avoid
272  *      extra checks since ipv6_build_header is used by TCP (which
273  *      is for us performance critical)
274  */
275
276 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
277                const struct in6_addr *saddr, const struct in6_addr *daddr,
278                int proto, int len)
279 {
280         struct ipv6_pinfo *np = inet6_sk(sk);
281         struct ipv6hdr *hdr;
282
283         skb->protocol = htons(ETH_P_IPV6);
284         skb->dev = dev;
285
286         skb_reset_network_header(skb);
287         skb_put(skb, sizeof(struct ipv6hdr));
288         hdr = ipv6_hdr(skb);
289
290         *(__be32*)hdr = htonl(0x60000000);
291
292         hdr->payload_len = htons(len);
293         hdr->nexthdr = proto;
294         hdr->hop_limit = np->hop_limit;
295
296         ipv6_addr_copy(&hdr->saddr, saddr);
297         ipv6_addr_copy(&hdr->daddr, daddr);
298
299         return 0;
300 }
301
302 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
303 {
304         struct ip6_ra_chain *ra;
305         struct sock *last = NULL;
306
307         read_lock(&ip6_ra_lock);
308         for (ra = ip6_ra_chain; ra; ra = ra->next) {
309                 struct sock *sk = ra->sk;
310                 if (sk && ra->sel == sel &&
311                     (!sk->sk_bound_dev_if ||
312                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
313                         if (last) {
314                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
315                                 if (skb2)
316                                         rawv6_rcv(last, skb2);
317                         }
318                         last = sk;
319                 }
320         }
321
322         if (last) {
323                 rawv6_rcv(last, skb);
324                 read_unlock(&ip6_ra_lock);
325                 return 1;
326         }
327         read_unlock(&ip6_ra_lock);
328         return 0;
329 }
330
331 static int ip6_forward_proxy_check(struct sk_buff *skb)
332 {
333         struct ipv6hdr *hdr = ipv6_hdr(skb);
334         u8 nexthdr = hdr->nexthdr;
335         int offset;
336
337         if (ipv6_ext_hdr(nexthdr)) {
338                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
339                 if (offset < 0)
340                         return 0;
341         } else
342                 offset = sizeof(struct ipv6hdr);
343
344         if (nexthdr == IPPROTO_ICMPV6) {
345                 struct icmp6hdr *icmp6;
346
347                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
348                                          offset + 1 - skb->data)))
349                         return 0;
350
351                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
352
353                 switch (icmp6->icmp6_type) {
354                 case NDISC_ROUTER_SOLICITATION:
355                 case NDISC_ROUTER_ADVERTISEMENT:
356                 case NDISC_NEIGHBOUR_SOLICITATION:
357                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
358                 case NDISC_REDIRECT:
359                         /* For reaction involving unicast neighbor discovery
360                          * message destined to the proxied address, pass it to
361                          * input function.
362                          */
363                         return 1;
364                 default:
365                         break;
366                 }
367         }
368
369         /*
370          * The proxying router can't forward traffic sent to a link-local
371          * address, so signal the sender and discard the packet. This
372          * behavior is clarified by the MIPv6 specification.
373          */
374         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
375                 dst_link_failure(skb);
376                 return -1;
377         }
378
379         return 0;
380 }
381
/*
 * Continuation run once the NF_INET_FORWARD hook has accepted a packet:
 * pass it on to the output routine of its route.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
386
387 int ip6_forward(struct sk_buff *skb)
388 {
389         struct dst_entry *dst = skb_dst(skb);
390         struct ipv6hdr *hdr = ipv6_hdr(skb);
391         struct inet6_skb_parm *opt = IP6CB(skb);
392         struct net *net = dev_net(dst->dev);
393         struct neighbour *n;
394         u32 mtu;
395
396         if (net->ipv6.devconf_all->forwarding == 0)
397                 goto error;
398
399         if (skb_warn_if_lro(skb))
400                 goto drop;
401
402         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
403                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
404                 goto drop;
405         }
406
407         if (skb->pkt_type != PACKET_HOST)
408                 goto drop;
409
410         skb_forward_csum(skb);
411
412         /*
413          *      We DO NOT make any processing on
414          *      RA packets, pushing them to user level AS IS
415          *      without ane WARRANTY that application will be able
416          *      to interpret them. The reason is that we
417          *      cannot make anything clever here.
418          *
419          *      We are not end-node, so that if packet contains
420          *      AH/ESP, we cannot make anything.
421          *      Defragmentation also would be mistake, RA packets
422          *      cannot be fragmented, because there is no warranty
423          *      that different fragments will go along one path. --ANK
424          */
425         if (opt->ra) {
426                 u8 *ptr = skb_network_header(skb) + opt->ra;
427                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
428                         return 0;
429         }
430
431         /*
432          *      check and decrement ttl
433          */
434         if (hdr->hop_limit <= 1) {
435                 /* Force OUTPUT device used as source address */
436                 skb->dev = dst->dev;
437                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
438                 IP6_INC_STATS_BH(net,
439                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
440
441                 kfree_skb(skb);
442                 return -ETIMEDOUT;
443         }
444
445         /* XXX: idev->cnf.proxy_ndp? */
446         if (net->ipv6.devconf_all->proxy_ndp &&
447             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
448                 int proxied = ip6_forward_proxy_check(skb);
449                 if (proxied > 0)
450                         return ip6_input(skb);
451                 else if (proxied < 0) {
452                         IP6_INC_STATS(net, ip6_dst_idev(dst),
453                                       IPSTATS_MIB_INDISCARDS);
454                         goto drop;
455                 }
456         }
457
458         if (!xfrm6_route_forward(skb)) {
459                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
460                 goto drop;
461         }
462         dst = skb_dst(skb);
463
464         /* IPv6 specs say nothing about it, but it is clear that we cannot
465            send redirects to source routed frames.
466            We don't send redirects to frames decapsulated from IPsec.
467          */
468         n = dst_get_neighbour(dst);
469         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
470                 struct in6_addr *target = NULL;
471                 struct rt6_info *rt;
472
473                 /*
474                  *      incoming and outgoing devices are the same
475                  *      send a redirect.
476                  */
477
478                 rt = (struct rt6_info *) dst;
479                 if ((rt->rt6i_flags & RTF_GATEWAY))
480                         target = (struct in6_addr*)&n->primary_key;
481                 else
482                         target = &hdr->daddr;
483
484                 if (!rt->rt6i_peer)
485                         rt6_bind_peer(rt, 1);
486
487                 /* Limit redirects both by destination (here)
488                    and by source (inside ndisc_send_redirect)
489                  */
490                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
491                         ndisc_send_redirect(skb, n, target);
492         } else {
493                 int addrtype = ipv6_addr_type(&hdr->saddr);
494
495                 /* This check is security critical. */
496                 if (addrtype == IPV6_ADDR_ANY ||
497                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
498                         goto error;
499                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
500                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
501                                     ICMPV6_NOT_NEIGHBOUR, 0);
502                         goto error;
503                 }
504         }
505
506         mtu = dst_mtu(dst);
507         if (mtu < IPV6_MIN_MTU)
508                 mtu = IPV6_MIN_MTU;
509
510         if (skb->len > mtu && !skb_is_gso(skb)) {
511                 /* Again, force OUTPUT device used as source address */
512                 skb->dev = dst->dev;
513                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
514                 IP6_INC_STATS_BH(net,
515                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
516                 IP6_INC_STATS_BH(net,
517                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
518                 kfree_skb(skb);
519                 return -EMSGSIZE;
520         }
521
522         if (skb_cow(skb, dst->dev->hard_header_len)) {
523                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
524                 goto drop;
525         }
526
527         hdr = ipv6_hdr(skb);
528
529         /* Mangling hops number delayed to point after skb COW */
530
531         hdr->hop_limit--;
532
533         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
534         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
535                        ip6_forward_finish);
536
537 error:
538         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
539 drop:
540         kfree_skb(skb);
541         return -EINVAL;
542 }
543
544 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
545 {
546         to->pkt_type = from->pkt_type;
547         to->priority = from->priority;
548         to->protocol = from->protocol;
549         skb_dst_drop(to);
550         skb_dst_set(to, dst_clone(skb_dst(from)));
551         to->dev = from->dev;
552         to->mark = from->mark;
553
554 #ifdef CONFIG_NET_SCHED
555         to->tc_index = from->tc_index;
556 #endif
557         nf_copy(to, from);
558 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
559     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
560         to->nf_trace = from->nf_trace;
561 #endif
562         skb_copy_secmark(to, from);
563 }
564
565 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
566 {
567         u16 offset = sizeof(struct ipv6hdr);
568         struct ipv6_opt_hdr *exthdr =
569                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
570         unsigned int packet_len = skb->tail - skb->network_header;
571         int found_rhdr = 0;
572         *nexthdr = &ipv6_hdr(skb)->nexthdr;
573
574         while (offset + 1 <= packet_len) {
575
576                 switch (**nexthdr) {
577
578                 case NEXTHDR_HOP:
579                         break;
580                 case NEXTHDR_ROUTING:
581                         found_rhdr = 1;
582                         break;
583                 case NEXTHDR_DEST:
584 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
585                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
586                                 break;
587 #endif
588                         if (found_rhdr)
589                                 return offset;
590                         break;
591                 default :
592                         return offset;
593                 }
594
595                 offset += ipv6_optlen(exthdr);
596                 *nexthdr = &exthdr->nexthdr;
597                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
598                                                  offset);
599         }
600
601         return offset;
602 }
603
604 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
605 {
606         static atomic_t ipv6_fragmentation_id;
607         int old, new;
608
609         if (rt) {
610                 struct inet_peer *peer;
611
612                 if (!rt->rt6i_peer)
613                         rt6_bind_peer(rt, 1);
614                 peer = rt->rt6i_peer;
615                 if (peer) {
616                         fhdr->identification = htonl(inet_getid(peer, 0));
617                         return;
618                 }
619         }
620         do {
621                 old = atomic_read(&ipv6_fragmentation_id);
622                 new = old + 1;
623                 if (!new)
624                         new = 1;
625         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
626         fhdr->identification = htonl(new);
627 }
628
629 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
630 {
631         struct sk_buff *frag;
632         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
633         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
634         struct ipv6hdr *tmp_hdr;
635         struct frag_hdr *fh;
636         unsigned int mtu, hlen, left, len;
637         __be32 frag_id = 0;
638         int ptr, offset = 0, err=0;
639         u8 *prevhdr, nexthdr = 0;
640         struct net *net = dev_net(skb_dst(skb)->dev);
641
642         hlen = ip6_find_1stfragopt(skb, &prevhdr);
643         nexthdr = *prevhdr;
644
645         mtu = ip6_skb_dst_mtu(skb);
646
647         /* We must not fragment if the socket is set to force MTU discovery
648          * or if the skb is not generated by a local socket.
649          */
650         if (!skb->local_df && skb->len > mtu) {
651                 skb->dev = skb_dst(skb)->dev;
652                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
653                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
654                               IPSTATS_MIB_FRAGFAILS);
655                 kfree_skb(skb);
656                 return -EMSGSIZE;
657         }
658
659         if (np && np->frag_size < mtu) {
660                 if (np->frag_size)
661                         mtu = np->frag_size;
662         }
663         mtu -= hlen + sizeof(struct frag_hdr);
664
665         if (skb_has_frag_list(skb)) {
666                 int first_len = skb_pagelen(skb);
667                 struct sk_buff *frag2;
668
669                 if (first_len - hlen > mtu ||
670                     ((first_len - hlen) & 7) ||
671                     skb_cloned(skb))
672                         goto slow_path;
673
674                 skb_walk_frags(skb, frag) {
675                         /* Correct geometry. */
676                         if (frag->len > mtu ||
677                             ((frag->len & 7) && frag->next) ||
678                             skb_headroom(frag) < hlen)
679                                 goto slow_path_clean;
680
681                         /* Partially cloned skb? */
682                         if (skb_shared(frag))
683                                 goto slow_path_clean;
684
685                         BUG_ON(frag->sk);
686                         if (skb->sk) {
687                                 frag->sk = skb->sk;
688                                 frag->destructor = sock_wfree;
689                         }
690                         skb->truesize -= frag->truesize;
691                 }
692
693                 err = 0;
694                 offset = 0;
695                 frag = skb_shinfo(skb)->frag_list;
696                 skb_frag_list_init(skb);
697                 /* BUILD HEADER */
698
699                 *prevhdr = NEXTHDR_FRAGMENT;
700                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
701                 if (!tmp_hdr) {
702                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
703                                       IPSTATS_MIB_FRAGFAILS);
704                         return -ENOMEM;
705                 }
706
707                 __skb_pull(skb, hlen);
708                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
709                 __skb_push(skb, hlen);
710                 skb_reset_network_header(skb);
711                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
712
713                 ipv6_select_ident(fh, rt);
714                 fh->nexthdr = nexthdr;
715                 fh->reserved = 0;
716                 fh->frag_off = htons(IP6_MF);
717                 frag_id = fh->identification;
718
719                 first_len = skb_pagelen(skb);
720                 skb->data_len = first_len - skb_headlen(skb);
721                 skb->len = first_len;
722                 ipv6_hdr(skb)->payload_len = htons(first_len -
723                                                    sizeof(struct ipv6hdr));
724
725                 dst_hold(&rt->dst);
726
727                 for (;;) {
728                         /* Prepare header of the next frame,
729                          * before previous one went down. */
730                         if (frag) {
731                                 frag->ip_summed = CHECKSUM_NONE;
732                                 skb_reset_transport_header(frag);
733                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
734                                 __skb_push(frag, hlen);
735                                 skb_reset_network_header(frag);
736                                 memcpy(skb_network_header(frag), tmp_hdr,
737                                        hlen);
738                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
739                                 fh->nexthdr = nexthdr;
740                                 fh->reserved = 0;
741                                 fh->frag_off = htons(offset);
742                                 if (frag->next != NULL)
743                                         fh->frag_off |= htons(IP6_MF);
744                                 fh->identification = frag_id;
745                                 ipv6_hdr(frag)->payload_len =
746                                                 htons(frag->len -
747                                                       sizeof(struct ipv6hdr));
748                                 ip6_copy_metadata(frag, skb);
749                         }
750
751                         err = output(skb);
752                         if(!err)
753                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
754                                               IPSTATS_MIB_FRAGCREATES);
755
756                         if (err || !frag)
757                                 break;
758
759                         skb = frag;
760                         frag = skb->next;
761                         skb->next = NULL;
762                 }
763
764                 kfree(tmp_hdr);
765
766                 if (err == 0) {
767                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
768                                       IPSTATS_MIB_FRAGOKS);
769                         dst_release(&rt->dst);
770                         return 0;
771                 }
772
773                 while (frag) {
774                         skb = frag->next;
775                         kfree_skb(frag);
776                         frag = skb;
777                 }
778
779                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
780                               IPSTATS_MIB_FRAGFAILS);
781                 dst_release(&rt->dst);
782                 return err;
783
784 slow_path_clean:
785                 skb_walk_frags(skb, frag2) {
786                         if (frag2 == frag)
787                                 break;
788                         frag2->sk = NULL;
789                         frag2->destructor = NULL;
790                         skb->truesize += frag2->truesize;
791                 }
792         }
793
794 slow_path:
795         left = skb->len - hlen;         /* Space per frame */
796         ptr = hlen;                     /* Where to start from */
797
798         /*
799          *      Fragment the datagram.
800          */
801
802         *prevhdr = NEXTHDR_FRAGMENT;
803
804         /*
805          *      Keep copying data until we run out.
806          */
807         while(left > 0) {
808                 len = left;
809                 /* IF: it doesn't fit, use 'mtu' - the data space left */
810                 if (len > mtu)
811                         len = mtu;
812                 /* IF: we are not sending up to and including the packet end
813                    then align the next start on an eight byte boundary */
814                 if (len < left) {
815                         len &= ~7;
816                 }
817                 /*
818                  *      Allocate buffer.
819                  */
820
821                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
822                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
823                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
824                                       IPSTATS_MIB_FRAGFAILS);
825                         err = -ENOMEM;
826                         goto fail;
827                 }
828
829                 /*
830                  *      Set up data on packet
831                  */
832
833                 ip6_copy_metadata(frag, skb);
834                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
835                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
836                 skb_reset_network_header(frag);
837                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
838                 frag->transport_header = (frag->network_header + hlen +
839                                           sizeof(struct frag_hdr));
840
841                 /*
842                  *      Charge the memory for the fragment to any owner
843                  *      it might possess
844                  */
845                 if (skb->sk)
846                         skb_set_owner_w(frag, skb->sk);
847
848                 /*
849                  *      Copy the packet header into the new buffer.
850                  */
851                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
852
853                 /*
854                  *      Build fragment header.
855                  */
856                 fh->nexthdr = nexthdr;
857                 fh->reserved = 0;
858                 if (!frag_id) {
859                         ipv6_select_ident(fh, rt);
860                         frag_id = fh->identification;
861                 } else
862                         fh->identification = frag_id;
863
864                 /*
865                  *      Copy a block of the IP datagram.
866                  */
867                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
868                         BUG();
869                 left -= len;
870
871                 fh->frag_off = htons(offset);
872                 if (left > 0)
873                         fh->frag_off |= htons(IP6_MF);
874                 ipv6_hdr(frag)->payload_len = htons(frag->len -
875                                                     sizeof(struct ipv6hdr));
876
877                 ptr += len;
878                 offset += len;
879
880                 /*
881                  *      Put this fragment into the sending queue.
882                  */
883                 err = output(frag);
884                 if (err)
885                         goto fail;
886
887                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888                               IPSTATS_MIB_FRAGCREATES);
889         }
890         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
891                       IPSTATS_MIB_FRAGOKS);
892         kfree_skb(skb);
893         return err;
894
895 fail:
896         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
897                       IPSTATS_MIB_FRAGFAILS);
898         kfree_skb(skb);
899         return err;
900 }
901
902 static inline int ip6_rt_check(const struct rt6key *rt_key,
903                                const struct in6_addr *fl_addr,
904                                const struct in6_addr *addr_cache)
905 {
906         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
907                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
908 }
909
910 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
911                                           struct dst_entry *dst,
912                                           const struct flowi6 *fl6)
913 {
914         struct ipv6_pinfo *np = inet6_sk(sk);
915         struct rt6_info *rt = (struct rt6_info *)dst;
916
917         if (!dst)
918                 goto out;
919
920         /* Yes, checking route validity in not connected
921          * case is not very simple. Take into account,
922          * that we do not support routing by source, TOS,
923          * and MSG_DONTROUTE            --ANK (980726)
924          *
925          * 1. ip6_rt_check(): If route was host route,
926          *    check that cached destination is current.
927          *    If it is network route, we still may
928          *    check its validity using saved pointer
929          *    to the last used address: daddr_cache.
930          *    We do not want to save whole address now,
931          *    (because main consumer of this service
932          *    is tcp, which has not this problem),
933          *    so that the last trick works only on connected
934          *    sockets.
935          * 2. oif also should be the same.
936          */
937         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
938 #ifdef CONFIG_IPV6_SUBTREES
939             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
940 #endif
941             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
942                 dst_release(dst);
943                 dst = NULL;
944         }
945
946 out:
947         return dst;
948 }
949
950 static int ip6_dst_lookup_tail(struct sock *sk,
951                                struct dst_entry **dst, struct flowi6 *fl6)
952 {
953         struct net *net = sock_net(sk);
954 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
955         struct neighbour *n;
956 #endif
957         int err;
958
959         if (*dst == NULL)
960                 *dst = ip6_route_output(net, sk, fl6);
961
962         if ((err = (*dst)->error))
963                 goto out_err_release;
964
965         if (ipv6_addr_any(&fl6->saddr)) {
966                 struct rt6_info *rt = (struct rt6_info *) *dst;
967                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
968                                           sk ? inet6_sk(sk)->srcprefs : 0,
969                                           &fl6->saddr);
970                 if (err)
971                         goto out_err_release;
972         }
973
974 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
975         /*
976          * Here if the dst entry we've looked up
977          * has a neighbour entry that is in the INCOMPLETE
978          * state and the src address from the flow is
979          * marked as OPTIMISTIC, we release the found
980          * dst entry and replace it instead with the
981          * dst entry of the nexthop router
982          */
983         rcu_read_lock();
984         n = dst_get_neighbour(*dst);
985         if (n && !(n->nud_state & NUD_VALID)) {
986                 struct inet6_ifaddr *ifp;
987                 struct flowi6 fl_gw6;
988                 int redirect;
989
990                 rcu_read_unlock();
991                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
992                                       (*dst)->dev, 1);
993
994                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
995                 if (ifp)
996                         in6_ifa_put(ifp);
997
998                 if (redirect) {
999                         /*
1000                          * We need to get the dst entry for the
1001                          * default router instead
1002                          */
1003                         dst_release(*dst);
1004                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1005                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1006                         *dst = ip6_route_output(net, sk, &fl_gw6);
1007                         if ((err = (*dst)->error))
1008                                 goto out_err_release;
1009                 }
1010         } else {
1011                 rcu_read_unlock();
1012         }
1013 #endif
1014
1015         return 0;
1016
1017 out_err_release:
1018         if (err == -ENETUNREACH)
1019                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1020         dst_release(*dst);
1021         *dst = NULL;
1022         return err;
1023 }
1024
1025 /**
1026  *      ip6_dst_lookup - perform route lookup on flow
1027  *      @sk: socket which provides route info
1028  *      @dst: pointer to dst_entry * for result
1029  *      @fl6: flow to lookup
1030  *
1031  *      This function performs a route lookup on the given flow.
1032  *
1033  *      It returns zero on success, or a standard errno code on error.
1034  */
1035 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1036 {
1037         *dst = NULL;
1038         return ip6_dst_lookup_tail(sk, dst, fl6);
1039 }
1040 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1041
1042 /**
1043  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1044  *      @sk: socket which provides route info
1045  *      @fl6: flow to lookup
1046  *      @final_dst: final destination address for ipsec lookup
1047  *      @can_sleep: we are in a sleepable context
1048  *
1049  *      This function performs a route lookup on the given flow.
1050  *
1051  *      It returns a valid dst pointer on success, or a pointer encoded
1052  *      error code.
1053  */
1054 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1055                                       const struct in6_addr *final_dst,
1056                                       bool can_sleep)
1057 {
1058         struct dst_entry *dst = NULL;
1059         int err;
1060
1061         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1062         if (err)
1063                 return ERR_PTR(err);
1064         if (final_dst)
1065                 ipv6_addr_copy(&fl6->daddr, final_dst);
1066         if (can_sleep)
1067                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1068
1069         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1070 }
1071 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1072
1073 /**
1074  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1075  *      @sk: socket which provides the dst cache and route info
1076  *      @fl6: flow to lookup
1077  *      @final_dst: final destination address for ipsec lookup
1078  *      @can_sleep: we are in a sleepable context
1079  *
1080  *      This function performs a route lookup on the given flow with the
1081  *      possibility of using the cached route in the socket if it is valid.
1082  *      It will take the socket dst lock when operating on the dst cache.
1083  *      As a result, this function can only be used in process context.
1084  *
1085  *      It returns a valid dst pointer on success, or a pointer encoded
1086  *      error code.
1087  */
1088 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1089                                          const struct in6_addr *final_dst,
1090                                          bool can_sleep)
1091 {
1092         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1093         int err;
1094
1095         dst = ip6_sk_dst_check(sk, dst, fl6);
1096
1097         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1098         if (err)
1099                 return ERR_PTR(err);
1100         if (final_dst)
1101                 ipv6_addr_copy(&fl6->daddr, final_dst);
1102         if (can_sleep)
1103                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1104
1105         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1106 }
1107 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1108
1109 static inline int ip6_ufo_append_data(struct sock *sk,
1110                         int getfrag(void *from, char *to, int offset, int len,
1111                         int odd, struct sk_buff *skb),
1112                         void *from, int length, int hh_len, int fragheaderlen,
1113                         int transhdrlen, int mtu,unsigned int flags,
1114                         struct rt6_info *rt)
1115
1116 {
1117         struct sk_buff *skb;
1118         int err;
1119
1120         /* There is support for UDP large send offload by network
1121          * device, so create one single skb packet containing complete
1122          * udp datagram
1123          */
1124         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1125                 skb = sock_alloc_send_skb(sk,
1126                         hh_len + fragheaderlen + transhdrlen + 20,
1127                         (flags & MSG_DONTWAIT), &err);
1128                 if (skb == NULL)
1129                         return -ENOMEM;
1130
1131                 /* reserve space for Hardware header */
1132                 skb_reserve(skb, hh_len);
1133
1134                 /* create space for UDP/IP header */
1135                 skb_put(skb,fragheaderlen + transhdrlen);
1136
1137                 /* initialize network header pointer */
1138                 skb_reset_network_header(skb);
1139
1140                 /* initialize protocol header pointer */
1141                 skb->transport_header = skb->network_header + fragheaderlen;
1142
1143                 skb->ip_summed = CHECKSUM_PARTIAL;
1144                 skb->csum = 0;
1145         }
1146
1147         err = skb_append_datato_frags(sk,skb, getfrag, from,
1148                                       (length - transhdrlen));
1149         if (!err) {
1150                 struct frag_hdr fhdr;
1151
1152                 /* Specify the length of each IPv6 datagram fragment.
1153                  * It has to be a multiple of 8.
1154                  */
1155                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1156                                              sizeof(struct frag_hdr)) & ~7;
1157                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1158                 ipv6_select_ident(&fhdr, rt);
1159                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1160                 __skb_queue_tail(&sk->sk_write_queue, skb);
1161
1162                 return 0;
1163         }
1164         /* There is not enough support do UPD LSO,
1165          * so follow normal path
1166          */
1167         kfree_skb(skb);
1168
1169         return err;
1170 }
1171
1172 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1173                                                gfp_t gfp)
1174 {
1175         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1176 }
1177
1178 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1179                                                 gfp_t gfp)
1180 {
1181         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1182 }
1183
1184 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1185         int offset, int len, int odd, struct sk_buff *skb),
1186         void *from, int length, int transhdrlen,
1187         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1188         struct rt6_info *rt, unsigned int flags, int dontfrag)
1189 {
1190         struct inet_sock *inet = inet_sk(sk);
1191         struct ipv6_pinfo *np = inet6_sk(sk);
1192         struct inet_cork *cork;
1193         struct sk_buff *skb;
1194         unsigned int maxfraglen, fragheaderlen;
1195         int exthdrlen;
1196         int hh_len;
1197         int mtu;
1198         int copy;
1199         int err;
1200         int offset = 0;
1201         int csummode = CHECKSUM_NONE;
1202         __u8 tx_flags = 0;
1203
1204         if (flags&MSG_PROBE)
1205                 return 0;
1206         cork = &inet->cork.base;
1207         if (skb_queue_empty(&sk->sk_write_queue)) {
1208                 /*
1209                  * setup for corking
1210                  */
1211                 if (opt) {
1212                         if (WARN_ON(np->cork.opt))
1213                                 return -EINVAL;
1214
1215                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1216                         if (unlikely(np->cork.opt == NULL))
1217                                 return -ENOBUFS;
1218
1219                         np->cork.opt->tot_len = opt->tot_len;
1220                         np->cork.opt->opt_flen = opt->opt_flen;
1221                         np->cork.opt->opt_nflen = opt->opt_nflen;
1222
1223                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1224                                                             sk->sk_allocation);
1225                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1226                                 return -ENOBUFS;
1227
1228                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1229                                                             sk->sk_allocation);
1230                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1231                                 return -ENOBUFS;
1232
1233                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1234                                                            sk->sk_allocation);
1235                         if (opt->hopopt && !np->cork.opt->hopopt)
1236                                 return -ENOBUFS;
1237
1238                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1239                                                             sk->sk_allocation);
1240                         if (opt->srcrt && !np->cork.opt->srcrt)
1241                                 return -ENOBUFS;
1242
1243                         /* need source address above miyazawa*/
1244                 }
1245                 dst_hold(&rt->dst);
1246                 cork->dst = &rt->dst;
1247                 inet->cork.fl.u.ip6 = *fl6;
1248                 np->cork.hop_limit = hlimit;
1249                 np->cork.tclass = tclass;
1250                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1251                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1252                 if (np->frag_size < mtu) {
1253                         if (np->frag_size)
1254                                 mtu = np->frag_size;
1255                 }
1256                 cork->fragsize = mtu;
1257                 if (dst_allfrag(rt->dst.path))
1258                         cork->flags |= IPCORK_ALLFRAG;
1259                 cork->length = 0;
1260                 sk->sk_sndmsg_page = NULL;
1261                 sk->sk_sndmsg_off = 0;
1262                 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1263                             rt->rt6i_nfheader_len;
1264                 length += exthdrlen;
1265                 transhdrlen += exthdrlen;
1266         } else {
1267                 rt = (struct rt6_info *)cork->dst;
1268                 fl6 = &inet->cork.fl.u.ip6;
1269                 opt = np->cork.opt;
1270                 transhdrlen = 0;
1271                 exthdrlen = 0;
1272                 mtu = cork->fragsize;
1273         }
1274
1275         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1276
1277         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1278                         (opt ? opt->opt_nflen : 0);
1279         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1280
1281         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1282                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1283                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1284                         return -EMSGSIZE;
1285                 }
1286         }
1287
1288         /* For UDP, check if TX timestamp is enabled */
1289         if (sk->sk_type == SOCK_DGRAM) {
1290                 err = sock_tx_timestamp(sk, &tx_flags);
1291                 if (err)
1292                         goto error;
1293         }
1294
1295         /*
1296          * Let's try using as much space as possible.
1297          * Use MTU if total length of the message fits into the MTU.
1298          * Otherwise, we need to reserve fragment header and
1299          * fragment alignment (= 8-15 octects, in total).
1300          *
1301          * Note that we may need to "move" the data from the tail of
1302          * of the buffer to the new fragment when we split
1303          * the message.
1304          *
1305          * FIXME: It may be fragmented into multiple chunks
1306          *        at once if non-fragmentable extension headers
1307          *        are too large.
1308          * --yoshfuji
1309          */
1310
1311         cork->length += length;
1312         if (length > mtu) {
1313                 int proto = sk->sk_protocol;
1314                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1315                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1316                         return -EMSGSIZE;
1317                 }
1318
1319                 if (proto == IPPROTO_UDP &&
1320                     (rt->dst.dev->features & NETIF_F_UFO)) {
1321
1322                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1323                                                   hh_len, fragheaderlen,
1324                                                   transhdrlen, mtu, flags, rt);
1325                         if (err)
1326                                 goto error;
1327                         return 0;
1328                 }
1329         }
1330
1331         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1332                 goto alloc_new_skb;
1333
1334         while (length > 0) {
1335                 /* Check if the remaining data fits into current packet. */
1336                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1337                 if (copy < length)
1338                         copy = maxfraglen - skb->len;
1339
1340                 if (copy <= 0) {
1341                         char *data;
1342                         unsigned int datalen;
1343                         unsigned int fraglen;
1344                         unsigned int fraggap;
1345                         unsigned int alloclen;
1346                         struct sk_buff *skb_prev;
1347 alloc_new_skb:
1348                         skb_prev = skb;
1349
1350                         /* There's no room in the current skb */
1351                         if (skb_prev)
1352                                 fraggap = skb_prev->len - maxfraglen;
1353                         else
1354                                 fraggap = 0;
1355
1356                         /*
1357                          * If remaining data exceeds the mtu,
1358                          * we know we need more fragment(s).
1359                          */
1360                         datalen = length + fraggap;
1361                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1362                                 datalen = maxfraglen - fragheaderlen;
1363
1364                         fraglen = datalen + fragheaderlen;
1365                         if ((flags & MSG_MORE) &&
1366                             !(rt->dst.dev->features&NETIF_F_SG))
1367                                 alloclen = mtu;
1368                         else
1369                                 alloclen = datalen + fragheaderlen;
1370
1371                         /*
1372                          * The last fragment gets additional space at tail.
1373                          * Note: we overallocate on fragments with MSG_MODE
1374                          * because we have no idea if we're the last one.
1375                          */
1376                         if (datalen == length + fraggap)
1377                                 alloclen += rt->dst.trailer_len;
1378
1379                         /*
1380                          * We just reserve space for fragment header.
1381                          * Note: this may be overallocation if the message
1382                          * (without MSG_MORE) fits into the MTU.
1383                          */
1384                         alloclen += sizeof(struct frag_hdr);
1385
1386                         if (transhdrlen) {
1387                                 skb = sock_alloc_send_skb(sk,
1388                                                 alloclen + hh_len,
1389                                                 (flags & MSG_DONTWAIT), &err);
1390                         } else {
1391                                 skb = NULL;
1392                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1393                                     2 * sk->sk_sndbuf)
1394                                         skb = sock_wmalloc(sk,
1395                                                            alloclen + hh_len, 1,
1396                                                            sk->sk_allocation);
1397                                 if (unlikely(skb == NULL))
1398                                         err = -ENOBUFS;
1399                                 else {
1400                                         /* Only the initial fragment
1401                                          * is time stamped.
1402                                          */
1403                                         tx_flags = 0;
1404                                 }
1405                         }
1406                         if (skb == NULL)
1407                                 goto error;
1408                         /*
1409                          *      Fill in the control structures
1410                          */
1411                         skb->ip_summed = csummode;
1412                         skb->csum = 0;
1413                         /* reserve for fragmentation */
1414                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1415
1416                         if (sk->sk_type == SOCK_DGRAM)
1417                                 skb_shinfo(skb)->tx_flags = tx_flags;
1418
1419                         /*
1420                          *      Find where to start putting bytes
1421                          */
1422                         data = skb_put(skb, fraglen);
1423                         skb_set_network_header(skb, exthdrlen);
1424                         data += fragheaderlen;
1425                         skb->transport_header = (skb->network_header +
1426                                                  fragheaderlen);
1427                         if (fraggap) {
1428                                 skb->csum = skb_copy_and_csum_bits(
1429                                         skb_prev, maxfraglen,
1430                                         data + transhdrlen, fraggap, 0);
1431                                 skb_prev->csum = csum_sub(skb_prev->csum,
1432                                                           skb->csum);
1433                                 data += fraggap;
1434                                 pskb_trim_unique(skb_prev, maxfraglen);
1435                         }
1436                         copy = datalen - transhdrlen - fraggap;
1437                         if (copy < 0) {
1438                                 err = -EINVAL;
1439                                 kfree_skb(skb);
1440                                 goto error;
1441                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1442                                 err = -EFAULT;
1443                                 kfree_skb(skb);
1444                                 goto error;
1445                         }
1446
1447                         offset += copy;
1448                         length -= datalen - fraggap;
1449                         transhdrlen = 0;
1450                         exthdrlen = 0;
1451                         csummode = CHECKSUM_NONE;
1452
1453                         /*
1454                          * Put the packet on the pending queue
1455                          */
1456                         __skb_queue_tail(&sk->sk_write_queue, skb);
1457                         continue;
1458                 }
1459
1460                 if (copy > length)
1461                         copy = length;
1462
1463                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1464                         unsigned int off;
1465
1466                         off = skb->len;
1467                         if (getfrag(from, skb_put(skb, copy),
1468                                                 offset, copy, off, skb) < 0) {
1469                                 __skb_trim(skb, off);
1470                                 err = -EFAULT;
1471                                 goto error;
1472                         }
1473                 } else {
1474                         int i = skb_shinfo(skb)->nr_frags;
1475                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1476                         struct page *page = sk->sk_sndmsg_page;
1477                         int off = sk->sk_sndmsg_off;
1478                         unsigned int left;
1479
1480                         if (page && (left = PAGE_SIZE - off) > 0) {
1481                                 if (copy >= left)
1482                                         copy = left;
1483                                 if (page != frag->page) {
1484                                         if (i == MAX_SKB_FRAGS) {
1485                                                 err = -EMSGSIZE;
1486                                                 goto error;
1487                                         }
1488                                         get_page(page);
1489                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1490                                         frag = &skb_shinfo(skb)->frags[i];
1491                                 }
1492                         } else if(i < MAX_SKB_FRAGS) {
1493                                 if (copy > PAGE_SIZE)
1494                                         copy = PAGE_SIZE;
1495                                 page = alloc_pages(sk->sk_allocation, 0);
1496                                 if (page == NULL) {
1497                                         err = -ENOMEM;
1498                                         goto error;
1499                                 }
1500                                 sk->sk_sndmsg_page = page;
1501                                 sk->sk_sndmsg_off = 0;
1502
1503                                 skb_fill_page_desc(skb, i, page, 0, 0);
1504                                 frag = &skb_shinfo(skb)->frags[i];
1505                         } else {
1506                                 err = -EMSGSIZE;
1507                                 goto error;
1508                         }
1509                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1510                                 err = -EFAULT;
1511                                 goto error;
1512                         }
1513                         sk->sk_sndmsg_off += copy;
1514                         frag->size += copy;
1515                         skb->len += copy;
1516                         skb->data_len += copy;
1517                         skb->truesize += copy;
1518                         atomic_add(copy, &sk->sk_wmem_alloc);
1519                 }
1520                 offset += copy;
1521                 length -= copy;
1522         }
1523         return 0;
1524 error:
1525         cork->length -= length;
1526         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1527         return err;
1528 }
1529
1530 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1531 {
1532         if (np->cork.opt) {
1533                 kfree(np->cork.opt->dst0opt);
1534                 kfree(np->cork.opt->dst1opt);
1535                 kfree(np->cork.opt->hopopt);
1536                 kfree(np->cork.opt->srcrt);
1537                 kfree(np->cork.opt);
1538                 np->cork.opt = NULL;
1539         }
1540
1541         if (inet->cork.base.dst) {
1542                 dst_release(inet->cork.base.dst);
1543                 inet->cork.base.dst = NULL;
1544                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1545         }
1546         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1547 }
1548
/*
 * ip6_push_pending_frames - turn the corked write queue into one IPv6
 * packet and send it.
 *
 * Dequeues every skb from sk->sk_write_queue.  The first one becomes the
 * head; each later skb is linked onto the head's frag_list and its len and
 * truesize are added to the head.  Extension headers from np->cork.opt (if
 * any) and then the fixed IPv6 header are pushed in front of the data.  The
 * header is filled from the corked flow (inet->cork.fl) and the cork state
 * (np->cork.tclass, np->cork.hop_limit).  The corked route is attached via
 * dst_clone(), output counters are updated and the skb is handed to
 * ip6_local_out().
 *
 * ip6_cork_release() is called on every exit path, including the one taken
 * when the queue is empty.
 *
 * Returns 0 on success or when nothing was queued.  A positive result from
 * ip6_local_out() is translated with net_xmit_errno(); if the value is
 * still non-zero it is returned and IPSTATS_MIB_OUTDISCARDS is incremented.
 */
1549 int ip6_push_pending_frames(struct sock *sk)
1550 {
1551         struct sk_buff *skb, *tmp_skb;
1552         struct sk_buff **tail_skb;
1553         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1554         struct inet_sock *inet = inet_sk(sk);
1555         struct ipv6_pinfo *np = inet6_sk(sk);
1556         struct net *net = sock_net(sk);
1557         struct ipv6hdr *hdr;
1558         struct ipv6_txoptions *opt = np->cork.opt;
1559         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1560         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1561         unsigned char proto = fl6->flowi6_proto;
1562         int err = 0;
1563
             /* Empty queue: err is still 0, but the cork state is released. */
1564         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1565                 goto out;
             /* tail_skb is the link slot where the next queued skb is stored. */
1566         tail_skb = &(skb_shinfo(skb)->frag_list);
1567
1568         /* move skb->data to ip header from ext header */
1569         if (skb->data < skb_network_header(skb))
1570                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Chain the remaining skbs behind the head.  The number of bytes
              * pulled from each one is computed on the head skb, not on tmp_skb.
              */
1571         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1572                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1573                 *tail_skb = tmp_skb;
1574                 tail_skb = &(tmp_skb->next);
1575                 skb->len += tmp_skb->len;
1576                 skb->data_len += tmp_skb->len;
1577                 skb->truesize += tmp_skb->truesize;
                     /*
                      * NOTE(review): presumably the chained skb is detached from
                      * the socket because its truesize is now carried by the head
                      * skb - confirm.
                      */
1578                 tmp_skb->destructor = NULL;
1579                 tmp_skb->sk = NULL;
1580         }
1581
1582         /* Allow local fragmentation. */
1583         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1584                 skb->local_df = 1;
1585
             /*
              * Start from the flow's destination.  final_dst is passed by
              * reference to ipv6_push_nfrag_opts() below, so that call may
              * point it at a different address.
              */
1586         ipv6_addr_copy(final_dst, &fl6->daddr);
             /*
              * Pull past the network header area, then push headers back on:
              * ipv6_push_frag_opts() first, ipv6_push_nfrag_opts() second and
              * the fixed IPv6 header last, so the fixed header ends up in front.
              * proto is passed by reference to both option helpers and is
              * written into hdr->nexthdr afterwards.
              */
1587         __skb_pull(skb, skb_network_header_len(skb));
1588         if (opt && opt->opt_flen)
1589                 ipv6_push_frag_opts(skb, opt, &proto);
1590         if (opt && opt->opt_nflen)
1591                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1592
1593         skb_push(skb, sizeof(struct ipv6hdr));
1594         skb_reset_network_header(skb);
1595         hdr = ipv6_hdr(skb);
1596
             /*
              * First 32-bit word of the header: version 6 in the top nibble
              * (0x60000000), the corked traffic class shifted to bit 20, and the
              * flow label.  fl6->flowlabel is OR-ed in outside htonl(), so it is
              * presumably already stored in network byte order.
              */
1597         *(__be32*)hdr = fl6->flowlabel |
1598                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1599
1600         hdr->hop_limit = np->cork.hop_limit;
1601         hdr->nexthdr = proto;
1602         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1603         ipv6_addr_copy(&hdr->daddr, final_dst);
1604
1605         skb->priority = sk->sk_priority;
1606         skb->mark = sk->sk_mark;
1607
             /*
              * The skb gets its own dst via dst_clone(); the cork's pointer in
              * inet->cork.base.dst is dropped separately by ip6_cork_release().
              */
1608         skb_dst_set(skb, dst_clone(&rt->dst));
1609         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
             /* ICMPv6 also bumps the per-type and total output-message counters. */
1610         if (proto == IPPROTO_ICMPV6) {
1611                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1612
1613                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1614                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1615         }
1616
             /*
              * Positive results are mapped through net_xmit_errno(); only a value
              * that is still non-zero after that is treated as a failure.
              */
1617         err = ip6_local_out(skb);
1618         if (err) {
1619                 if (err > 0)
1620                         err = net_xmit_errno(err);
1621                 if (err)
1622                         goto error;
1623         }
1624
1625 out:
1626         ip6_cork_release(inet, np);
1627         return err;
1628 error:
             /* Count the discard, then share the common release/return path. */
1629         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1630         goto out;
1631 }
1632
/*
 * Throw away everything still sitting on the socket's write queue, taking
 * buffers from the tail, and then release the cork state.
 */
1633 void ip6_flush_pending_frames(struct sock *sk)
1634 {
1635         struct sk_buff *buf = __skb_dequeue_tail(&sk->sk_write_queue);
1636
1637         while (buf != NULL) {
                     /* Only buffers that have a dst attached are counted. */
1638                 if (skb_dst(buf) != NULL)
1639                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(buf)),
1640                                       IPSTATS_MIB_OUTDISCARDS);
1641                 kfree_skb(buf);
1642                 buf = __skb_dequeue_tail(&sk->sk_write_queue);
1643         }
1644         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1645 }