Revert "net: ip, ipv6: handle gso skbs in forwarding path"
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 * Run LOCAL_OUT netfilter processing for a locally generated packet
 * and, unless a hook stole or dropped it, hand it to dst_output()
 * for transmission.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int verdict = __ip6_local_out(skb);

	/* A verdict of 1 means the hooks passed the packet back to us. */
	return likely(verdict == 1) ? dst_output(skb) : verdict;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
/* dev_loopback_xmit for use with netfilter.
 * Re-injects a cloned multicast packet into the local receive path so
 * local listeners see a copy of what was sent on the wire.
 */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	/* Reshape the skb as if it had just arrived from a device:
	 * mac header at the head, data at the network header. */
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* Locally generated, so no checksum verification is needed. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	/* Feed the copy into the receive softirq path. */
	netif_rx_ni(newskb);
	return 0;
}
98
/*
 * Final transmit step: loop back multicast copies where required and
 * hand the packet to the neighbour layer for link-level output.
 * Returns the neighbour output result, 0 on a handled discard, or
 * -EINVAL when no neighbour entry is available.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket asks
		 * for multicast loopback and either a multicast-router
		 * socket wants it (and it was not already forwarded) or a
		 * local interface is joined to the group. */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* hop_limit 0 means "local delivery only": the
			 * loopback copy above suffices, drop the original. */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* Neighbour lookup is RCU-protected; the entry may vanish once
	 * we unlock, so transmit while still inside the read section. */
	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
/*
 * Transmit an sk_buff for a connected socket (TCP, SCTP, DCCP):
 * push any extension headers and the IPv6 header, then send through
 * the LOCAL_OUT netfilter hook.  Packets larger than the path MTU
 * that may not be fragmented get an ICMPV6_PKT_TOOBIG sent back to
 * the socket itself and -EMSGSIZE.  Consumes the skb on all paths.
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		/* Not enough headroom for the extension headers: switch to
		 * a reallocated copy and transfer ownership to the socket. */
		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		/* Push fragmentable then non-fragmentable extension headers;
		 * the latter may redirect first_hop (e.g. routing header). */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	/* Socket did not pin a hop limit: take the route's default. */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* Version 6, traffic class, and the flow label in one word. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Within the MTU, explicitly allowed to fragment locally
	 * (local_df), or GSO (segmented later): send it. */
	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big and not allowed to fragment: notify our own stack so
	 * path-MTU discovery on the socket can react. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are send through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         ipv6_addr_copy(&hdr->saddr, saddr);
294         ipv6_addr_copy(&hdr->daddr, daddr);
295
296         return 0;
297 }
298
/*
 * Deliver a Router Alert packet to every raw socket registered for
 * this RA selector (respecting device binding).  Each matching socket
 * except the last gets a clone; the last consumes the original skb.
 * Returns 1 if the packet was delivered (and consumed), 0 otherwise.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Deliver a clone to the previously found socket so
			 * the original can go to the final match. */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
327
/*
 * Decide how to treat a packet whose destination we proxy (proxy_ndp).
 * Returns 1 to divert the packet to local input (unicast neighbour
 * discovery aimed at the proxied address), -1 to drop it (link-local
 * destination, which a proxying router must not forward), and 0 to
 * continue normal forwarding.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	/* Locate the upper-layer header past any extension headers. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
378
/* Final step of forwarding after the FORWARD netfilter hook: hand the
 * packet to the route's output function. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
383
/*
 * Forward a received IPv6 packet: policy and sanity checks, Router
 * Alert diversion, hop-limit decrement, proxy-ND handling, redirect
 * generation, and MTU enforcement, then the FORWARD netfilter hook.
 * Consumes the skb on every path; returns 0 on success/diversion or
 * a negative errno on drop.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* LRO-merged skbs must not be forwarded (checksums/geometry). */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Only forward packets addressed to us at L2. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have switched the route. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	/* GSO packets are exempt from the MTU check: the stack segments
	 * them to the path MTU later in the output path. */
	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have copied the header; refetch the pointer. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
540
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543         to->pkt_type = from->pkt_type;
544         to->priority = from->priority;
545         to->protocol = from->protocol;
546         skb_dst_drop(to);
547         skb_dst_set(to, dst_clone(skb_dst(from)));
548         to->dev = from->dev;
549         to->mark = from->mark;
550
551 #ifdef CONFIG_NET_SCHED
552         to->tc_index = from->tc_index;
553 #endif
554         nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557         to->nf_trace = from->nf_trace;
558 #endif
559         skb_copy_secmark(to, from);
560 }
561
/*
 * Walk the extension-header chain and return the byte offset (from
 * the network header) where a Fragment header must be inserted: after
 * any Hop-by-Hop, Routing, and pre-Routing Destination headers.
 * On return *nexthdr points at the nexthdr byte to overwrite with
 * NEXTHDR_FRAGMENT.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			/* A Home Address option means this Destination
			 * header stays before the Fragment header too. */
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			/* A Destination header after a Routing header is
			 * the first fragmentable part. */
			if (found_rhdr)
				return offset;
			break;
		default :
			/* First non-per-fragment header: insert here. */
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
600
/*
 * Choose the fragment Identification value.  When the route has (or
 * can bind) an inet_peer, use the per-peer id generator so ids are
 * per-destination; otherwise fall back to a single global counter.
 */
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	/* Global fallback counter, shared across all destinations. */
	static atomic_t ipv6_fragmentation_id;
	int ident;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	ident = atomic_inc_return(&ipv6_fragmentation_id);
	fhdr->identification = htonl(ident);
}
620
621 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
622 {
623         struct sk_buff *frag;
624         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
625         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
626         struct ipv6hdr *tmp_hdr;
627         struct frag_hdr *fh;
628         unsigned int mtu, hlen, left, len;
629         __be32 frag_id = 0;
630         int ptr, offset = 0, err=0;
631         u8 *prevhdr, nexthdr = 0;
632         struct net *net = dev_net(skb_dst(skb)->dev);
633
634         hlen = ip6_find_1stfragopt(skb, &prevhdr);
635         nexthdr = *prevhdr;
636
637         mtu = ip6_skb_dst_mtu(skb);
638
639         /* We must not fragment if the socket is set to force MTU discovery
640          * or if the skb it not generated by a local socket.
641          */
642         if (!skb->local_df && skb->len > mtu) {
643                 skb->dev = skb_dst(skb)->dev;
644                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
645                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
646                               IPSTATS_MIB_FRAGFAILS);
647                 kfree_skb(skb);
648                 return -EMSGSIZE;
649         }
650
651         if (np && np->frag_size < mtu) {
652                 if (np->frag_size)
653                         mtu = np->frag_size;
654         }
655         mtu -= hlen + sizeof(struct frag_hdr);
656
657         if (skb_has_frag_list(skb)) {
658                 int first_len = skb_pagelen(skb);
659                 struct sk_buff *frag2;
660
661                 if (first_len - hlen > mtu ||
662                     ((first_len - hlen) & 7) ||
663                     skb_cloned(skb))
664                         goto slow_path;
665
666                 skb_walk_frags(skb, frag) {
667                         /* Correct geometry. */
668                         if (frag->len > mtu ||
669                             ((frag->len & 7) && frag->next) ||
670                             skb_headroom(frag) < hlen)
671                                 goto slow_path_clean;
672
673                         /* Partially cloned skb? */
674                         if (skb_shared(frag))
675                                 goto slow_path_clean;
676
677                         BUG_ON(frag->sk);
678                         if (skb->sk) {
679                                 frag->sk = skb->sk;
680                                 frag->destructor = sock_wfree;
681                         }
682                         skb->truesize -= frag->truesize;
683                 }
684
685                 err = 0;
686                 offset = 0;
687                 frag = skb_shinfo(skb)->frag_list;
688                 skb_frag_list_init(skb);
689                 /* BUILD HEADER */
690
691                 *prevhdr = NEXTHDR_FRAGMENT;
692                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
693                 if (!tmp_hdr) {
694                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
695                                       IPSTATS_MIB_FRAGFAILS);
696                         return -ENOMEM;
697                 }
698
699                 __skb_pull(skb, hlen);
700                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
701                 __skb_push(skb, hlen);
702                 skb_reset_network_header(skb);
703                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
704
705                 ipv6_select_ident(fh, rt);
706                 fh->nexthdr = nexthdr;
707                 fh->reserved = 0;
708                 fh->frag_off = htons(IP6_MF);
709                 frag_id = fh->identification;
710
711                 first_len = skb_pagelen(skb);
712                 skb->data_len = first_len - skb_headlen(skb);
713                 skb->len = first_len;
714                 ipv6_hdr(skb)->payload_len = htons(first_len -
715                                                    sizeof(struct ipv6hdr));
716
717                 dst_hold(&rt->dst);
718
719                 for (;;) {
720                         /* Prepare header of the next frame,
721                          * before previous one went down. */
722                         if (frag) {
723                                 frag->ip_summed = CHECKSUM_NONE;
724                                 skb_reset_transport_header(frag);
725                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
726                                 __skb_push(frag, hlen);
727                                 skb_reset_network_header(frag);
728                                 memcpy(skb_network_header(frag), tmp_hdr,
729                                        hlen);
730                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
731                                 fh->nexthdr = nexthdr;
732                                 fh->reserved = 0;
733                                 fh->frag_off = htons(offset);
734                                 if (frag->next != NULL)
735                                         fh->frag_off |= htons(IP6_MF);
736                                 fh->identification = frag_id;
737                                 ipv6_hdr(frag)->payload_len =
738                                                 htons(frag->len -
739                                                       sizeof(struct ipv6hdr));
740                                 ip6_copy_metadata(frag, skb);
741                         }
742
743                         err = output(skb);
744                         if(!err)
745                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
746                                               IPSTATS_MIB_FRAGCREATES);
747
748                         if (err || !frag)
749                                 break;
750
751                         skb = frag;
752                         frag = skb->next;
753                         skb->next = NULL;
754                 }
755
756                 kfree(tmp_hdr);
757
758                 if (err == 0) {
759                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
760                                       IPSTATS_MIB_FRAGOKS);
761                         dst_release(&rt->dst);
762                         return 0;
763                 }
764
765                 while (frag) {
766                         skb = frag->next;
767                         kfree_skb(frag);
768                         frag = skb;
769                 }
770
771                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
772                               IPSTATS_MIB_FRAGFAILS);
773                 dst_release(&rt->dst);
774                 return err;
775
776 slow_path_clean:
777                 skb_walk_frags(skb, frag2) {
778                         if (frag2 == frag)
779                                 break;
780                         frag2->sk = NULL;
781                         frag2->destructor = NULL;
782                         skb->truesize += frag2->truesize;
783                 }
784         }
785
786 slow_path:
787         left = skb->len - hlen;         /* Space per frame */
788         ptr = hlen;                     /* Where to start from */
789
790         /*
791          *      Fragment the datagram.
792          */
793
794         *prevhdr = NEXTHDR_FRAGMENT;
795
796         /*
797          *      Keep copying data until we run out.
798          */
799         while(left > 0) {
800                 len = left;
801                 /* IF: it doesn't fit, use 'mtu' - the data space left */
802                 if (len > mtu)
803                         len = mtu;
804                 /* IF: we are not sending up to and including the packet end
805                    then align the next start on an eight byte boundary */
806                 if (len < left) {
807                         len &= ~7;
808                 }
809                 /*
810                  *      Allocate buffer.
811                  */
812
813                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
814                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
815                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
816                                       IPSTATS_MIB_FRAGFAILS);
817                         err = -ENOMEM;
818                         goto fail;
819                 }
820
821                 /*
822                  *      Set up data on packet
823                  */
824
825                 ip6_copy_metadata(frag, skb);
826                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
827                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
828                 skb_reset_network_header(frag);
829                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
830                 frag->transport_header = (frag->network_header + hlen +
831                                           sizeof(struct frag_hdr));
832
833                 /*
834                  *      Charge the memory for the fragment to any owner
835                  *      it might possess
836                  */
837                 if (skb->sk)
838                         skb_set_owner_w(frag, skb->sk);
839
840                 /*
841                  *      Copy the packet header into the new buffer.
842                  */
843                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
844
845                 /*
846                  *      Build fragment header.
847                  */
848                 fh->nexthdr = nexthdr;
849                 fh->reserved = 0;
850                 if (!frag_id) {
851                         ipv6_select_ident(fh, rt);
852                         frag_id = fh->identification;
853                 } else
854                         fh->identification = frag_id;
855
856                 /*
857                  *      Copy a block of the IP datagram.
858                  */
859                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
860                         BUG();
861                 left -= len;
862
863                 fh->frag_off = htons(offset);
864                 if (left > 0)
865                         fh->frag_off |= htons(IP6_MF);
866                 ipv6_hdr(frag)->payload_len = htons(frag->len -
867                                                     sizeof(struct ipv6hdr));
868
869                 ptr += len;
870                 offset += len;
871
872                 /*
873                  *      Put this fragment into the sending queue.
874                  */
875                 err = output(frag);
876                 if (err)
877                         goto fail;
878
879                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
880                               IPSTATS_MIB_FRAGCREATES);
881         }
882         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
883                       IPSTATS_MIB_FRAGOKS);
884         kfree_skb(skb);
885         return err;
886
887 fail:
888         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
889                       IPSTATS_MIB_FRAGFAILS);
890         kfree_skb(skb);
891         return err;
892 }
893
894 static inline int ip6_rt_check(const struct rt6key *rt_key,
895                                const struct in6_addr *fl_addr,
896                                const struct in6_addr *addr_cache)
897 {
898         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
899                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
900 }
901
902 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
903                                           struct dst_entry *dst,
904                                           const struct flowi6 *fl6)
905 {
906         struct ipv6_pinfo *np = inet6_sk(sk);
907         struct rt6_info *rt;
908
909         if (!dst)
910                 goto out;
911
912         if (dst->ops->family != AF_INET6) {
913                 dst_release(dst);
914                 return NULL;
915         }
916
917         rt = (struct rt6_info *)dst;
918         /* Yes, checking route validity in not connected
919          * case is not very simple. Take into account,
920          * that we do not support routing by source, TOS,
921          * and MSG_DONTROUTE            --ANK (980726)
922          *
923          * 1. ip6_rt_check(): If route was host route,
924          *    check that cached destination is current.
925          *    If it is network route, we still may
926          *    check its validity using saved pointer
927          *    to the last used address: daddr_cache.
928          *    We do not want to save whole address now,
929          *    (because main consumer of this service
930          *    is tcp, which has not this problem),
931          *    so that the last trick works only on connected
932          *    sockets.
933          * 2. oif also should be the same.
934          */
935         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
936 #ifdef CONFIG_IPV6_SUBTREES
937             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
938 #endif
939             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
940                 dst_release(dst);
941                 dst = NULL;
942         }
943
944 out:
945         return dst;
946 }
947
/*
 * Common tail of the dst lookup helpers: perform the route lookup for
 * @fl6 when *@dst is NULL, select a source address if the flow has
 * none, and (with optimistic DAD) possibly redirect to the default
 * router's dst.  On failure the dst reference is dropped, *@dst is set
 * to NULL and a negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	/* Caller may hand in a pre-checked dst; only look up a route
	 * when none was supplied.
	 */
	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	/* ip6_route_output() never returns NULL; errors are carried in
	 * dst->error.
	 */
	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		/* Unspecified source: derive one from the route,
		 * honouring the socket's source-address preferences.
		 */
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		/* Done with the neighbour pointer; drop RCU before the
		 * calls below, which may sleep or take references.
		 */
		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1022
1023 /**
1024  *      ip6_dst_lookup - perform route lookup on flow
1025  *      @sk: socket which provides route info
1026  *      @dst: pointer to dst_entry * for result
1027  *      @fl6: flow to lookup
1028  *
1029  *      This function performs a route lookup on the given flow.
1030  *
1031  *      It returns zero on success, or a standard errno code on error.
1032  */
1033 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1034 {
1035         *dst = NULL;
1036         return ip6_dst_lookup_tail(sk, dst, fl6);
1037 }
1038 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1039
1040 /**
1041  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1042  *      @sk: socket which provides route info
1043  *      @fl6: flow to lookup
1044  *      @final_dst: final destination address for ipsec lookup
1045  *      @can_sleep: we are in a sleepable context
1046  *
1047  *      This function performs a route lookup on the given flow.
1048  *
1049  *      It returns a valid dst pointer on success, or a pointer encoded
1050  *      error code.
1051  */
1052 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1053                                       const struct in6_addr *final_dst,
1054                                       bool can_sleep)
1055 {
1056         struct dst_entry *dst = NULL;
1057         int err;
1058
1059         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1060         if (err)
1061                 return ERR_PTR(err);
1062         if (final_dst)
1063                 ipv6_addr_copy(&fl6->daddr, final_dst);
1064         if (can_sleep)
1065                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1066
1067         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1068 }
1069 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1070
1071 /**
1072  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1073  *      @sk: socket which provides the dst cache and route info
1074  *      @fl6: flow to lookup
1075  *      @final_dst: final destination address for ipsec lookup
1076  *      @can_sleep: we are in a sleepable context
1077  *
1078  *      This function performs a route lookup on the given flow with the
1079  *      possibility of using the cached route in the socket if it is valid.
1080  *      It will take the socket dst lock when operating on the dst cache.
1081  *      As a result, this function can only be used in process context.
1082  *
1083  *      It returns a valid dst pointer on success, or a pointer encoded
1084  *      error code.
1085  */
1086 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1087                                          const struct in6_addr *final_dst,
1088                                          bool can_sleep)
1089 {
1090         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1091         int err;
1092
1093         dst = ip6_sk_dst_check(sk, dst, fl6);
1094
1095         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1096         if (err)
1097                 return ERR_PTR(err);
1098         if (final_dst)
1099                 ipv6_addr_copy(&fl6->daddr, final_dst);
1100         if (can_sleep)
1101                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1102
1103         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1104 }
1105 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1106
1107 static inline int ip6_ufo_append_data(struct sock *sk,
1108                         int getfrag(void *from, char *to, int offset, int len,
1109                         int odd, struct sk_buff *skb),
1110                         void *from, int length, int hh_len, int fragheaderlen,
1111                         int transhdrlen, int mtu,unsigned int flags,
1112                         struct rt6_info *rt)
1113
1114 {
1115         struct sk_buff *skb;
1116         int err;
1117
1118         /* There is support for UDP large send offload by network
1119          * device, so create one single skb packet containing complete
1120          * udp datagram
1121          */
1122         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1123                 struct frag_hdr fhdr;
1124
1125                 skb = sock_alloc_send_skb(sk,
1126                         hh_len + fragheaderlen + transhdrlen + 20,
1127                         (flags & MSG_DONTWAIT), &err);
1128                 if (skb == NULL)
1129                         return err;
1130
1131                 /* reserve space for Hardware header */
1132                 skb_reserve(skb, hh_len);
1133
1134                 /* create space for UDP/IP header */
1135                 skb_put(skb,fragheaderlen + transhdrlen);
1136
1137                 /* initialize network header pointer */
1138                 skb_reset_network_header(skb);
1139
1140                 /* initialize protocol header pointer */
1141                 skb->transport_header = skb->network_header + fragheaderlen;
1142
1143                 skb->ip_summed = CHECKSUM_PARTIAL;
1144                 skb->csum = 0;
1145
1146                 /* Specify the length of each IPv6 datagram fragment.
1147                  * It has to be a multiple of 8.
1148                  */
1149                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1150                                              sizeof(struct frag_hdr)) & ~7;
1151                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1152                 ipv6_select_ident(&fhdr, rt);
1153                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1154                 __skb_queue_tail(&sk->sk_write_queue, skb);
1155         }
1156
1157         return skb_append_datato_frags(sk, skb, getfrag, from,
1158                                        (length - transhdrlen));
1159 }
1160
1161 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1162                                                gfp_t gfp)
1163 {
1164         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1165 }
1166
1167 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1168                                                 gfp_t gfp)
1169 {
1170         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1171 }
1172
1173 static void ip6_append_data_mtu(unsigned int *mtu,
1174                                 int *maxfraglen,
1175                                 unsigned int fragheaderlen,
1176                                 struct sk_buff *skb,
1177                                 struct rt6_info *rt,
1178                                 unsigned int orig_mtu)
1179 {
1180         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1181                 if (skb == NULL) {
1182                         /* first fragment, reserve header_len */
1183                         *mtu = orig_mtu - rt->dst.header_len;
1184
1185                 } else {
1186                         /*
1187                          * this fragment is not first, the headers
1188                          * space is regarded as data space.
1189                          */
1190                         *mtu = orig_mtu;
1191                 }
1192                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1193                               + fragheaderlen - sizeof(struct frag_hdr);
1194         }
1195 }
1196
/*
 *	ip6_append_data: queue user data on sk->sk_write_queue as one or
 *	more pending fragments, to be sent later by
 *	ip6_push_pending_frames().  On the first call for a cork cycle
 *	(write queue empty) the routing/option state is captured into the
 *	cork; subsequent calls reuse it and ignore the passed rt/fl6/opt.
 *	Returns 0 on success or a negative errno; on error the queued
 *	data stays on the socket for the caller to flush or discard.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			/* Duplicate the tx options into the cork so they
			 * survive until the frames are pushed.
			 */
			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Pick the mtu: the device mtu when probing PMTU, else
		 * the path mtu (outer dst for xfrm tunnels).
		 */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		/* A user-set frag_size may shrink, never grow, the mtu. */
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Continuing a corked send: reuse the captured state. */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Largest fragment-aligned amount of header+data per skb. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	/* IPV6_DONTFRAG: report the path mtu instead of fragmenting. */
	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
					   sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	/* UDP fragmentation offload: hand the whole datagram to the
	 * device as a single GSO skb instead of fragmenting here.
	 */
	if (((length > mtu) ||
	     (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			/* fraggap: bytes past the fragment boundary in the
			 * previous skb that must move into the new one.
			 */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First skb of the datagram: may block. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Later fragments: allocate without
				 * blocking, bounded by 2 * sndbuf.
				 */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhang from the previous skb
				 * here, keeping its checksum consistent.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append into page fragments,
			 * reusing the socket's current send page when
			 * it still has room.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Undo the optimistic length accounting for the unwritten part. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1561
1562 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1563 {
1564         if (np->cork.opt) {
1565                 kfree(np->cork.opt->dst0opt);
1566                 kfree(np->cork.opt->dst1opt);
1567                 kfree(np->cork.opt->hopopt);
1568                 kfree(np->cork.opt->srcrt);
1569                 kfree(np->cork.opt);
1570                 np->cork.opt = NULL;
1571         }
1572
1573         if (inet->cork.base.dst) {
1574                 dst_release(inet->cork.base.dst);
1575                 inet->cork.base.dst = NULL;
1576                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1577         }
1578         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1579 }
1580
/*
 *	Transmit everything queued on sk->sk_write_queue as one IPv6
 *	datagram: the first queued skb becomes the head, the rest are
 *	chained onto its frag_list, the IPv6 header (plus any corked
 *	extension headers) is pushed, and the packet is handed to
 *	ip6_local_out().  Returns 0 or a negative errno; cork state is
 *	released on every path (success, empty queue, and error).
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	/* Nothing queued: just release the cork and return 0. */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the head's frag_list,
	 * transferring their length/truesize accounting to the head and
	 * detaching them from the socket (destructor/sk cleared so only
	 * the head skb owns the socket write-memory charge). */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	/* Push corked extension headers; these update proto and, for
	 * non-fragmentable options (e.g. a routing header), may rewrite
	 * the on-the-wire destination in final_dst. */
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* First 32-bit word: version 6, traffic class, flow label. */
	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* A positive return is a congestion notification; map it
		 * to an errno (or 0) before deciding whether to count a
		 * discard. */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1664
1665 void ip6_flush_pending_frames(struct sock *sk)
1666 {
1667         struct sk_buff *skb;
1668
1669         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1670                 if (skb_dst(skb))
1671                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1672                                       IPSTATS_MIB_OUTDISCARDS);
1673                 kfree_skb(skb);
1674         }
1675
1676         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1677 }