ipv6: Count in extension headers in skb->network_header
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
int ip6_local_out(struct sk_buff *skb)
{
	int err = __ip6_local_out(skb);

	/* A return of 1 means the netfilter hook accepted the packet
	 * without queueing it; push it to the output path ourselves.
	 */
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
/* Final IPv6 output step: hand the skb to its neighbour entry for
 * transmission.  For multicast destinations, a copy may additionally be
 * looped back to local listeners and the OUTMCAST statistics updated.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to the local host when the socket asks
		 * for multicast loopback and either a multicast routing
		 * socket is listening (and the packet was not already
		 * forwarded) or this device has joined the destination
		 * group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* A hop limit of 0 means the original must not hit
			 * the wire: drop it here, only the loopback clone
			 * queued above survives.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* Neighbour lookup is done under RCU; a missing entry means the
	 * next hop cannot be resolved, so the packet is dropped.
	 */
	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
/* Transmit an skb as an IPv6 packet: push any extension headers from
 * @opt, build the IPv6 header from @fl6 and @tclass, then hand the
 * packet to the LOCAL_OUT netfilter hook / dst_output.
 *
 * The skb is consumed on every path.  Returns the netfilter verdict on
 * success, -ENOBUFS if headroom reallocation fails, or -EMSGSIZE if the
 * packet is over the MTU and must not be fragmented.
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		/* Not enough headroom: move the payload into a fresh skb
		 * and charge it to the socket's write allocation.
		 */
		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		/* Note &first_hop: the callee may update the next hop
		 * (e.g. when a routing header is pushed).
		 */
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* First 32-bit word: version 6, traffic class, flow label. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Over the MTU with DF semantics: report PKT_TOOBIG back to the
	 * sending socket itself instead of transmitting.
	 */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
263
264 EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are send through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         ipv6_addr_copy(&hdr->saddr, saddr);
294         ipv6_addr_copy(&hdr->daddr, daddr);
295
296         return 0;
297 }
298
/* Deliver a packet carrying a Router Alert option to the raw sockets on
 * ip6_ra_chain whose selector matches @sel (presumably registered via
 * the IPV6_ROUTER_ALERT sockopt -- confirm at the registration site).
 *
 * Every matching socket but the last receives a clone; the last one
 * consumes @skb itself.  Returns 1 when the packet was delivered
 * (ownership of @skb transferred), 0 when no socket matched and the
 * caller still owns @skb.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Match the alert value and, if the socket is bound to a
		 * device, the ingress device too.
		 */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				/* Earlier matches get clones; a failed
				 * clone is skipped (best effort).
				 */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
327
/* Decide how to handle a packet whose destination matched a proxy
 * neighbour entry.
 *
 * Returns:
 *    1 - unicast NDISC message for the proxied address; caller should
 *        hand it to the local input path.
 *    0 - not an NDISC message (or unparsable); forward normally.
 *   -1 - destination is link-local, which a proxying router cannot
 *        forward; caller should drop (link failure already signalled).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	/* Skip extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Ensure at least the ICMPv6 type byte is linear before
		 * reading it.
		 */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
378
/* okfn for the FORWARD netfilter hook: continue down the output path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
383
/* Forward an IPv6 packet along the route attached to the skb.  Performs
 * the forwarding policy checks, Router Alert delivery, NDISC proxying,
 * hop limit decrement, ICMPv6 error generation (hop limit exceeded,
 * packet too big), redirect transmission, then hands the packet to the
 * NF_INET_FORWARD hook.
 *
 * Consumes @skb on every path.  Returns the netfilter verdict on
 * success, a negative errno otherwise.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* LRO-merged frames must not be forwarded. */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Only forward frames that were addressed to this host at L2. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		/* NDISC messages for a proxied address take the local
		 * input path instead of being forwarded.
		 */
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm may have re-routed the skb; reload the dst. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	/* IPv6 routers never fragment on behalf of the sender: report
	 * PKT_TOOBIG instead.
	 */
	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
540
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543         to->pkt_type = from->pkt_type;
544         to->priority = from->priority;
545         to->protocol = from->protocol;
546         skb_dst_drop(to);
547         skb_dst_set(to, dst_clone(skb_dst(from)));
548         to->dev = from->dev;
549         to->mark = from->mark;
550
551 #ifdef CONFIG_NET_SCHED
552         to->tc_index = from->tc_index;
553 #endif
554         nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557         to->nf_trace = from->nf_trace;
558 #endif
559         skb_copy_secmark(to, from);
560 }
561
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
563 {
564         u16 offset = sizeof(struct ipv6hdr);
565         struct ipv6_opt_hdr *exthdr =
566                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567         unsigned int packet_len = skb->tail - skb->network_header;
568         int found_rhdr = 0;
569         *nexthdr = &ipv6_hdr(skb)->nexthdr;
570
571         while (offset + 1 <= packet_len) {
572
573                 switch (**nexthdr) {
574
575                 case NEXTHDR_HOP:
576                         break;
577                 case NEXTHDR_ROUTING:
578                         found_rhdr = 1;
579                         break;
580                 case NEXTHDR_DEST:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
583                                 break;
584 #endif
585                         if (found_rhdr)
586                                 return offset;
587                         break;
588                 default :
589                         return offset;
590                 }
591
592                 offset += ipv6_optlen(exthdr);
593                 *nexthdr = &exthdr->nexthdr;
594                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
595                                                  offset);
596         }
597
598         return offset;
599 }
600
/* Choose the 32-bit Identification value for a fragment header, derived
 * from a hash of the route's source and destination so IDs are not
 * trivially predictable across flows.
 */
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static u32 ip6_idents_hashrnd __read_mostly;
	static bool hashrnd_initialized = false;
	u32 hash, id;

	/* NOTE(review): this lazy seeding is not race free -- two CPUs
	 * can both observe 'false' and reseed concurrently.  Looks
	 * harmless for correctness (only the hash basis could change
	 * once, early), but confirm; a get-random-once helper would be
	 * the usual pattern.
	 */
	if (unlikely(!hashrnd_initialized)) {
		hashrnd_initialized = true;
		get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
	}
	hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
	hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);

	/* Reserve one ID from the shared per-bucket sequence. */
	id = ip_idents_reserve(hash, 1);
	fhdr->identification = htonl(id);
}
617
618 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
619 {
620         struct sk_buff *frag;
621         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
622         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
623         struct ipv6hdr *tmp_hdr;
624         struct frag_hdr *fh;
625         unsigned int mtu, hlen, left, len;
626         int hroom, troom;
627         __be32 frag_id = 0;
628         int ptr, offset = 0, err=0;
629         u8 *prevhdr, nexthdr = 0;
630         struct net *net = dev_net(skb_dst(skb)->dev);
631
632         hlen = ip6_find_1stfragopt(skb, &prevhdr);
633         nexthdr = *prevhdr;
634
635         mtu = ip6_skb_dst_mtu(skb);
636
637         /* We must not fragment if the socket is set to force MTU discovery
638          * or if the skb it not generated by a local socket.
639          */
640         if (!skb->local_df && skb->len > mtu) {
641                 skb->dev = skb_dst(skb)->dev;
642                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
643                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
644                               IPSTATS_MIB_FRAGFAILS);
645                 kfree_skb(skb);
646                 return -EMSGSIZE;
647         }
648
649         if (np && np->frag_size < mtu) {
650                 if (np->frag_size)
651                         mtu = np->frag_size;
652         }
653         mtu -= hlen + sizeof(struct frag_hdr);
654
655         if (skb_has_frag_list(skb)) {
656                 int first_len = skb_pagelen(skb);
657                 struct sk_buff *frag2;
658
659                 if (first_len - hlen > mtu ||
660                     ((first_len - hlen) & 7) ||
661                     skb_cloned(skb))
662                         goto slow_path;
663
664                 skb_walk_frags(skb, frag) {
665                         /* Correct geometry. */
666                         if (frag->len > mtu ||
667                             ((frag->len & 7) && frag->next) ||
668                             skb_headroom(frag) < hlen)
669                                 goto slow_path_clean;
670
671                         /* Partially cloned skb? */
672                         if (skb_shared(frag))
673                                 goto slow_path_clean;
674
675                         BUG_ON(frag->sk);
676                         if (skb->sk) {
677                                 frag->sk = skb->sk;
678                                 frag->destructor = sock_wfree;
679                         }
680                         skb->truesize -= frag->truesize;
681                 }
682
683                 err = 0;
684                 offset = 0;
685                 frag = skb_shinfo(skb)->frag_list;
686                 skb_frag_list_init(skb);
687                 /* BUILD HEADER */
688
689                 *prevhdr = NEXTHDR_FRAGMENT;
690                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
691                 if (!tmp_hdr) {
692                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
693                                       IPSTATS_MIB_FRAGFAILS);
694                         return -ENOMEM;
695                 }
696
697                 __skb_pull(skb, hlen);
698                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
699                 __skb_push(skb, hlen);
700                 skb_reset_network_header(skb);
701                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
702
703                 ipv6_select_ident(fh, rt);
704                 fh->nexthdr = nexthdr;
705                 fh->reserved = 0;
706                 fh->frag_off = htons(IP6_MF);
707                 frag_id = fh->identification;
708
709                 first_len = skb_pagelen(skb);
710                 skb->data_len = first_len - skb_headlen(skb);
711                 skb->len = first_len;
712                 ipv6_hdr(skb)->payload_len = htons(first_len -
713                                                    sizeof(struct ipv6hdr));
714
715                 dst_hold(&rt->dst);
716
717                 for (;;) {
718                         /* Prepare header of the next frame,
719                          * before previous one went down. */
720                         if (frag) {
721                                 frag->ip_summed = CHECKSUM_NONE;
722                                 skb_reset_transport_header(frag);
723                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
724                                 __skb_push(frag, hlen);
725                                 skb_reset_network_header(frag);
726                                 memcpy(skb_network_header(frag), tmp_hdr,
727                                        hlen);
728                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
729                                 fh->nexthdr = nexthdr;
730                                 fh->reserved = 0;
731                                 fh->frag_off = htons(offset);
732                                 if (frag->next != NULL)
733                                         fh->frag_off |= htons(IP6_MF);
734                                 fh->identification = frag_id;
735                                 ipv6_hdr(frag)->payload_len =
736                                                 htons(frag->len -
737                                                       sizeof(struct ipv6hdr));
738                                 ip6_copy_metadata(frag, skb);
739                         }
740
741                         err = output(skb);
742                         if(!err)
743                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
744                                               IPSTATS_MIB_FRAGCREATES);
745
746                         if (err || !frag)
747                                 break;
748
749                         skb = frag;
750                         frag = skb->next;
751                         skb->next = NULL;
752                 }
753
754                 kfree(tmp_hdr);
755
756                 if (err == 0) {
757                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
758                                       IPSTATS_MIB_FRAGOKS);
759                         dst_release(&rt->dst);
760                         return 0;
761                 }
762
763                 while (frag) {
764                         skb = frag->next;
765                         kfree_skb(frag);
766                         frag = skb;
767                 }
768
769                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
770                               IPSTATS_MIB_FRAGFAILS);
771                 dst_release(&rt->dst);
772                 return err;
773
774 slow_path_clean:
775                 skb_walk_frags(skb, frag2) {
776                         if (frag2 == frag)
777                                 break;
778                         frag2->sk = NULL;
779                         frag2->destructor = NULL;
780                         skb->truesize += frag2->truesize;
781                 }
782         }
783
784 slow_path:
785         left = skb->len - hlen;         /* Space per frame */
786         ptr = hlen;                     /* Where to start from */
787
788         /*
789          *      Fragment the datagram.
790          */
791
792         *prevhdr = NEXTHDR_FRAGMENT;
793         hroom = LL_RESERVED_SPACE(rt->dst.dev);
794         troom = rt->dst.dev->needed_tailroom;
795
796         /*
797          *      Keep copying data until we run out.
798          */
799         while(left > 0) {
800                 len = left;
801                 /* IF: it doesn't fit, use 'mtu' - the data space left */
802                 if (len > mtu)
803                         len = mtu;
804                 /* IF: we are not sending up to and including the packet end
805                    then align the next start on an eight byte boundary */
806                 if (len < left) {
807                         len &= ~7;
808                 }
809                 /*
810                  *      Allocate buffer.
811                  */
812
813                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
814                                       hroom + troom, GFP_ATOMIC)) == NULL) {
815                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
816                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
817                                       IPSTATS_MIB_FRAGFAILS);
818                         err = -ENOMEM;
819                         goto fail;
820                 }
821
822                 /*
823                  *      Set up data on packet
824                  */
825
826                 ip6_copy_metadata(frag, skb);
827                 skb_reserve(frag, hroom);
828                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
829                 skb_reset_network_header(frag);
830                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
831                 frag->transport_header = (frag->network_header + hlen +
832                                           sizeof(struct frag_hdr));
833
834                 /*
835                  *      Charge the memory for the fragment to any owner
836                  *      it might possess
837                  */
838                 if (skb->sk)
839                         skb_set_owner_w(frag, skb->sk);
840
841                 /*
842                  *      Copy the packet header into the new buffer.
843                  */
844                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
845
846                 /*
847                  *      Build fragment header.
848                  */
849                 fh->nexthdr = nexthdr;
850                 fh->reserved = 0;
851                 if (!frag_id) {
852                         ipv6_select_ident(fh, rt);
853                         frag_id = fh->identification;
854                 } else
855                         fh->identification = frag_id;
856
857                 /*
858                  *      Copy a block of the IP datagram.
859                  */
860                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
861                         BUG();
862                 left -= len;
863
864                 fh->frag_off = htons(offset);
865                 if (left > 0)
866                         fh->frag_off |= htons(IP6_MF);
867                 ipv6_hdr(frag)->payload_len = htons(frag->len -
868                                                     sizeof(struct ipv6hdr));
869
870                 ptr += len;
871                 offset += len;
872
873                 /*
874                  *      Put this fragment into the sending queue.
875                  */
876                 err = output(frag);
877                 if (err)
878                         goto fail;
879
880                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
881                               IPSTATS_MIB_FRAGCREATES);
882         }
883         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
884                       IPSTATS_MIB_FRAGOKS);
885         kfree_skb(skb);
886         return err;
887
888 fail:
889         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890                       IPSTATS_MIB_FRAGFAILS);
891         kfree_skb(skb);
892         return err;
893 }
894
895 static inline int ip6_rt_check(const struct rt6key *rt_key,
896                                const struct in6_addr *fl_addr,
897                                const struct in6_addr *addr_cache)
898 {
899         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
900                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
901 }
902
903 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
904                                           struct dst_entry *dst,
905                                           const struct flowi6 *fl6)
906 {
907         struct ipv6_pinfo *np = inet6_sk(sk);
908         struct rt6_info *rt;
909
910         if (!dst)
911                 goto out;
912
913         if (dst->ops->family != AF_INET6) {
914                 dst_release(dst);
915                 return NULL;
916         }
917
918         rt = (struct rt6_info *)dst;
919         /* Yes, checking route validity in not connected
920          * case is not very simple. Take into account,
921          * that we do not support routing by source, TOS,
922          * and MSG_DONTROUTE            --ANK (980726)
923          *
924          * 1. ip6_rt_check(): If route was host route,
925          *    check that cached destination is current.
926          *    If it is network route, we still may
927          *    check its validity using saved pointer
928          *    to the last used address: daddr_cache.
929          *    We do not want to save whole address now,
930          *    (because main consumer of this service
931          *    is tcp, which has not this problem),
932          *    so that the last trick works only on connected
933          *    sockets.
934          * 2. oif also should be the same.
935          */
936         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
937 #ifdef CONFIG_IPV6_SUBTREES
938             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
939 #endif
940             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
941                 dst_release(dst);
942                 dst = NULL;
943         }
944
945 out:
946         return dst;
947 }
948
/* Core of the dst lookup helpers: resolve the route for @fl6 into *@dst.
 *
 * On entry *@dst may already hold a candidate route (it is then reused);
 * otherwise a fresh lookup is performed.  If the flow has no source
 * address yet, one is derived from the chosen route.  On failure *@dst
 * is released and set to NULL and a negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	/* Flow carries no source address: pick one for this route. */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		/* Neighbour pointer is not used past this point, so the
		 * RCU read side can end before the ifaddr lookup. */
		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		/* Only redirect while the chosen source address is still
		 * undergoing optimistic DAD. */
		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1023
1024 /**
1025  *      ip6_dst_lookup - perform route lookup on flow
1026  *      @sk: socket which provides route info
1027  *      @dst: pointer to dst_entry * for result
1028  *      @fl6: flow to lookup
1029  *
1030  *      This function performs a route lookup on the given flow.
1031  *
1032  *      It returns zero on success, or a standard errno code on error.
1033  */
1034 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1035 {
1036         *dst = NULL;
1037         return ip6_dst_lookup_tail(sk, dst, fl6);
1038 }
1039 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1040
1041 /**
1042  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1043  *      @sk: socket which provides route info
1044  *      @fl6: flow to lookup
1045  *      @final_dst: final destination address for ipsec lookup
1046  *      @can_sleep: we are in a sleepable context
1047  *
1048  *      This function performs a route lookup on the given flow.
1049  *
1050  *      It returns a valid dst pointer on success, or a pointer encoded
1051  *      error code.
1052  */
1053 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1054                                       const struct in6_addr *final_dst,
1055                                       bool can_sleep)
1056 {
1057         struct dst_entry *dst = NULL;
1058         int err;
1059
1060         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1061         if (err)
1062                 return ERR_PTR(err);
1063         if (final_dst)
1064                 ipv6_addr_copy(&fl6->daddr, final_dst);
1065         if (can_sleep)
1066                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1067
1068         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1069 }
1070 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1071
1072 /**
1073  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1074  *      @sk: socket which provides the dst cache and route info
1075  *      @fl6: flow to lookup
1076  *      @final_dst: final destination address for ipsec lookup
1077  *      @can_sleep: we are in a sleepable context
1078  *
1079  *      This function performs a route lookup on the given flow with the
1080  *      possibility of using the cached route in the socket if it is valid.
1081  *      It will take the socket dst lock when operating on the dst cache.
1082  *      As a result, this function can only be used in process context.
1083  *
1084  *      It returns a valid dst pointer on success, or a pointer encoded
1085  *      error code.
1086  */
1087 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1088                                          const struct in6_addr *final_dst,
1089                                          bool can_sleep)
1090 {
1091         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1092         int err;
1093
1094         dst = ip6_sk_dst_check(sk, dst, fl6);
1095
1096         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1097         if (err)
1098                 return ERR_PTR(err);
1099         if (final_dst)
1100                 ipv6_addr_copy(&fl6->daddr, final_dst);
1101         if (can_sleep)
1102                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1103
1104         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1105 }
1106 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1107
/* Queue one large UDP datagram for UFO (UDP fragmentation offload).
 *
 * On the first call for a cork sequence a single skb is allocated,
 * its header layout set up, and it is parked on the socket write
 * queue; this and subsequent payload is appended to that skb's frag
 * list via skb_append_datato_frags().  The device later segments it
 * into on-wire fragments of gso_size bytes each.
 *
 * Returns 0 on success or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		struct frag_hdr fhdr;

		/* NOTE(review): the extra 20 bytes of slack presumably
		 * cover alignment/trailer needs — confirm before changing. */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer; extension headers
		 * (exthdrlen bytes) sit in front of the IPv6 header */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* Device computes the UDP checksum during segmentation. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		/* All fragments of the datagram share one identification. */
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1160
1161 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1162                                                gfp_t gfp)
1163 {
1164         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1165 }
1166
1167 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1168                                                 gfp_t gfp)
1169 {
1170         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1171 }
1172
1173 static void ip6_append_data_mtu(unsigned int *mtu,
1174                                 int *maxfraglen,
1175                                 unsigned int fragheaderlen,
1176                                 struct sk_buff *skb,
1177                                 struct rt6_info *rt,
1178                                 unsigned int orig_mtu)
1179 {
1180         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1181                 if (skb == NULL) {
1182                         /* first fragment, reserve header_len */
1183                         *mtu = orig_mtu - rt->dst.header_len;
1184
1185                 } else {
1186                         /*
1187                          * this fragment is not first, the headers
1188                          * space is regarded as data space.
1189                          */
1190                         *mtu = orig_mtu;
1191                 }
1192                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1193                               + fragheaderlen - sizeof(struct frag_hdr);
1194         }
1195 }
1196
/* Append @length bytes (fetched via @getfrag from @from) to the socket's
 * pending send queue, building correctly sized IPv6 packets/fragments.
 *
 * On the first call of a cork sequence (write queue empty) the cork
 * state is set up: extension-header options are duplicated, the route
 * is held, and the effective mtu/fragment geometry is computed.  Later
 * calls reuse that state and ignore the corresponding parameters.
 * ip6_push_pending_frames() later finalizes and transmits the queue.
 *
 * Returns 0 on success or a negative errno; on error the bytes not
 * queued are subtracted back from the corked length.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			/* Duplicate the caller's options; they must stay
			 * valid for the whole cork sequence. */
			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Pick the mtu: the device mtu when probing PMTU,
		 * otherwise the path mtu (inner path for xfrm tunnels). */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		/* A user-requested frag_size may shrink it further. */
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		/* Fragmentable extension headers count as transport data
		 * on the first packet only. */
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Subsequent call: reuse the corked state and ignore the
		 * per-call route/options/header parameters. */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header cost: IPv6 header plus non-fragmentable
	 * extension headers. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	/* IPV6_DONTFRAG: report the path mtu instead of fragmenting. */
	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
					   sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	/* Large UDP datagram on a UFO-capable device: hand the whole
	 * thing to the offload path instead of fragmenting here. */
	if (((length > mtu) ||
	     (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First packet of the message: may block. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* Later fragments: bounded non-blocking
				 * allocation against the send buffer. */
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhang from the previous skb
				 * into this one, keeping its checksum right. */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Headers are consumed by the first packet only. */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append to page fragments, reusing
			 * the socket's current send page when possible. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1562
1563 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1564 {
1565         if (np->cork.opt) {
1566                 kfree(np->cork.opt->dst0opt);
1567                 kfree(np->cork.opt->dst1opt);
1568                 kfree(np->cork.opt->hopopt);
1569                 kfree(np->cork.opt->srcrt);
1570                 kfree(np->cork.opt);
1571                 np->cork.opt = NULL;
1572         }
1573
1574         if (inet->cork.base.dst) {
1575                 dst_release(inet->cork.base.dst);
1576                 inet->cork.base.dst = NULL;
1577                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1578         }
1579         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1580 }
1581
/*
 * ip6_push_pending_frames - build and transmit the corked IPv6 datagram
 *
 * Coalesces every skb queued on sk->sk_write_queue by ip6_append_data()
 * into a single skb (the first skb keeps the header room, the rest are
 * chained onto its frag_list), prepends the extension headers and the
 * IPv6 header from the cork state, and hands the result to
 * ip6_local_out().  The cork state is released on both the success and
 * the error path.
 *
 * Returns 0 on success (or when the write queue was empty), otherwise a
 * negative errno from the output path.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	/* Empty queue: nothing to send, just release the cork state. */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain every remaining queued skb onto the first skb's frag_list,
	 * stripping their duplicate network-header area and folding their
	 * byte counts into the head skb. */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		/* Ownership moved to the head skb; drop the per-fragment
		 * socket accounting so kfree_skb of the chain is safe. */
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* Save the flow's daddr: pushing non-fragmentable options below
	 * takes &final_dst and — presumably for a routing header — may
	 * rewrite it to the first hop; the header gets the final value. */
	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	/* Extension headers go on innermost-first; each push updates
	 * 'proto' to chain the next-header fields. */
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* First 32 bits: version 6, traffic class from the cork, flow label
	 * from the flow key.  payload_len is not set here —
	 * NOTE(review): presumably filled in by the local output path;
	 * confirm against __ip6_local_out. */
	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	/* ICMPv6 keeps additional per-message-type output counters. */
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive values are qdisc congestion codes; map them to
		 * 0 or a negative errno before reporting. */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1665
1666 void ip6_flush_pending_frames(struct sock *sk)
1667 {
1668         struct sk_buff *skb;
1669
1670         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1671                 if (skb_dst(skb))
1672                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1673                                       IPSTATS_MIB_OUTDISCARDS);
1674                 kfree_skb(skb);
1675         }
1676
1677         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1678 }