ipv6: fix out of bound writes in __ip6_append_data()
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/* Finalize the IPv6 header and transmit unless a netfilter hook stole
 * the packet.  A return of 1 from __ip6_local_out() means "continue". */
int ip6_local_out(struct sk_buff *skb)
{
	int rc = __ip6_local_out(skb);

	if (likely(rc == 1))
		rc = dst_output(skb);

	return rc;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
/* Second-stage output: resolve the neighbour entry for the route and
 * transmit.  Multicast packets may additionally be looped back to the
 * local stack and are accounted in the OUTMCAST counters here. */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the sending socket
		 * wants multicast loopback and either a multicast-routing
		 * socket is interested (and the packet was not already
		 * forwarded) or this host is a member of the group. */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* Hop limit 0: deliver only the looped-back copy,
			 * never put the original on the wire. */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* Neighbour pointer is RCU-protected; hold the read lock across
	 * the lookup and the transmit. */
	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	/* No neighbour entry: account as "no route" and drop. */
	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
/*
 * Build the IPv6 header (and any extension headers from @opt) on an
 * already-filled skb and transmit it.  Used by connection-oriented
 * protocols (TCP, SCTP, DCCP).  The flow @fl6 supplies the addresses,
 * protocol and flow label.  Consumes the skb on every path; returns the
 * netfilter verdict on success, -ENOBUFS or -EMSGSIZE on failure.
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		/* Not enough headroom for the extension headers: move the
		 * data into a freshly allocated skb and retain ownership. */
		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		/* Push fragmentable then non-fragmentable extension headers;
		 * the latter may rewrite first_hop (routing header). */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* Version 6, traffic class and flow label in one 32-bit word. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Packet exceeds the MTU and may not be fragmented here: notify
	 * our own stack with ICMPV6_PKT_TOOBIG and drop. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are send through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         ipv6_addr_copy(&hdr->saddr, saddr);
294         ipv6_addr_copy(&hdr->daddr, daddr);
295
296         return 0;
297 }
298
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
300 {
301         struct ip6_ra_chain *ra;
302         struct sock *last = NULL;
303
304         read_lock(&ip6_ra_lock);
305         for (ra = ip6_ra_chain; ra; ra = ra->next) {
306                 struct sock *sk = ra->sk;
307                 if (sk && ra->sel == sel &&
308                     (!sk->sk_bound_dev_if ||
309                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
310                         if (last) {
311                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
312                                 if (skb2)
313                                         rawv6_rcv(last, skb2);
314                         }
315                         last = sk;
316                 }
317         }
318
319         if (last) {
320                 rawv6_rcv(last, skb);
321                 read_unlock(&ip6_ra_lock);
322                 return 1;
323         }
324         read_unlock(&ip6_ra_lock);
325         return 0;
326 }
327
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
329 {
330         struct ipv6hdr *hdr = ipv6_hdr(skb);
331         u8 nexthdr = hdr->nexthdr;
332         int offset;
333
334         if (ipv6_ext_hdr(nexthdr)) {
335                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
336                 if (offset < 0)
337                         return 0;
338         } else
339                 offset = sizeof(struct ipv6hdr);
340
341         if (nexthdr == IPPROTO_ICMPV6) {
342                 struct icmp6hdr *icmp6;
343
344                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345                                          offset + 1 - skb->data)))
346                         return 0;
347
348                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350                 switch (icmp6->icmp6_type) {
351                 case NDISC_ROUTER_SOLICITATION:
352                 case NDISC_ROUTER_ADVERTISEMENT:
353                 case NDISC_NEIGHBOUR_SOLICITATION:
354                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355                 case NDISC_REDIRECT:
356                         /* For reaction involving unicast neighbor discovery
357                          * message destined to the proxied address, pass it to
358                          * input function.
359                          */
360                         return 1;
361                 default:
362                         break;
363                 }
364         }
365
366         /*
367          * The proxying router can't forward traffic sent to a link-local
368          * address, so signal the sender and discard the packet. This
369          * behavior is clarified by the MIPv6 specification.
370          */
371         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372                 dst_link_failure(skb);
373                 return -1;
374         }
375
376         return 0;
377 }
378
/* Final forwarding step after the NF_INET_FORWARD hook: hand the packet
 * to the route's output function. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
383
/*
 * Forward a packet received on one interface out another.  Performs the
 * router duties visible below: forwarding/policy checks, Router Alert
 * delivery, hop-limit handling, NDP proxying, redirect generation,
 * source-address sanity checks and MTU enforcement, then hands the
 * packet to the NF_INET_FORWARD hook.  Consumes the skb on all paths.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have replaced the route. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	/* Routers must not fragment: report Packet Too Big instead. */
	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow() may have relocated the header. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
540
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543         to->pkt_type = from->pkt_type;
544         to->priority = from->priority;
545         to->protocol = from->protocol;
546         skb_dst_drop(to);
547         skb_dst_set(to, dst_clone(skb_dst(from)));
548         to->dev = from->dev;
549         to->mark = from->mark;
550
551 #ifdef CONFIG_NET_SCHED
552         to->tc_index = from->tc_index;
553 #endif
554         nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557         to->nf_trace = from->nf_trace;
558 #endif
559         skb_copy_secmark(to, from);
560 }
561
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
563 {
564         u16 offset = sizeof(struct ipv6hdr);
565         unsigned int packet_len = skb->tail - skb->network_header;
566         int found_rhdr = 0;
567         *nexthdr = &ipv6_hdr(skb)->nexthdr;
568
569         while (offset <= packet_len) {
570                 struct ipv6_opt_hdr *exthdr;
571
572                 switch (**nexthdr) {
573
574                 case NEXTHDR_HOP:
575                         break;
576                 case NEXTHDR_ROUTING:
577                         found_rhdr = 1;
578                         break;
579                 case NEXTHDR_DEST:
580 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
581                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
582                                 break;
583 #endif
584                         if (found_rhdr)
585                                 return offset;
586                         break;
587                 default :
588                         return offset;
589                 }
590
591                 if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
592                         return -EINVAL;
593
594                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
595                                                  offset);
596                 offset += ipv6_optlen(exthdr);
597                 *nexthdr = &exthdr->nexthdr;
598         }
599
600         return -EINVAL;
601 }
602
/*
 * Choose the Identification value for an IPv6 fragment header by hashing
 * the route's destination and source addresses with a lazily generated
 * random secret and reserving an id from the shared ip_idents pool.
 */
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static u32 ip6_idents_hashrnd __read_mostly;
	static bool hashrnd_initialized = false;
	u32 hash, id;

	/* NOTE(review): this one-shot init is not race-free — two CPUs can
	 * both observe !hashrnd_initialized and re-seed the secret; upstream
	 * uses net_get_random_once() here.  Confirm this is acceptable. */
	if (unlikely(!hashrnd_initialized)) {
		hashrnd_initialized = true;
		get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
	}
	hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
	hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);

	id = ip_idents_reserve(hash, 1);
	fhdr->identification = htonl(id);
}
619
620 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
621 {
622         struct sk_buff *frag;
623         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
624         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
625         struct ipv6hdr *tmp_hdr;
626         struct frag_hdr *fh;
627         unsigned int mtu, hlen, left, len;
628         int hroom, troom;
629         __be32 frag_id = 0;
630         int ptr, offset = 0, err=0;
631         u8 *prevhdr, nexthdr = 0;
632         struct net *net = dev_net(skb_dst(skb)->dev);
633
634         err = ip6_find_1stfragopt(skb, &prevhdr);
635         if (err < 0)
636                 goto fail;
637         hlen = err;
638         nexthdr = *prevhdr;
639
640         mtu = ip6_skb_dst_mtu(skb);
641
642         /* We must not fragment if the socket is set to force MTU discovery
643          * or if the skb it not generated by a local socket.
644          */
645         if (!skb->local_df && skb->len > mtu) {
646                 skb->dev = skb_dst(skb)->dev;
647                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
648                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
649                               IPSTATS_MIB_FRAGFAILS);
650                 kfree_skb(skb);
651                 return -EMSGSIZE;
652         }
653
654         if (np && np->frag_size < mtu) {
655                 if (np->frag_size)
656                         mtu = np->frag_size;
657         }
658         mtu -= hlen + sizeof(struct frag_hdr);
659
660         if (skb_has_frag_list(skb)) {
661                 int first_len = skb_pagelen(skb);
662                 struct sk_buff *frag2;
663
664                 if (first_len - hlen > mtu ||
665                     ((first_len - hlen) & 7) ||
666                     skb_cloned(skb))
667                         goto slow_path;
668
669                 skb_walk_frags(skb, frag) {
670                         /* Correct geometry. */
671                         if (frag->len > mtu ||
672                             ((frag->len & 7) && frag->next) ||
673                             skb_headroom(frag) < hlen)
674                                 goto slow_path_clean;
675
676                         /* Partially cloned skb? */
677                         if (skb_shared(frag))
678                                 goto slow_path_clean;
679
680                         BUG_ON(frag->sk);
681                         if (skb->sk) {
682                                 frag->sk = skb->sk;
683                                 frag->destructor = sock_wfree;
684                         }
685                         skb->truesize -= frag->truesize;
686                 }
687
688                 err = 0;
689                 offset = 0;
690                 frag = skb_shinfo(skb)->frag_list;
691                 skb_frag_list_init(skb);
692                 /* BUILD HEADER */
693
694                 *prevhdr = NEXTHDR_FRAGMENT;
695                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
696                 if (!tmp_hdr) {
697                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
698                                       IPSTATS_MIB_FRAGFAILS);
699                         return -ENOMEM;
700                 }
701
702                 __skb_pull(skb, hlen);
703                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
704                 __skb_push(skb, hlen);
705                 skb_reset_network_header(skb);
706                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
707
708                 ipv6_select_ident(fh, rt);
709                 fh->nexthdr = nexthdr;
710                 fh->reserved = 0;
711                 fh->frag_off = htons(IP6_MF);
712                 frag_id = fh->identification;
713
714                 first_len = skb_pagelen(skb);
715                 skb->data_len = first_len - skb_headlen(skb);
716                 skb->len = first_len;
717                 ipv6_hdr(skb)->payload_len = htons(first_len -
718                                                    sizeof(struct ipv6hdr));
719
720                 dst_hold(&rt->dst);
721
722                 for (;;) {
723                         /* Prepare header of the next frame,
724                          * before previous one went down. */
725                         if (frag) {
726                                 frag->ip_summed = CHECKSUM_NONE;
727                                 skb_reset_transport_header(frag);
728                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
729                                 __skb_push(frag, hlen);
730                                 skb_reset_network_header(frag);
731                                 memcpy(skb_network_header(frag), tmp_hdr,
732                                        hlen);
733                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
734                                 fh->nexthdr = nexthdr;
735                                 fh->reserved = 0;
736                                 fh->frag_off = htons(offset);
737                                 if (frag->next != NULL)
738                                         fh->frag_off |= htons(IP6_MF);
739                                 fh->identification = frag_id;
740                                 ipv6_hdr(frag)->payload_len =
741                                                 htons(frag->len -
742                                                       sizeof(struct ipv6hdr));
743                                 ip6_copy_metadata(frag, skb);
744                         }
745
746                         err = output(skb);
747                         if(!err)
748                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749                                               IPSTATS_MIB_FRAGCREATES);
750
751                         if (err || !frag)
752                                 break;
753
754                         skb = frag;
755                         frag = skb->next;
756                         skb->next = NULL;
757                 }
758
759                 kfree(tmp_hdr);
760
761                 if (err == 0) {
762                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
763                                       IPSTATS_MIB_FRAGOKS);
764                         dst_release(&rt->dst);
765                         return 0;
766                 }
767
768                 while (frag) {
769                         skb = frag->next;
770                         kfree_skb(frag);
771                         frag = skb;
772                 }
773
774                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
775                               IPSTATS_MIB_FRAGFAILS);
776                 dst_release(&rt->dst);
777                 return err;
778
779 slow_path_clean:
780                 skb_walk_frags(skb, frag2) {
781                         if (frag2 == frag)
782                                 break;
783                         frag2->sk = NULL;
784                         frag2->destructor = NULL;
785                         skb->truesize += frag2->truesize;
786                 }
787         }
788
789 slow_path:
790         left = skb->len - hlen;         /* Space per frame */
791         ptr = hlen;                     /* Where to start from */
792
793         /*
794          *      Fragment the datagram.
795          */
796
797         *prevhdr = NEXTHDR_FRAGMENT;
798         hroom = LL_RESERVED_SPACE(rt->dst.dev);
799         troom = rt->dst.dev->needed_tailroom;
800
801         /*
802          *      Keep copying data until we run out.
803          */
804         while(left > 0) {
805                 len = left;
806                 /* IF: it doesn't fit, use 'mtu' - the data space left */
807                 if (len > mtu)
808                         len = mtu;
809                 /* IF: we are not sending up to and including the packet end
810                    then align the next start on an eight byte boundary */
811                 if (len < left) {
812                         len &= ~7;
813                 }
814                 /*
815                  *      Allocate buffer.
816                  */
817
818                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
819                                       hroom + troom, GFP_ATOMIC)) == NULL) {
820                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
821                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
822                                       IPSTATS_MIB_FRAGFAILS);
823                         err = -ENOMEM;
824                         goto fail;
825                 }
826
827                 /*
828                  *      Set up data on packet
829                  */
830
831                 ip6_copy_metadata(frag, skb);
832                 skb_reserve(frag, hroom);
833                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
834                 skb_reset_network_header(frag);
835                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
836                 frag->transport_header = (frag->network_header + hlen +
837                                           sizeof(struct frag_hdr));
838
839                 /*
840                  *      Charge the memory for the fragment to any owner
841                  *      it might possess
842                  */
843                 if (skb->sk)
844                         skb_set_owner_w(frag, skb->sk);
845
846                 /*
847                  *      Copy the packet header into the new buffer.
848                  */
849                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
850
851                 /*
852                  *      Build fragment header.
853                  */
854                 fh->nexthdr = nexthdr;
855                 fh->reserved = 0;
856                 if (!frag_id) {
857                         ipv6_select_ident(fh, rt);
858                         frag_id = fh->identification;
859                 } else
860                         fh->identification = frag_id;
861
862                 /*
863                  *      Copy a block of the IP datagram.
864                  */
865                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
866                         BUG();
867                 left -= len;
868
869                 fh->frag_off = htons(offset);
870                 if (left > 0)
871                         fh->frag_off |= htons(IP6_MF);
872                 ipv6_hdr(frag)->payload_len = htons(frag->len -
873                                                     sizeof(struct ipv6hdr));
874
875                 ptr += len;
876                 offset += len;
877
878                 /*
879                  *      Put this fragment into the sending queue.
880                  */
881                 err = output(frag);
882                 if (err)
883                         goto fail;
884
885                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886                               IPSTATS_MIB_FRAGCREATES);
887         }
888         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
889                       IPSTATS_MIB_FRAGOKS);
890         kfree_skb(skb);
891         return err;
892
893 fail:
894         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
895                       IPSTATS_MIB_FRAGFAILS);
896         kfree_skb(skb);
897         return err;
898 }
899
900 static inline int ip6_rt_check(const struct rt6key *rt_key,
901                                const struct in6_addr *fl_addr,
902                                const struct in6_addr *addr_cache)
903 {
904         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
905                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
906 }
907
/* Validate the socket's cached dst entry @dst against the flow @fl6.
 *
 * Returns @dst when it can still be used for this flow.  Otherwise the
 * reference is dropped and NULL is returned so that the caller performs
 * a fresh route lookup.  A NULL @dst is passed through unchanged.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* The cached entry is not an IPv6 route, so it cannot be checked
	 * with the IPv6-specific tests below; drop it. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE            --ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
953
/* Core of the flow route lookup.
 *
 * Resolves *@dst for @fl6 (doing the routing lookup when the caller
 * supplied none), selects a source address when the flow does not
 * specify one, and — with CONFIG_IPV6_OPTIMISTIC_DAD — may replace the
 * found dst with the default router's dst while DAD on the chosen
 * source address is still optimistic.
 *
 * Returns 0 on success.  On failure the dst reference is dropped,
 * *@dst is reset to NULL and a negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	/* ip6_route_output() never returns NULL; errors are carried in
	 * the dst itself. */
	if ((err = (*dst)->error))
		goto out_err_release;

	/* No source address given in the flow: derive one from the route. */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		/* Done with the neighbour pointer; drop RCU before the
		 * address lookup below. */
		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1028
1029 /**
1030  *      ip6_dst_lookup - perform route lookup on flow
1031  *      @sk: socket which provides route info
1032  *      @dst: pointer to dst_entry * for result
1033  *      @fl6: flow to lookup
1034  *
1035  *      This function performs a route lookup on the given flow.
1036  *
1037  *      It returns zero on success, or a standard errno code on error.
1038  */
1039 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1040 {
1041         *dst = NULL;
1042         return ip6_dst_lookup_tail(sk, dst, fl6);
1043 }
1044 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1045
1046 /**
1047  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1048  *      @sk: socket which provides route info
1049  *      @fl6: flow to lookup
1050  *      @final_dst: final destination address for ipsec lookup
1051  *      @can_sleep: we are in a sleepable context
1052  *
1053  *      This function performs a route lookup on the given flow.
1054  *
1055  *      It returns a valid dst pointer on success, or a pointer encoded
1056  *      error code.
1057  */
1058 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1059                                       const struct in6_addr *final_dst,
1060                                       bool can_sleep)
1061 {
1062         struct dst_entry *dst = NULL;
1063         int err;
1064
1065         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1066         if (err)
1067                 return ERR_PTR(err);
1068         if (final_dst)
1069                 ipv6_addr_copy(&fl6->daddr, final_dst);
1070         if (can_sleep)
1071                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1072
1073         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1074 }
1075 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1076
1077 /**
1078  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1079  *      @sk: socket which provides the dst cache and route info
1080  *      @fl6: flow to lookup
1081  *      @final_dst: final destination address for ipsec lookup
1082  *      @can_sleep: we are in a sleepable context
1083  *
1084  *      This function performs a route lookup on the given flow with the
1085  *      possibility of using the cached route in the socket if it is valid.
1086  *      It will take the socket dst lock when operating on the dst cache.
1087  *      As a result, this function can only be used in process context.
1088  *
1089  *      It returns a valid dst pointer on success, or a pointer encoded
1090  *      error code.
1091  */
1092 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1093                                          const struct in6_addr *final_dst,
1094                                          bool can_sleep)
1095 {
1096         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1097         int err;
1098
1099         dst = ip6_sk_dst_check(sk, dst, fl6);
1100
1101         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1102         if (err)
1103                 return ERR_PTR(err);
1104         if (final_dst)
1105                 ipv6_addr_copy(&fl6->daddr, final_dst);
1106         if (can_sleep)
1107                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1108
1109         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1110 }
1111 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1112
/* UFO (UDP fragmentation offload) fast path for ip6_append_data().
 *
 * Instead of building per-fragment skbs in software, keep one large skb
 * on the write queue and let the device segment it: the first call
 * allocates the skb, sets up header space and the gso parameters, and
 * queues it; this and all later calls append the payload to the skb's
 * frag list via skb_append_datato_frags().
 *
 * Returns 0 on success or a negative errno from allocation/copy.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		struct frag_hdr fhdr;

		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* device fills in the UDP checksum during segmentation */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		/* all fragments of the datagram share one identification */
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1165
1166 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1167                                                gfp_t gfp)
1168 {
1169         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1170 }
1171
1172 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1173                                                 gfp_t gfp)
1174 {
1175         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1176 }
1177
1178 static void ip6_append_data_mtu(unsigned int *mtu,
1179                                 int *maxfraglen,
1180                                 unsigned int fragheaderlen,
1181                                 struct sk_buff *skb,
1182                                 struct rt6_info *rt,
1183                                 unsigned int orig_mtu)
1184 {
1185         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1186                 if (skb == NULL) {
1187                         /* first fragment, reserve header_len */
1188                         *mtu = orig_mtu - rt->dst.header_len;
1189
1190                 } else {
1191                         /*
1192                          * this fragment is not first, the headers
1193                          * space is regarded as data space.
1194                          */
1195                         *mtu = orig_mtu;
1196                 }
1197                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1198                               + fragheaderlen - sizeof(struct frag_hdr);
1199         }
1200 }
1201
/* Append @length bytes (pulled via @getfrag from @from) to the socket's
 * pending (corked) output queue, building fragment-sized skbs as needed.
 *
 * The first call on an empty write queue sets up the cork state (copied
 * txoptions, held route, flow, mtu); later calls reuse that state and
 * ignore the per-call opt/rt/fl6 arguments.  @transhdrlen is non-zero
 * only on the first call, where it accounts for the transport header.
 * Returns 0 on success or a negative errno; on error the bytes that
 * could not be queued are removed from cork->length.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;		/* bytes of @from already consumed */
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* deep-copy each extension header so the cork
			 * outlives the caller's opt */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* mtu: device mtu when probing PMTU; for XFRM tunnels use
		 * the tunnel dst mtu, otherwise the path dst mtu */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		/* a user-set frag_size may only shrink the mtu */
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		/* opt_flen bytes of extension headers are counted as
		 * payload on the first fragment */
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* subsequent call: everything comes from the cork state */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* per-fragment header bytes: IPv6 header + non-fragmentable
	 * extension headers */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	/* IPV6_DONTFRAG: report PMTU to the application instead of
	 * fragmenting */
	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
					   sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	/* UDP fragmentation offload fast path: hand the whole datagram
	 * to the device in one skb */
	if (((length > mtu) ||
	     (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				/* bytes beyond the 8-byte fragment boundary
				 * that must move to the new skb */
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			/* cap datalen, accounting for trailer space so the
			 * allocation below cannot be overrun */
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				/* first fragment: may block for memory */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* move the unaligned tail of the previous skb
				 * into this one and fix both checksums */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* no scatter/gather: copy into the linear area */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* scatter/gather: append to the socket's current
			 * send page, allocating a fresh one when full */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* un-account the bytes we failed to queue */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1568
1569 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1570 {
1571         if (np->cork.opt) {
1572                 kfree(np->cork.opt->dst0opt);
1573                 kfree(np->cork.opt->dst1opt);
1574                 kfree(np->cork.opt->hopopt);
1575                 kfree(np->cork.opt->srcrt);
1576                 kfree(np->cork.opt);
1577                 np->cork.opt = NULL;
1578         }
1579
1580         if (inet->cork.base.dst) {
1581                 dst_release(inet->cork.base.dst);
1582                 inet->cork.base.dst = NULL;
1583                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1584         }
1585         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1586 }
1587
/*
 * ip6_push_pending_frames - finish and transmit the data queued on
 * sk->sk_write_queue (by a prior append), then release the cork state.
 *
 * The first queued skb becomes the head packet; all remaining queued
 * skbs are moved onto its frag_list.  Extension headers and the IPv6
 * header are then pushed in front of the payload and the packet is
 * handed to ip6_local_out().
 *
 * Returns 0 on success or a negative errno.  The cork state is released
 * on every path, including errors.
 */
1588 int ip6_push_pending_frames(struct sock *sk)
1589 {
1590         struct sk_buff *skb, *tmp_skb;
1591         struct sk_buff **tail_skb;
1592         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1593         struct inet_sock *inet = inet_sk(sk);
1594         struct ipv6_pinfo *np = inet6_sk(sk);
1595         struct net *net = sock_net(sk);
1596         struct ipv6hdr *hdr;
1597         struct ipv6_txoptions *opt = np->cork.opt;
1598         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1599         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1600         unsigned char proto = fl6->flowi6_proto;
1601         int err = 0;
1602
             /* Nothing queued: just release the cork and return 0. */
1603         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1604                 goto out;
             /* tail_skb walks the head skb's frag_list as we chain on. */
1605         tail_skb = &(skb_shinfo(skb)->frag_list);
1606
1607         /* move skb->data to ip header from ext header */
1608         if (skb->data < skb_network_header(skb))
1609                 __skb_pull(skb, skb_network_offset(skb));
             /*
              * Coalesce every remaining queued skb onto the head skb's
              * frag_list, accounting their length/truesize into the head
              * and dropping their socket ownership (the head now owns
              * the whole packet).
              */
1610         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1611                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1612                 *tail_skb = tmp_skb;
1613                 tail_skb = &(tmp_skb->next);
1614                 skb->len += tmp_skb->len;
1615                 skb->data_len += tmp_skb->len;
1616                 skb->truesize += tmp_skb->truesize;
1617                 tmp_skb->destructor = NULL;
1618                 tmp_skb->sk = NULL;
1619         }
1620
1621         /* Allow local fragmentation. */
1622         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1623                 skb->local_df = 1;
1624
             /*
              * Snapshot the final destination; pushing a routing header
              * below (ipv6_push_nfrag_opts) may rewrite final_dst to an
              * intermediate hop while fl6->daddr stays the true endpoint.
              */
1625         ipv6_addr_copy(final_dst, &fl6->daddr);
1626         __skb_pull(skb, skb_network_header_len(skb));
1627         if (opt && opt->opt_flen)
1628                 ipv6_push_frag_opts(skb, opt, &proto);
1629         if (opt && opt->opt_nflen)
1630                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1631
             /* Prepend the fixed IPv6 header in front of the ext headers. */
1632         skb_push(skb, sizeof(struct ipv6hdr));
1633         skb_reset_network_header(skb);
1634         hdr = ipv6_hdr(skb);
1635
             /* First 32-bit word: version 6, traffic class, flow label. */
1636         *(__be32*)hdr = fl6->flowlabel |
1637                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1638
1639         hdr->hop_limit = np->cork.hop_limit;
1640         hdr->nexthdr = proto;
1641         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1642         ipv6_addr_copy(&hdr->daddr, final_dst);
1643
1644         skb->priority = sk->sk_priority;
1645         skb->mark = sk->sk_mark;
1646
             /* The skb takes its own reference on the cached route. */
1647         skb_dst_set(skb, dst_clone(&rt->dst));
1648         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1649         if (proto == IPPROTO_ICMPV6) {
1650                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1651
1652                 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1653                 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1654         }
1655
1656         err = ip6_local_out(skb);
1657         if (err) {
                     /* Positive returns are congestion codes; map to errno. */
1658                 if (err > 0)
1659                         err = net_xmit_errno(err);
1660                 if (err)
1661                         goto error;
1662         }
1663
1664 out:
1665         ip6_cork_release(inet, np);
1666         return err;
1667 error:
1668         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1669         goto out;
1670 }
1671
1672 void ip6_flush_pending_frames(struct sock *sk)
1673 {
1674         struct sk_buff *skb;
1675
1676         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1677                 if (skb_dst(skb))
1678                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1679                                       IPSTATS_MIB_OUTDISCARDS);
1680                 kfree_skb(skb);
1681         }
1682
1683         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1684 }