46fc6a381b681251fda91ce27eccae4d0b78d022
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
/*
 * __ip6_local_out - finalize payload_len and run the LOCAL_OUT netfilter hook
 *
 * Computes the IPv6 payload length from skb->len and stores it in the
 * header, then hands the packet to the NF_INET_LOCAL_OUT hook with
 * dst_output as the continuation.  Returns the netfilter verdict
 * (1 means "passed, caller must transmit").
 */
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
           /* Oversized payload: store 0 — presumably the jumbogram case
            * (RFC 2675), where the real length lives in a hop-by-hop
            * option rather than payload_len. */
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 * ip6_local_out - emit a locally generated IPv6 packet
 *
 * Runs __ip6_local_out(); a verdict of 1 means the netfilter chain
 * accepted the packet, in which case it is passed on to dst_output().
 * Any other value (error, or packet stolen/queued by netfilter) is
 * returned unchanged.
 */
74 int ip6_local_out(struct sk_buff *skb)
75 {
76         int err;
77
78         err = __ip6_local_out(skb);
79         if (likely(err == 1))
80                 err = dst_output(skb);
81
82         return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
/*
 * ip6_finish_output2 - final transmit step: multicast loopback handling
 * and neighbour resolution.
 *
 * For multicast destinations, loops a copy of the packet back to local
 * listeners when required and updates OutMcast statistics.  Then looks
 * up the destination's neighbour entry and transmits via neigh_output();
 * if no neighbour is attached to the dst the packet is dropped and
 * OutNoRoutes is bumped.
 */
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101         struct dst_entry *dst = skb_dst(skb);
102         struct net_device *dev = dst->dev;
103         struct neighbour *neigh;
104
105         skb->protocol = htons(ETH_P_IPV6);
106         skb->dev = dev;
107
108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
                    /* Loop a copy back locally when the socket wants
                     * multicast loopback and either a multicast-router
                     * socket exists (and the packet was not already
                     * forwarded) or there is a local listener for this
                     * group on the device. */
111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112                     ((mroute6_socket(dev_net(dev), skb) &&
113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115                                          &ipv6_hdr(skb)->saddr))) {
116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118                         /* Do not check for IFF_ALLMULTI; multicast routing
119                            is not supported in any case.
120                          */
121                         if (newskb)
122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123                                         newskb, NULL, newskb->dev,
124                                         ip6_dev_loopback_xmit);
125
                            /* hop_limit 0: deliver only the local copy,
                             * never put such a packet on the wire. */
126                         if (ipv6_hdr(skb)->hop_limit == 0) {
127                                 IP6_INC_STATS(dev_net(dev), idev,
128                                               IPSTATS_MIB_OUTDISCARDS);
129                                 kfree_skb(skb);
130                                 return 0;
131                         }
132                 }
133
134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135                                 skb->len);
136         }
137
            /* Neighbour pointer is RCU-protected; hold the read lock
             * across the transmit. */
138         rcu_read_lock();
139         neigh = dst_get_neighbour(dst);
140         if (neigh) {
141                 int res = neigh_output(neigh, skb);
142
143                 rcu_read_unlock();
144                 return res;
145         }
146         rcu_read_unlock();
147         IP6_INC_STATS(dev_net(dst->dev),
148                       ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149         kfree_skb(skb);
150         return -EINVAL;
151 }
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
/*
 * ip6_output - dst_output handler for IPv6.
 *
 * Drops the packet if IPv6 is administratively disabled on the egress
 * device, otherwise runs the NF_INET_POST_ROUTING hook (skipped when
 * the packet was already rerouted, IP6SKB_REROUTED) and continues in
 * ip6_finish_output().
 */
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
/*
 * ip6_xmit - build the IPv6 header and transmit a stream-protocol skb.
 * @sk:     originating socket (priority/mark and hop limit come from it)
 * @skb:    payload; on success ownership passes to the output path
 * @fl6:    flow describing saddr/daddr, protocol and flow label
 * @opt:    optional extension headers to prepend (may be NULL)
 * @tclass: traffic class for the header
 *
 * Prepends extension headers (reallocating headroom if necessary) and
 * the IPv6 header, then runs the LOCAL_OUT hook.  Packets larger than
 * the path MTU that may not be fragmented locally get an ICMPV6
 * PKT_TOOBIG sent back to this host and -EMSGSIZE.  The skb is consumed
 * on every path, including errors.
 */
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183              struct ipv6_txoptions *opt, int tclass)
184 {
185         struct net *net = sock_net(sk);
186         struct ipv6_pinfo *np = inet6_sk(sk);
187         struct in6_addr *first_hop = &fl6->daddr;
188         struct dst_entry *dst = skb_dst(skb);
189         struct ipv6hdr *hdr;
190         u8  proto = fl6->flowi6_proto;
191         int seg_len = skb->len;
192         int hlimit = -1;
193         u32 mtu;
194
195         if (opt) {
196                 unsigned int head_room;
197
198                 /* First: exthdrs may take lots of space (~8K for now)
199                    MAX_HEADER is not enough.
200                  */
201                 head_room = opt->opt_nflen + opt->opt_flen;
202                 seg_len += head_room;
203                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204
                    /* Not enough headroom for the extension headers:
                     * reallocate; the new skb must take over socket
                     * ownership for proper accounting. */
205                 if (skb_headroom(skb) < head_room) {
206                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207                         if (skb2 == NULL) {
208                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209                                               IPSTATS_MIB_OUTDISCARDS);
210                                 kfree_skb(skb);
211                                 return -ENOBUFS;
212                         }
213                         kfree_skb(skb);
214                         skb = skb2;
215                         skb_set_owner_w(skb, sk);
216                 }
217                 if (opt->opt_flen)
218                         ipv6_push_frag_opts(skb, opt, &proto);
                    /* Non-fragmentable options may rewrite the next hop
                     * (e.g. routing header), hence &first_hop. */
219                 if (opt->opt_nflen)
220                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221         }
222
223         skb_push(skb, sizeof(struct ipv6hdr));
224         skb_reset_network_header(skb);
225         hdr = ipv6_hdr(skb);
226
227         /*
228          *      Fill in the IPv6 header
229          */
230         if (np)
231                 hlimit = np->hop_limit;
            /* Socket did not pin a hop limit (-1): use the route's. */
232         if (hlimit < 0)
233                 hlimit = ip6_dst_hoplimit(dst);
234
            /* First 32 bits: version 6, traffic class, flow label. */
235         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236
237         hdr->payload_len = htons(seg_len);
238         hdr->nexthdr = proto;
239         hdr->hop_limit = hlimit;
240
241         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
242         ipv6_addr_copy(&hdr->daddr, first_hop);
243
244         skb->priority = sk->sk_priority;
245         skb->mark = sk->sk_mark;
246
247         mtu = dst_mtu(dst);
            /* Fits, or local fragmentation is allowed, or GSO will
             * segment it later: hand off to netfilter + dst_output. */
248         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250                               IPSTATS_MIB_OUT, skb->len);
251                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252                                dst->dev, dst_output);
253         }
254
            /* Too big and DF semantics apply: notify our own stack so
             * the socket learns the path MTU. */
255         if (net_ratelimit())
256                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
257         skb->dev = dst->dev;
258         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
260         kfree_skb(skb);
261         return -EMSGSIZE;
262 }
263
264 EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are send through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
/*
 * ip6_nd_hdr - build a bare IPv6 header for a neighbour discovery packet.
 * @sk:    socket supplying the hop limit
 * @skb:   buffer to fill; network header is placed at the current data
 * @dev:   egress device
 * @saddr: source address to copy into the header
 * @daddr: destination address to copy into the header
 * @proto: next-header protocol value
 * @len:   payload length in bytes
 *
 * Fills in a plain version-6 header (no traffic class, no flow label,
 * no extension headers).  Always returns 0.
 */
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
            /* Version 6, zero traffic class and flow label. */
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         ipv6_addr_copy(&hdr->saddr, saddr);
294         ipv6_addr_copy(&hdr->daddr, daddr);
295
296         return 0;
297 }
298
/*
 * ip6_call_ra_chain - deliver a Router Alert packet to interested sockets.
 * @skb: the packet carrying a Router Alert option
 * @sel: the RA option value; only sockets registered for this value
 *       (and bound to no device or to the ingress device) match
 *
 * Clones the skb for every matching socket but the last, which consumes
 * the original.  Returns 1 if at least one socket took the packet
 * (caller must not process it further), 0 otherwise.
 */
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
300 {
301         struct ip6_ra_chain *ra;
302         struct sock *last = NULL;
303
304         read_lock(&ip6_ra_lock);
305         for (ra = ip6_ra_chain; ra; ra = ra->next) {
306                 struct sock *sk = ra->sk;
307                 if (sk && ra->sel == sel &&
308                     (!sk->sk_bound_dev_if ||
309                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
                            /* Previous match gets a clone; defer the
                             * original to the final matching socket. */
310                         if (last) {
311                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
312                                 if (skb2)
313                                         rawv6_rcv(last, skb2);
314                         }
315                         last = sk;
316                 }
317         }
318
319         if (last) {
320                 rawv6_rcv(last, skb);
321                 read_unlock(&ip6_ra_lock);
322                 return 1;
323         }
324         read_unlock(&ip6_ra_lock);
325         return 0;
326 }
327
/*
 * ip6_forward_proxy_check - classify a packet destined to a proxied address.
 *
 * Returns 1 when the packet is a unicast neighbour discovery message
 * that must be handed to the local input path, -1 when it must be
 * discarded (link-local destination cannot be proxied; link failure is
 * signalled to the sender), and 0 when normal forwarding applies.
 */
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
329 {
330         struct ipv6hdr *hdr = ipv6_hdr(skb);
331         u8 nexthdr = hdr->nexthdr;
332         int offset;
333
            /* Skip any extension headers to find the transport header. */
334         if (ipv6_ext_hdr(nexthdr)) {
335                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
336                 if (offset < 0)
337                         return 0;
338         } else
339                 offset = sizeof(struct ipv6hdr);
340
341         if (nexthdr == IPPROTO_ICMPV6) {
342                 struct icmp6hdr *icmp6;
343
                    /* Make sure at least the ICMPv6 type byte is linear
                     * before reading it. */
344                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345                                          offset + 1 - skb->data)))
346                         return 0;
347
348                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350                 switch (icmp6->icmp6_type) {
351                 case NDISC_ROUTER_SOLICITATION:
352                 case NDISC_ROUTER_ADVERTISEMENT:
353                 case NDISC_NEIGHBOUR_SOLICITATION:
354                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355                 case NDISC_REDIRECT:
356                         /* For reaction involving unicast neighbor discovery
357                          * message destined to the proxied address, pass it to
358                          * input function.
359                          */
360                         return 1;
361                 default:
362                         break;
363                 }
364         }
365
366         /*
367          * The proxying router can't forward traffic sent to a link-local
368          * address, so signal the sender and discard the packet. This
369          * behavior is clarified by the MIPv6 specification.
370          */
371         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372                 dst_link_failure(skb);
373                 return -1;
374         }
375
376         return 0;
377 }
378
/* NF_INET_FORWARD hook continuation: just push the packet to its dst. */
379 static inline int ip6_forward_finish(struct sk_buff *skb)
380 {
381         return dst_output(skb);
382 }
383
/*
 * ip6_forward - forward a received IPv6 packet toward its destination.
 *
 * Performs the forwarding checks in order: forwarding enabled, LRO
 * sanity, IPsec FWD policy, host-addressed only, Router Alert delivery,
 * hop limit, NDP proxy handling, IPsec rerouting, redirect generation /
 * source-address sanity, MTU, and finally COW + hop-limit decrement
 * before the NF_INET_FORWARD hook.  Consumes the skb on all paths.
 */
384 int ip6_forward(struct sk_buff *skb)
385 {
386         struct dst_entry *dst = skb_dst(skb);
387         struct ipv6hdr *hdr = ipv6_hdr(skb);
388         struct inet6_skb_parm *opt = IP6CB(skb);
389         struct net *net = dev_net(dst->dev);
390         struct neighbour *n;
391         u32 mtu;
392
393         if (net->ipv6.devconf_all->forwarding == 0)
394                 goto error;
395
396         if (skb_warn_if_lro(skb))
397                 goto drop;
398
399         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401                 goto drop;
402         }
403
404         if (skb->pkt_type != PACKET_HOST)
405                 goto drop;
406
407         skb_forward_csum(skb);
408
409         /*
410          *      We DO NOT make any processing on
411          *      RA packets, pushing them to user level AS IS
412          *      without any WARRANTY that application will be able
413          *      to interpret them. The reason is that we
414          *      cannot make anything clever here.
415          *
416          *      We are not end-node, so that if packet contains
417          *      AH/ESP, we cannot make anything.
418          *      Defragmentation also would be mistake, RA packets
419          *      cannot be fragmented, because there is no warranty
420          *      that different fragments will go along one path. --ANK
421          */
422         if (opt->ra) {
423                 u8 *ptr = skb_network_header(skb) + opt->ra;
424                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
425                         return 0;
426         }
427
428         /*
429          *      check and decrement ttl
430          */
431         if (hdr->hop_limit <= 1) {
432                 /* Force OUTPUT device used as source address */
433                 skb->dev = dst->dev;
434                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435                 IP6_INC_STATS_BH(net,
436                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
437
438                 kfree_skb(skb);
439                 return -ETIMEDOUT;
440         }
441
442         /* XXX: idev->cnf.proxy_ndp? */
443         if (net->ipv6.devconf_all->proxy_ndp &&
444             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445                 int proxied = ip6_forward_proxy_check(skb);
                    /* > 0: NDP message for a proxied address — deliver
                     * locally; < 0: must be discarded. */
446                 if (proxied > 0)
447                         return ip6_input(skb);
448                 else if (proxied < 0) {
449                         IP6_INC_STATS(net, ip6_dst_idev(dst),
450                                       IPSTATS_MIB_INDISCARDS);
451                         goto drop;
452                 }
453         }
454
455         if (!xfrm6_route_forward(skb)) {
456                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
457                 goto drop;
458         }
            /* xfrm6_route_forward() may have replaced the dst. */
459         dst = skb_dst(skb);
460
461         /* IPv6 specs say nothing about it, but it is clear that we cannot
462            send redirects to source routed frames.
463            We don't send redirects to frames decapsulated from IPsec.
464          */
465         n = dst_get_neighbour(dst);
466         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
467                 struct in6_addr *target = NULL;
468                 struct rt6_info *rt;
469
470                 /*
471                  *      incoming and outgoing devices are the same
472                  *      send a redirect.
473                  */
474
475                 rt = (struct rt6_info *) dst;
476                 if ((rt->rt6i_flags & RTF_GATEWAY))
477                         target = (struct in6_addr*)&n->primary_key;
478                 else
479                         target = &hdr->daddr;
480
481                 if (!rt->rt6i_peer)
482                         rt6_bind_peer(rt, 1);
483
484                 /* Limit redirects both by destination (here)
485                    and by source (inside ndisc_send_redirect)
486                  */
487                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
488                         ndisc_send_redirect(skb, n, target);
489         } else {
490                 int addrtype = ipv6_addr_type(&hdr->saddr);
491
492                 /* This check is security critical. */
493                 if (addrtype == IPV6_ADDR_ANY ||
494                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
495                         goto error;
496                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
497                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
498                                     ICMPV6_NOT_NEIGHBOUR, 0);
499                         goto error;
500                 }
501         }
502
503         mtu = dst_mtu(dst);
504         if (mtu < IPV6_MIN_MTU)
505                 mtu = IPV6_MIN_MTU;
506
            /* Routers never fragment on behalf of the sender: report
             * Packet Too Big instead (GSO packets are segmented later). */
507         if (skb->len > mtu && !skb_is_gso(skb)) {
508                 /* Again, force OUTPUT device used as source address */
509                 skb->dev = dst->dev;
510                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
511                 IP6_INC_STATS_BH(net,
512                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513                 IP6_INC_STATS_BH(net,
514                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
515                 kfree_skb(skb);
516                 return -EMSGSIZE;
517         }
518
519         if (skb_cow(skb, dst->dev->hard_header_len)) {
520                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
521                 goto drop;
522         }
523
            /* skb_cow() may have copied the header; re-fetch it. */
524         hdr = ipv6_hdr(skb);
525
526         /* Mangling hops number delayed to point after skb COW */
527
528         hdr->hop_limit--;
529
530         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
532                        ip6_forward_finish);
533
534 error:
535         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
536 drop:
537         kfree_skb(skb);
538         return -EINVAL;
539 }
540
/*
 * ip6_copy_metadata - propagate per-packet metadata from @from to @to.
 *
 * Used when building fragments so each new skb carries the original's
 * packet type, priority, protocol, dst reference, device, mark,
 * traffic-control index, netfilter state, trace flag and security mark.
 */
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543         to->pkt_type = from->pkt_type;
544         to->priority = from->priority;
545         to->protocol = from->protocol;
            /* Drop any stale dst before attaching a new reference. */
546         skb_dst_drop(to);
547         skb_dst_set(to, dst_clone(skb_dst(from)));
548         to->dev = from->dev;
549         to->mark = from->mark;
550
551 #ifdef CONFIG_NET_SCHED
552         to->tc_index = from->tc_index;
553 #endif
554         nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557         to->nf_trace = from->nf_trace;
558 #endif
559         skb_copy_secmark(to, from);
560 }
561
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
563 {
564         u16 offset = sizeof(struct ipv6hdr);
565         struct ipv6_opt_hdr *exthdr =
566                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567         unsigned int packet_len = skb->tail - skb->network_header;
568         int found_rhdr = 0;
569         *nexthdr = &ipv6_hdr(skb)->nexthdr;
570
571         while (offset + 1 <= packet_len) {
572
573                 switch (**nexthdr) {
574
575                 case NEXTHDR_HOP:
576                         break;
577                 case NEXTHDR_ROUTING:
578                         found_rhdr = 1;
579                         break;
580                 case NEXTHDR_DEST:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
583                                 break;
584 #endif
585                         if (found_rhdr)
586                                 return offset;
587                         break;
588                 default :
589                         return offset;
590                 }
591
592                 offset += ipv6_optlen(exthdr);
593                 *nexthdr = &exthdr->nexthdr;
594                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
595                                                  offset);
596         }
597
598         return offset;
599 }
600
601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
602 {
603         static u32 ip6_idents_hashrnd __read_mostly;
604         static bool hashrnd_initialized = false;
605         u32 hash, id;
606
607         if (unlikely(!hashrnd_initialized)) {
608                 hashrnd_initialized = true;
609                 get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
610         }
611         hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
612         hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);
613
614         id = ip_idents_reserve(hash, 1);
615         fhdr->identification = htonl(id);
616 }
617
618 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
619 {
620         struct sk_buff *frag;
621         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
622         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
623         struct ipv6hdr *tmp_hdr;
624         struct frag_hdr *fh;
625         unsigned int mtu, hlen, left, len;
626         __be32 frag_id = 0;
627         int ptr, offset = 0, err=0;
628         u8 *prevhdr, nexthdr = 0;
629         struct net *net = dev_net(skb_dst(skb)->dev);
630
631         hlen = ip6_find_1stfragopt(skb, &prevhdr);
632         nexthdr = *prevhdr;
633
634         mtu = ip6_skb_dst_mtu(skb);
635
636         /* We must not fragment if the socket is set to force MTU discovery
637          * or if the skb it not generated by a local socket.
638          */
639         if (!skb->local_df && skb->len > mtu) {
640                 skb->dev = skb_dst(skb)->dev;
641                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
642                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
643                               IPSTATS_MIB_FRAGFAILS);
644                 kfree_skb(skb);
645                 return -EMSGSIZE;
646         }
647
648         if (np && np->frag_size < mtu) {
649                 if (np->frag_size)
650                         mtu = np->frag_size;
651         }
652         mtu -= hlen + sizeof(struct frag_hdr);
653
654         if (skb_has_frag_list(skb)) {
655                 int first_len = skb_pagelen(skb);
656                 struct sk_buff *frag2;
657
658                 if (first_len - hlen > mtu ||
659                     ((first_len - hlen) & 7) ||
660                     skb_cloned(skb))
661                         goto slow_path;
662
663                 skb_walk_frags(skb, frag) {
664                         /* Correct geometry. */
665                         if (frag->len > mtu ||
666                             ((frag->len & 7) && frag->next) ||
667                             skb_headroom(frag) < hlen)
668                                 goto slow_path_clean;
669
670                         /* Partially cloned skb? */
671                         if (skb_shared(frag))
672                                 goto slow_path_clean;
673
674                         BUG_ON(frag->sk);
675                         if (skb->sk) {
676                                 frag->sk = skb->sk;
677                                 frag->destructor = sock_wfree;
678                         }
679                         skb->truesize -= frag->truesize;
680                 }
681
682                 err = 0;
683                 offset = 0;
684                 frag = skb_shinfo(skb)->frag_list;
685                 skb_frag_list_init(skb);
686                 /* BUILD HEADER */
687
688                 *prevhdr = NEXTHDR_FRAGMENT;
689                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
690                 if (!tmp_hdr) {
691                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
692                                       IPSTATS_MIB_FRAGFAILS);
693                         return -ENOMEM;
694                 }
695
696                 __skb_pull(skb, hlen);
697                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
698                 __skb_push(skb, hlen);
699                 skb_reset_network_header(skb);
700                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
701
702                 ipv6_select_ident(fh, rt);
703                 fh->nexthdr = nexthdr;
704                 fh->reserved = 0;
705                 fh->frag_off = htons(IP6_MF);
706                 frag_id = fh->identification;
707
708                 first_len = skb_pagelen(skb);
709                 skb->data_len = first_len - skb_headlen(skb);
710                 skb->len = first_len;
711                 ipv6_hdr(skb)->payload_len = htons(first_len -
712                                                    sizeof(struct ipv6hdr));
713
714                 dst_hold(&rt->dst);
715
716                 for (;;) {
717                         /* Prepare header of the next frame,
718                          * before previous one went down. */
719                         if (frag) {
720                                 frag->ip_summed = CHECKSUM_NONE;
721                                 skb_reset_transport_header(frag);
722                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
723                                 __skb_push(frag, hlen);
724                                 skb_reset_network_header(frag);
725                                 memcpy(skb_network_header(frag), tmp_hdr,
726                                        hlen);
727                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
728                                 fh->nexthdr = nexthdr;
729                                 fh->reserved = 0;
730                                 fh->frag_off = htons(offset);
731                                 if (frag->next != NULL)
732                                         fh->frag_off |= htons(IP6_MF);
733                                 fh->identification = frag_id;
734                                 ipv6_hdr(frag)->payload_len =
735                                                 htons(frag->len -
736                                                       sizeof(struct ipv6hdr));
737                                 ip6_copy_metadata(frag, skb);
738                         }
739
740                         err = output(skb);
741                         if(!err)
742                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743                                               IPSTATS_MIB_FRAGCREATES);
744
745                         if (err || !frag)
746                                 break;
747
748                         skb = frag;
749                         frag = skb->next;
750                         skb->next = NULL;
751                 }
752
753                 kfree(tmp_hdr);
754
755                 if (err == 0) {
756                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757                                       IPSTATS_MIB_FRAGOKS);
758                         dst_release(&rt->dst);
759                         return 0;
760                 }
761
762                 while (frag) {
763                         skb = frag->next;
764                         kfree_skb(frag);
765                         frag = skb;
766                 }
767
768                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
769                               IPSTATS_MIB_FRAGFAILS);
770                 dst_release(&rt->dst);
771                 return err;
772
773 slow_path_clean:
774                 skb_walk_frags(skb, frag2) {
775                         if (frag2 == frag)
776                                 break;
777                         frag2->sk = NULL;
778                         frag2->destructor = NULL;
779                         skb->truesize += frag2->truesize;
780                 }
781         }
782
783 slow_path:
784         left = skb->len - hlen;         /* Space per frame */
785         ptr = hlen;                     /* Where to start from */
786
787         /*
788          *      Fragment the datagram.
789          */
790
791         *prevhdr = NEXTHDR_FRAGMENT;
792
793         /*
794          *      Keep copying data until we run out.
795          */
796         while(left > 0) {
797                 len = left;
798                 /* IF: it doesn't fit, use 'mtu' - the data space left */
799                 if (len > mtu)
800                         len = mtu;
801                 /* IF: we are not sending up to and including the packet end
802                    then align the next start on an eight byte boundary */
803                 if (len < left) {
804                         len &= ~7;
805                 }
806                 /*
807                  *      Allocate buffer.
808                  */
809
810                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
811                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
812                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
813                                       IPSTATS_MIB_FRAGFAILS);
814                         err = -ENOMEM;
815                         goto fail;
816                 }
817
818                 /*
819                  *      Set up data on packet
820                  */
821
822                 ip6_copy_metadata(frag, skb);
823                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
824                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
825                 skb_reset_network_header(frag);
826                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
827                 frag->transport_header = (frag->network_header + hlen +
828                                           sizeof(struct frag_hdr));
829
830                 /*
831                  *      Charge the memory for the fragment to any owner
832                  *      it might possess
833                  */
834                 if (skb->sk)
835                         skb_set_owner_w(frag, skb->sk);
836
837                 /*
838                  *      Copy the packet header into the new buffer.
839                  */
840                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
841
842                 /*
843                  *      Build fragment header.
844                  */
845                 fh->nexthdr = nexthdr;
846                 fh->reserved = 0;
847                 if (!frag_id) {
848                         ipv6_select_ident(fh, rt);
849                         frag_id = fh->identification;
850                 } else
851                         fh->identification = frag_id;
852
853                 /*
854                  *      Copy a block of the IP datagram.
855                  */
856                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
857                         BUG();
858                 left -= len;
859
860                 fh->frag_off = htons(offset);
861                 if (left > 0)
862                         fh->frag_off |= htons(IP6_MF);
863                 ipv6_hdr(frag)->payload_len = htons(frag->len -
864                                                     sizeof(struct ipv6hdr));
865
866                 ptr += len;
867                 offset += len;
868
869                 /*
870                  *      Put this fragment into the sending queue.
871                  */
872                 err = output(frag);
873                 if (err)
874                         goto fail;
875
876                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
877                               IPSTATS_MIB_FRAGCREATES);
878         }
879         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
880                       IPSTATS_MIB_FRAGOKS);
881         kfree_skb(skb);
882         return err;
883
884 fail:
885         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886                       IPSTATS_MIB_FRAGFAILS);
887         kfree_skb(skb);
888         return err;
889 }
890
891 static inline int ip6_rt_check(const struct rt6key *rt_key,
892                                const struct in6_addr *fl_addr,
893                                const struct in6_addr *addr_cache)
894 {
895         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
896                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
897 }
898
/*
 * Validate a dst entry cached on the socket against flow @fl6.
 *
 * Returns @dst if it is still usable for this flow, otherwise releases
 * it and returns NULL so the caller performs a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A non-IPv6 dst (e.g. cached by a mapped-address flow) is
	 * never valid here; drop it and force a new lookup.
	 */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
944
/*
 * Core of the IPv6 route lookup: ensure *@dst holds a usable route for
 * @fl6 and that the flow has a source address.
 *
 * On entry *@dst may already carry a candidate route (e.g. from the
 * socket dst cache); when it is NULL a fresh ip6_route_output() lookup
 * is performed.  Returns 0 on success.  On failure *@dst is released
 * and set to NULL, and a negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	/* ip6_route_output() never returns NULL; errors are encoded in
	 * dst->error.
	 */
	if ((err = (*dst)->error))
		goto out_err_release;

	/* Caller left the source address unspecified: derive one from
	 * the route, honouring the socket's address-selection prefs.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		/* The neighbour pointer is only dereferenced under the
		 * RCU read lock; drop it before the sleepable ifaddr
		 * lookup below.
		 */
		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Same flow but an unspecified destination makes
			 * the lookup resolve to the default route.
			 */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1019
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *	Any previous value in *@dst is ignored: the lookup always starts
 *	fresh (contrast with the socket-cached variants below).
 *
 *	It returns zero on success, or a standard errno code on error;
 *	on error *@dst is left NULL.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1036
1037 /**
1038  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1039  *      @sk: socket which provides route info
1040  *      @fl6: flow to lookup
1041  *      @final_dst: final destination address for ipsec lookup
1042  *      @can_sleep: we are in a sleepable context
1043  *
1044  *      This function performs a route lookup on the given flow.
1045  *
1046  *      It returns a valid dst pointer on success, or a pointer encoded
1047  *      error code.
1048  */
1049 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1050                                       const struct in6_addr *final_dst,
1051                                       bool can_sleep)
1052 {
1053         struct dst_entry *dst = NULL;
1054         int err;
1055
1056         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1057         if (err)
1058                 return ERR_PTR(err);
1059         if (final_dst)
1060                 ipv6_addr_copy(&fl6->daddr, final_dst);
1061         if (can_sleep)
1062                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1063
1064         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1065 }
1066 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1067
1068 /**
1069  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1070  *      @sk: socket which provides the dst cache and route info
1071  *      @fl6: flow to lookup
1072  *      @final_dst: final destination address for ipsec lookup
1073  *      @can_sleep: we are in a sleepable context
1074  *
1075  *      This function performs a route lookup on the given flow with the
1076  *      possibility of using the cached route in the socket if it is valid.
1077  *      It will take the socket dst lock when operating on the dst cache.
1078  *      As a result, this function can only be used in process context.
1079  *
1080  *      It returns a valid dst pointer on success, or a pointer encoded
1081  *      error code.
1082  */
1083 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1084                                          const struct in6_addr *final_dst,
1085                                          bool can_sleep)
1086 {
1087         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1088         int err;
1089
1090         dst = ip6_sk_dst_check(sk, dst, fl6);
1091
1092         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1093         if (err)
1094                 return ERR_PTR(err);
1095         if (final_dst)
1096                 ipv6_addr_copy(&fl6->daddr, final_dst);
1097         if (can_sleep)
1098                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1099
1100         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1101 }
1102 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1103
/*
 * Append data for a UDP datagram that will be segmented by the NIC
 * (UDP fragmentation offload): build (or reuse) a single skb on the
 * socket write queue and attach the payload as page fragments.
 *
 * Returns 0 on success or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu,unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		struct frag_hdr fhdr;

		/* First call: allocate the head skb that carries the
		 * headers; later calls only append frags to it.
		 */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* Hardware will finish the transport checksum. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		/* All offloaded fragments must share one fragment ID. */
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1157
1158 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1159                                                gfp_t gfp)
1160 {
1161         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1162 }
1163
1164 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1165                                                 gfp_t gfp)
1166 {
1167         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1168 }
1169
1170 static void ip6_append_data_mtu(unsigned int *mtu,
1171                                 int *maxfraglen,
1172                                 unsigned int fragheaderlen,
1173                                 struct sk_buff *skb,
1174                                 struct rt6_info *rt,
1175                                 unsigned int orig_mtu)
1176 {
1177         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1178                 if (skb == NULL) {
1179                         /* first fragment, reserve header_len */
1180                         *mtu = orig_mtu - rt->dst.header_len;
1181
1182                 } else {
1183                         /*
1184                          * this fragment is not first, the headers
1185                          * space is regarded as data space.
1186                          */
1187                         *mtu = orig_mtu;
1188                 }
1189                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1190                               + fragheaderlen - sizeof(struct frag_hdr);
1191         }
1192 }
1193
/*
 * Append @length bytes (pulled via @getfrag) to the socket's pending
 * write queue, building correctly sized IPv6 fragments as it goes.
 *
 * The first call on an empty queue "corks" the socket: it duplicates
 * @opt, holds @rt and records the flow/mtu in inet->cork so that later
 * calls (and ip6_push_pending_frames()) reuse them.  On later calls the
 * rt/fl6/opt arguments are replaced by the corked values.
 *
 * Returns 0 on success or a negative errno; on error the bytes that
 * could not be queued are subtracted from cork->length again.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			/* Corking twice with options would leak the
			 * previously duplicated option block.
			 */
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* Deep-copy each extension header so the caller's
			 * @opt may be freed while data is still corked.
			 */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Pick the MTU: device MTU when probing PMTU, path MTU
		 * otherwise; outside an XFRM tunnel use the inner path.
		 */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		/* A smaller per-socket IPV6_MTU overrides the route. */
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Socket already corked: ignore the passed-in rt/fl6/opt
		 * and continue with the values recorded at cork time.
		 */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	/* Refuse datagrams whose payload cannot be expressed in the
	 * 16-bit payload_len field.
	 */
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	/* IPV6_DONTFRAG: report the MTU to the application instead of
	 * fragmenting (UDP/RAW only).
	 */
	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
					   sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	/* Hand oversized UDP datagrams to the UFO path when the device
	 * can segment them in hardware.
	 */
	if (((length > mtu) ||
	     (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First skb of the datagram: may block for
				 * send-buffer space (honours MSG_DONTWAIT).
				 */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the tail of the previous skb (beyond
				 * maxfraglen) into this fragment, keeping the
				 * running checksums consistent.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Header budgets apply to the first fragment only. */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append into page fragments,
			 * reusing the socket's current sndmsg page while
			 * it has space left.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1558
1559 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1560 {
1561         if (np->cork.opt) {
1562                 kfree(np->cork.opt->dst0opt);
1563                 kfree(np->cork.opt->dst1opt);
1564                 kfree(np->cork.opt->hopopt);
1565                 kfree(np->cork.opt->srcrt);
1566                 kfree(np->cork.opt);
1567                 np->cork.opt = NULL;
1568         }
1569
1570         if (inet->cork.base.dst) {
1571                 dst_release(inet->cork.base.dst);
1572                 inet->cork.base.dst = NULL;
1573                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1574         }
1575         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1576 }
1577
/**
 * ip6_push_pending_frames - build and transmit the corked IPv6 packet
 * @sk: socket whose write queue holds skbs queued by ip6_append_data()
 *
 * Coalesces all skbs on sk->sk_write_queue into one skb (extra segments
 * become its frag_list), prepends the queued extension headers and the
 * IPv6 header from the cork state, then hands the packet to
 * ip6_local_out().  Always releases the cork state before returning.
 *
 * Returns 0 on success or a negative errno.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	/* Nothing was queued: just release the cork state and return 0. */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain every remaining queued skb onto the head skb's frag_list,
	 * stripping its duplicate network header and moving its
	 * length/truesize accounting (and socket ownership) to the head. */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* Save the final destination first: a routing header pushed by
	 * ipv6_push_nfrag_opts() below may rewrite final_dst. */
	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Version (6), traffic class and flow label in one 32-bit store. */
	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* A positive return is a congestion notification; map it
		 * to an errno (or success) via net_xmit_errno(). */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1661
1662 void ip6_flush_pending_frames(struct sock *sk)
1663 {
1664         struct sk_buff *skb;
1665
1666         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1667                 if (skb_dst(skb))
1668                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1669                                       IPSTATS_MIB_OUTDISCARDS);
1670                 kfree_skb(skb);
1671         }
1672
1673         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1674 }