Merge git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/* Transmit a locally generated packet: run LOCAL_OUT, then hand the skb
 * to the route's output function unless netfilter already consumed it.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int ret = __ip6_local_out(skb);

	/* A return of 1 means the hook accepted the packet and we must
	 * still transmit it ourselves. */
	return likely(ret == 1) ? dst_output(skb) : ret;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
/* Final transmit step for IPv6 output: handle multicast loopback and
 * counters, then hand the skb to the neighbour layer for L2 delivery.
 * Consumes the skb on every path.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket
		 * asked for multicast loopback and either a multicast
		 * router socket wants the (not-yet-forwarded) packet or
		 * this device is itself a member of the group. */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* hop_limit 0 means the sender wanted local
			 * delivery only; never put it on the wire. */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* Prefer the cached hardware header; otherwise let the neighbour
	 * output function resolve the L2 address first. */
	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	/* No neighbour entry at all: cannot transmit. */
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
147
148 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
149 {
150         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
151
152         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
153                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
154 }
155
156 static int ip6_finish_output(struct sk_buff *skb)
157 {
158         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
159             dst_allfrag(skb_dst(skb)))
160                 return ip6_fragment(skb, ip6_finish_output2);
161         else
162                 return ip6_finish_output2(skb);
163 }
164
165 int ip6_output(struct sk_buff *skb)
166 {
167         struct net_device *dev = skb_dst(skb)->dev;
168         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
169         if (unlikely(idev->cnf.disable_ipv6)) {
170                 IP6_INC_STATS(dev_net(dev), idev,
171                               IPSTATS_MIB_OUTDISCARDS);
172                 kfree_skb(skb);
173                 return 0;
174         }
175
176         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
177                             ip6_finish_output,
178                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
179 }
180
/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 *
 *	Prepends extension headers and the IPv6 header, then sends via
 *	NF_INET_LOCAL_OUT.  Consumes the skb on every path; returns 0 /
 *	netfilter verdict on success, -ENOBUFS or -EMSGSIZE on failure.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;	/* -1 = not set; resolved from route below */
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			/* Re-charge the new skb to the socket's write
			 * allocation. */
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			/* May rewrite first_hop (e.g. routing header). */
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* Version 6, traffic class and the caller's flow label packed
	 * into the first 32 bits of the header. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Packet exceeds the path MTU and may not be sent as-is: signal
	 * our own stack with ICMPV6_PKT_TOOBIG and drop. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
271
272 /*
273  *      To avoid extra problems ND packets are send through this
274  *      routine. It's code duplication but I really want to avoid
275  *      extra checks since ipv6_build_header is used by TCP (which
276  *      is for us performance critical)
277  */
278
279 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
280                const struct in6_addr *saddr, const struct in6_addr *daddr,
281                int proto, int len)
282 {
283         struct ipv6_pinfo *np = inet6_sk(sk);
284         struct ipv6hdr *hdr;
285         int totlen;
286
287         skb->protocol = htons(ETH_P_IPV6);
288         skb->dev = dev;
289
290         totlen = len + sizeof(struct ipv6hdr);
291
292         skb_reset_network_header(skb);
293         skb_put(skb, sizeof(struct ipv6hdr));
294         hdr = ipv6_hdr(skb);
295
296         *(__be32*)hdr = htonl(0x60000000);
297
298         hdr->payload_len = htons(len);
299         hdr->nexthdr = proto;
300         hdr->hop_limit = np->hop_limit;
301
302         ipv6_addr_copy(&hdr->saddr, saddr);
303         ipv6_addr_copy(&hdr->daddr, daddr);
304
305         return 0;
306 }
307
/* Deliver a Router Alert packet to every raw socket on ip6_ra_chain
 * whose RA selector matches 'sel' (and whose device binding, if any,
 * matches the input device).
 *
 * Returns 1 when the packet was consumed by at least one socket,
 * 0 when the caller still owns the skb.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Hand a clone to every matching socket except
			 * the last, which receives the original skb
			 * below — saves one clone. */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
336
/* Decide how to treat a packet whose destination we act as an NDP proxy
 * for.
 *
 * Returns 1 when the packet is a neighbour-discovery ICMPv6 message
 * that should be delivered to local input, 0 when normal forwarding may
 * proceed, and -1 when the packet must be discarded (link-local
 * destination; link failure already signalled to the sender).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	/* Skip any extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Ensure at least the ICMPv6 type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
387
/* Netfilter okfn for NF_INET_FORWARD: hand the packet to the route's
 * output path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
392
/* Forward one IPv6 packet on behalf of another host: validate it, emit
 * redirects / ICMPv6 errors where required, decrement the hop limit and
 * re-send through NF_INET_FORWARD.  Consumes the skb on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	/* Forwarding disabled system-wide: address error. */
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* LRO-merged packets must never be forwarded. */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		/* ptr[2..3] is the 16-bit RA selector value. */
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm6_route_forward() may have rerouted the skb. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	/* Never advertise an MTU below the IPv6 minimum. */
	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Need a private copy of the header before modifying it. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
543
/* Propagate per-packet metadata from the original skb ('from') to a
 * newly built fragment ('to'): packet type, priority, protocol, route,
 * device, mark, tc index, netfilter state and security mark.  Headers
 * and payload are NOT copied here.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
564
/* Compute the length of the unfragmentable part of the packet (IPv6
 * header plus any hop-by-hop / routing / qualifying destination-options
 * headers), i.e. the offset where a fragment header must be inserted.
 * On return, *nexthdr points at the nexthdr byte that should be patched
 * to NEXTHDR_FRAGMENT.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			/* A destination-options header carrying a Home
			 * Address option stays in the unfragmentable part. */
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			/* A destination-options header after the routing
			 * header begins the fragmentable part. */
			if (found_rhdr)
				return offset;
			break;
		default :
			/* Any other header starts the fragmentable part. */
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
603
/* Fragment an over-MTU IPv6 packet and transmit each piece via output().
 *
 * Fast path: when the skb already carries a frag_list with suitably
 * sized and 8-byte-aligned fragments, the existing buffers are reused
 * in place.  Otherwise the slow path allocates a fresh skb per fragment
 * and copies the payload.  Consumes the skb; returns 0 on success or a
 * negative errno.
 */
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err=0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	/* hlen = length of the unfragmentable part. */
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Honour a smaller per-socket fragment size, if configured. */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* From here on, mtu is the payload budget per fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		/* Fast path requires correct geometry and sole ownership. */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		/* Insert the fragment header between the unfragmentable
		 * part and the payload of the first fragment. */
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if(!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		/* Error: release the fragments that were not sent. */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		/* Undo the socket-ownership transfer done above before
		 * falling back to the slow path. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while(left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending upto and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			/* First fragment: pick the identification once,
			 * reuse it for all subsequent fragments. */
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
876
877 static inline int ip6_rt_check(struct rt6key *rt_key,
878                                struct in6_addr *fl_addr,
879                                struct in6_addr *addr_cache)
880 {
881         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
882                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
883 }
884
/*
 * ip6_sk_dst_check - validate a socket-cached dst entry against a flow.
 *
 * Returns @dst when it is still usable for @fl; otherwise the reference
 * is released and NULL is returned so the caller falls back to a fresh
 * route lookup.  @dst may be NULL.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		/* Cached route is stale for this flow: drop our reference
		 * and report a miss. */
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
924
/*
 * ip6_dst_lookup_tail - finish a route lookup for @fl.
 *
 * If *@dst is NULL a routing table lookup is performed first.  When the
 * flow carries no source address, one is selected for the egress device.
 * On failure the dst reference is dropped, *@dst is cleared and a
 * negative errno is returned; on success 0 is returned with *@dst
 * holding a referenced entry.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		/* No source address in the flow: pick a suitable one for
		 * the destination, honouring the socket's srcprefs. */
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Re-route with an unspecified destination so the
			 * lookup resolves via the default router. */
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
991
992 /**
993  *      ip6_dst_lookup - perform route lookup on flow
994  *      @sk: socket which provides route info
995  *      @dst: pointer to dst_entry * for result
996  *      @fl: flow to lookup
997  *
998  *      This function performs a route lookup on the given flow.
999  *
1000  *      It returns zero on success, or a standard errno code on error.
1001  */
1002 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1003 {
1004         *dst = NULL;
1005         return ip6_dst_lookup_tail(sk, dst, fl);
1006 }
1007 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1008
1009 /**
1010  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1011  *      @sk: socket which provides the dst cache and route info
1012  *      @dst: pointer to dst_entry * for result
1013  *      @fl: flow to lookup
1014  *
1015  *      This function performs a route lookup on the given flow with the
1016  *      possibility of using the cached route in the socket if it is valid.
1017  *      It will take the socket dst lock when operating on the dst cache.
1018  *      As a result, this function can only be used in process context.
1019  *
1020  *      It returns zero on success, or a standard errno code on error.
1021  */
1022 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1023 {
1024         *dst = NULL;
1025         if (sk) {
1026                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1027                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1028         }
1029
1030         return ip6_dst_lookup_tail(sk, dst, fl);
1031 }
1032 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1033
1034 static inline int ip6_ufo_append_data(struct sock *sk,
1035                         int getfrag(void *from, char *to, int offset, int len,
1036                         int odd, struct sk_buff *skb),
1037                         void *from, int length, int hh_len, int fragheaderlen,
1038                         int transhdrlen, int mtu,unsigned int flags)
1039
1040 {
1041         struct sk_buff *skb;
1042         int err;
1043
1044         /* There is support for UDP large send offload by network
1045          * device, so create one single skb packet containing complete
1046          * udp datagram
1047          */
1048         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1049                 skb = sock_alloc_send_skb(sk,
1050                         hh_len + fragheaderlen + transhdrlen + 20,
1051                         (flags & MSG_DONTWAIT), &err);
1052                 if (skb == NULL)
1053                         return -ENOMEM;
1054
1055                 /* reserve space for Hardware header */
1056                 skb_reserve(skb, hh_len);
1057
1058                 /* create space for UDP/IP header */
1059                 skb_put(skb,fragheaderlen + transhdrlen);
1060
1061                 /* initialize network header pointer */
1062                 skb_reset_network_header(skb);
1063
1064                 /* initialize protocol header pointer */
1065                 skb->transport_header = skb->network_header + fragheaderlen;
1066
1067                 skb->ip_summed = CHECKSUM_PARTIAL;
1068                 skb->csum = 0;
1069                 sk->sk_sndmsg_off = 0;
1070         }
1071
1072         err = skb_append_datato_frags(sk,skb, getfrag, from,
1073                                       (length - transhdrlen));
1074         if (!err) {
1075                 struct frag_hdr fhdr;
1076
1077                 /* Specify the length of each IPv6 datagram fragment.
1078                  * It has to be a multiple of 8.
1079                  */
1080                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1081                                              sizeof(struct frag_hdr)) & ~7;
1082                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1083                 ipv6_select_ident(&fhdr);
1084                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1085                 __skb_queue_tail(&sk->sk_write_queue, skb);
1086
1087                 return 0;
1088         }
1089         /* There is not enough support do UPD LSO,
1090          * so follow normal path
1091          */
1092         kfree_skb(skb);
1093
1094         return err;
1095 }
1096
1097 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1098                                                gfp_t gfp)
1099 {
1100         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1101 }
1102
1103 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1104                                                 gfp_t gfp)
1105 {
1106         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1107 }
1108
/*
 * ip6_append_data - append user data to the socket's pending send queue.
 *
 * On the first call of a corked sequence the cork state (duplicated
 * extension headers, route, flow, hop limit, traffic class, fragment
 * size) is set up; later calls reuse it.  Data is packed into
 * fragment-sized skbs on sk_write_queue, spilling into page fragments
 * when the device supports scatter/gather, or handed to UFO for capable
 * UDP devices.  Returns 0 on success or a negative errno; on error the
 * bytes that could not be appended are subtracted from the cork length.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* Deep-copy each extension header so the caller's
			 * buffers need not outlive the cork. */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		inet->cork.dst = &rt->dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Fragment size: device MTU when probing PMTU, else the
		 * path MTU, optionally capped by the user's IPV6_MTU
		 * (np->frag_size). */
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Subsequent call while corked: reuse the saved state. */
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header overhead, and the largest 8-byte-aligned
	 * payload end offset that still leaves room for a fragment
	 * header. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
			/* Caller asked not to fragment: report the PMTU
			 * instead of sending. */
			ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			/* Let UDP fragmentation offload build one large
			 * datagram for the device to segment. */
			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MODE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* No transport header left to add: allocate
				 * non-blocking, bounded by twice the send
				 * buffer. */
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the tail of the previous skb (the
				 * part past maxfraglen) into this one,
				 * keeping both checksums consistent. */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			/* No scatter/gather: copy straight into the linear
			 * area; trim back on a failed copy. */
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter/gather device: append data to page
			 * fragments, reusing the socket's current send
			 * page where possible. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Undo the accounting for the bytes we failed to append. */
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1435
1436 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1437 {
1438         if (np->cork.opt) {
1439                 kfree(np->cork.opt->dst0opt);
1440                 kfree(np->cork.opt->dst1opt);
1441                 kfree(np->cork.opt->hopopt);
1442                 kfree(np->cork.opt->srcrt);
1443                 kfree(np->cork.opt);
1444                 np->cork.opt = NULL;
1445         }
1446
1447         if (inet->cork.dst) {
1448                 dst_release(inet->cork.dst);
1449                 inet->cork.dst = NULL;
1450                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1451         }
1452         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1453 }
1454
/*
 * ip6_push_pending_frames - build the IPv6 header and transmit the
 * queued data.
 *
 * Collapses the socket's write queue into a single skb (trailing skbs
 * become the frag_list), pushes any extension headers and the IPv6
 * header using the cork state, then hands the packet to
 * ip6_local_out().  The cork state is released on all paths.  Returns
 * 0 on success or a negative errno.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the first one's frag_list,
	 * transferring length/truesize accounting and clearing their
	 * socket ownership. */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	/* Push extension headers in front of the payload; @proto is
	 * updated so the nexthdr chain stays consistent, and @final_dst
	 * may be rewritten (e.g. by a routing header). */
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* First 32 bits: version 6, the corked traffic class, and the
	 * flow label from the flow. */
	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive values are congestion notifications; map them
		 * to an errno (possibly 0) before deciding failure. */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1538
1539 void ip6_flush_pending_frames(struct sock *sk)
1540 {
1541         struct sk_buff *skb;
1542
1543         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1544                 if (skb_dst(skb))
1545                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1546                                       IPSTATS_MIB_OUTDISCARDS);
1547                 kfree_skb(skb);
1548         }
1549
1550         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1551 }