ipv6: fix possible seqlock deadlock in ip6_finish_output2
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 *	Send a locally generated packet: run the LOCAL_OUT hook via
 *	__ip6_local_out() and, when the hook result is 1, pass the packet
 *	on to dst_output().  Any other hook result is returned unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int rc = __ip6_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101         struct dst_entry *dst = skb_dst(skb);
102         struct net_device *dev = dst->dev;
103         struct neighbour *neigh;
104
105         skb->protocol = htons(ETH_P_IPV6);
106         skb->dev = dev;
107
108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112                     ((mroute6_socket(dev_net(dev), skb) &&
113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115                                          &ipv6_hdr(skb)->saddr))) {
116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118                         /* Do not check for IFF_ALLMULTI; multicast routing
119                            is not supported in any case.
120                          */
121                         if (newskb)
122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123                                         newskb, NULL, newskb->dev,
124                                         ip6_dev_loopback_xmit);
125
126                         if (ipv6_hdr(skb)->hop_limit == 0) {
127                                 IP6_INC_STATS(dev_net(dev), idev,
128                                               IPSTATS_MIB_OUTDISCARDS);
129                                 kfree_skb(skb);
130                                 return 0;
131                         }
132                 }
133
134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135                                 skb->len);
136         }
137
138         rcu_read_lock();
139         neigh = dst_get_neighbour(dst);
140         if (neigh) {
141                 int res = neigh_output(neigh, skb);
142
143                 rcu_read_unlock();
144                 return res;
145         }
146         rcu_read_unlock();
147         IP6_INC_STATS(dev_net(dst->dev),
148                       ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149         kfree_skb(skb);
150         return -EINVAL;
151 }
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183              struct ipv6_txoptions *opt, int tclass)
184 {
185         struct net *net = sock_net(sk);
186         struct ipv6_pinfo *np = inet6_sk(sk);
187         struct in6_addr *first_hop = &fl6->daddr;
188         struct dst_entry *dst = skb_dst(skb);
189         struct ipv6hdr *hdr;
190         u8  proto = fl6->flowi6_proto;
191         int seg_len = skb->len;
192         int hlimit = -1;
193         u32 mtu;
194
195         if (opt) {
196                 unsigned int head_room;
197
198                 /* First: exthdrs may take lots of space (~8K for now)
199                    MAX_HEADER is not enough.
200                  */
201                 head_room = opt->opt_nflen + opt->opt_flen;
202                 seg_len += head_room;
203                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204
205                 if (skb_headroom(skb) < head_room) {
206                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207                         if (skb2 == NULL) {
208                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209                                               IPSTATS_MIB_OUTDISCARDS);
210                                 kfree_skb(skb);
211                                 return -ENOBUFS;
212                         }
213                         kfree_skb(skb);
214                         skb = skb2;
215                         skb_set_owner_w(skb, sk);
216                 }
217                 if (opt->opt_flen)
218                         ipv6_push_frag_opts(skb, opt, &proto);
219                 if (opt->opt_nflen)
220                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221         }
222
223         skb_push(skb, sizeof(struct ipv6hdr));
224         skb_reset_network_header(skb);
225         hdr = ipv6_hdr(skb);
226
227         /*
228          *      Fill in the IPv6 header
229          */
230         if (np)
231                 hlimit = np->hop_limit;
232         if (hlimit < 0)
233                 hlimit = ip6_dst_hoplimit(dst);
234
235         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236
237         hdr->payload_len = htons(seg_len);
238         hdr->nexthdr = proto;
239         hdr->hop_limit = hlimit;
240
241         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
242         ipv6_addr_copy(&hdr->daddr, first_hop);
243
244         skb->priority = sk->sk_priority;
245         skb->mark = sk->sk_mark;
246
247         mtu = dst_mtu(dst);
248         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250                               IPSTATS_MIB_OUT, skb->len);
251                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252                                dst->dev, dst_output);
253         }
254
255         if (net_ratelimit())
256                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
257         skb->dev = dst->dev;
258         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
260         kfree_skb(skb);
261         return -EMSGSIZE;
262 }
263
264 EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are sent through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         ipv6_addr_copy(&hdr->saddr, saddr);
294         ipv6_addr_copy(&hdr->daddr, daddr);
295
296         return 0;
297 }
298
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
300 {
301         struct ip6_ra_chain *ra;
302         struct sock *last = NULL;
303
304         read_lock(&ip6_ra_lock);
305         for (ra = ip6_ra_chain; ra; ra = ra->next) {
306                 struct sock *sk = ra->sk;
307                 if (sk && ra->sel == sel &&
308                     (!sk->sk_bound_dev_if ||
309                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
310                         if (last) {
311                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
312                                 if (skb2)
313                                         rawv6_rcv(last, skb2);
314                         }
315                         last = sk;
316                 }
317         }
318
319         if (last) {
320                 rawv6_rcv(last, skb);
321                 read_unlock(&ip6_ra_lock);
322                 return 1;
323         }
324         read_unlock(&ip6_ra_lock);
325         return 0;
326 }
327
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
329 {
330         struct ipv6hdr *hdr = ipv6_hdr(skb);
331         u8 nexthdr = hdr->nexthdr;
332         int offset;
333
334         if (ipv6_ext_hdr(nexthdr)) {
335                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
336                 if (offset < 0)
337                         return 0;
338         } else
339                 offset = sizeof(struct ipv6hdr);
340
341         if (nexthdr == IPPROTO_ICMPV6) {
342                 struct icmp6hdr *icmp6;
343
344                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345                                          offset + 1 - skb->data)))
346                         return 0;
347
348                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350                 switch (icmp6->icmp6_type) {
351                 case NDISC_ROUTER_SOLICITATION:
352                 case NDISC_ROUTER_ADVERTISEMENT:
353                 case NDISC_NEIGHBOUR_SOLICITATION:
354                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355                 case NDISC_REDIRECT:
356                         /* For reaction involving unicast neighbor discovery
357                          * message destined to the proxied address, pass it to
358                          * input function.
359                          */
360                         return 1;
361                 default:
362                         break;
363                 }
364         }
365
366         /*
367          * The proxying router can't forward traffic sent to a link-local
368          * address, so signal the sender and discard the packet. This
369          * behavior is clarified by the MIPv6 specification.
370          */
371         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372                 dst_link_failure(skb);
373                 return -1;
374         }
375
376         return 0;
377 }
378
/*
 *	okfn of the NF_INET_FORWARD hook: once netfilter has accepted the
 *	packet, continue with the output routine of its dst entry.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
383
384 int ip6_forward(struct sk_buff *skb)
385 {
386         struct dst_entry *dst = skb_dst(skb);
387         struct ipv6hdr *hdr = ipv6_hdr(skb);
388         struct inet6_skb_parm *opt = IP6CB(skb);
389         struct net *net = dev_net(dst->dev);
390         struct neighbour *n;
391         u32 mtu;
392
393         if (net->ipv6.devconf_all->forwarding == 0)
394                 goto error;
395
396         if (skb_warn_if_lro(skb))
397                 goto drop;
398
399         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401                 goto drop;
402         }
403
404         if (skb->pkt_type != PACKET_HOST)
405                 goto drop;
406
407         skb_forward_csum(skb);
408
409         /*
410          *      We DO NOT make any processing on
411          *      RA packets, pushing them to user level AS IS
412          *      without ane WARRANTY that application will be able
413          *      to interpret them. The reason is that we
414          *      cannot make anything clever here.
415          *
416          *      We are not end-node, so that if packet contains
417          *      AH/ESP, we cannot make anything.
418          *      Defragmentation also would be mistake, RA packets
419          *      cannot be fragmented, because there is no warranty
420          *      that different fragments will go along one path. --ANK
421          */
422         if (opt->ra) {
423                 u8 *ptr = skb_network_header(skb) + opt->ra;
424                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
425                         return 0;
426         }
427
428         /*
429          *      check and decrement ttl
430          */
431         if (hdr->hop_limit <= 1) {
432                 /* Force OUTPUT device used as source address */
433                 skb->dev = dst->dev;
434                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435                 IP6_INC_STATS_BH(net,
436                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
437
438                 kfree_skb(skb);
439                 return -ETIMEDOUT;
440         }
441
442         /* XXX: idev->cnf.proxy_ndp? */
443         if (net->ipv6.devconf_all->proxy_ndp &&
444             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445                 int proxied = ip6_forward_proxy_check(skb);
446                 if (proxied > 0)
447                         return ip6_input(skb);
448                 else if (proxied < 0) {
449                         IP6_INC_STATS(net, ip6_dst_idev(dst),
450                                       IPSTATS_MIB_INDISCARDS);
451                         goto drop;
452                 }
453         }
454
455         if (!xfrm6_route_forward(skb)) {
456                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
457                 goto drop;
458         }
459         dst = skb_dst(skb);
460
461         /* IPv6 specs say nothing about it, but it is clear that we cannot
462            send redirects to source routed frames.
463            We don't send redirects to frames decapsulated from IPsec.
464          */
465         n = dst_get_neighbour(dst);
466         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
467                 struct in6_addr *target = NULL;
468                 struct rt6_info *rt;
469
470                 /*
471                  *      incoming and outgoing devices are the same
472                  *      send a redirect.
473                  */
474
475                 rt = (struct rt6_info *) dst;
476                 if ((rt->rt6i_flags & RTF_GATEWAY))
477                         target = (struct in6_addr*)&n->primary_key;
478                 else
479                         target = &hdr->daddr;
480
481                 if (!rt->rt6i_peer)
482                         rt6_bind_peer(rt, 1);
483
484                 /* Limit redirects both by destination (here)
485                    and by source (inside ndisc_send_redirect)
486                  */
487                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
488                         ndisc_send_redirect(skb, n, target);
489         } else {
490                 int addrtype = ipv6_addr_type(&hdr->saddr);
491
492                 /* This check is security critical. */
493                 if (addrtype == IPV6_ADDR_ANY ||
494                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
495                         goto error;
496                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
497                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
498                                     ICMPV6_NOT_NEIGHBOUR, 0);
499                         goto error;
500                 }
501         }
502
503         mtu = dst_mtu(dst);
504         if (mtu < IPV6_MIN_MTU)
505                 mtu = IPV6_MIN_MTU;
506
507         if (skb->len > mtu && !skb_is_gso(skb)) {
508                 /* Again, force OUTPUT device used as source address */
509                 skb->dev = dst->dev;
510                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
511                 IP6_INC_STATS_BH(net,
512                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513                 IP6_INC_STATS_BH(net,
514                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
515                 kfree_skb(skb);
516                 return -EMSGSIZE;
517         }
518
519         if (skb_cow(skb, dst->dev->hard_header_len)) {
520                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
521                 goto drop;
522         }
523
524         hdr = ipv6_hdr(skb);
525
526         /* Mangling hops number delayed to point after skb COW */
527
528         hdr->hop_limit--;
529
530         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
532                        ip6_forward_finish);
533
534 error:
535         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
536 drop:
537         kfree_skb(skb);
538         return -EINVAL;
539 }
540
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543         to->pkt_type = from->pkt_type;
544         to->priority = from->priority;
545         to->protocol = from->protocol;
546         skb_dst_drop(to);
547         skb_dst_set(to, dst_clone(skb_dst(from)));
548         to->dev = from->dev;
549         to->mark = from->mark;
550
551 #ifdef CONFIG_NET_SCHED
552         to->tc_index = from->tc_index;
553 #endif
554         nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557         to->nf_trace = from->nf_trace;
558 #endif
559         skb_copy_secmark(to, from);
560 }
561
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
563 {
564         u16 offset = sizeof(struct ipv6hdr);
565         struct ipv6_opt_hdr *exthdr =
566                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567         unsigned int packet_len = skb->tail - skb->network_header;
568         int found_rhdr = 0;
569         *nexthdr = &ipv6_hdr(skb)->nexthdr;
570
571         while (offset + 1 <= packet_len) {
572
573                 switch (**nexthdr) {
574
575                 case NEXTHDR_HOP:
576                         break;
577                 case NEXTHDR_ROUTING:
578                         found_rhdr = 1;
579                         break;
580                 case NEXTHDR_DEST:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
583                                 break;
584 #endif
585                         if (found_rhdr)
586                                 return offset;
587                         break;
588                 default :
589                         return offset;
590                 }
591
592                 offset += ipv6_optlen(exthdr);
593                 *nexthdr = &exthdr->nexthdr;
594                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
595                                                  offset);
596         }
597
598         return offset;
599 }
600
601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
602 {
603         static atomic_t ipv6_fragmentation_id;
604         int old, new;
605
606         if (rt && !(rt->dst.flags & DST_NOPEER)) {
607                 struct inet_peer *peer;
608
609                 if (!rt->rt6i_peer)
610                         rt6_bind_peer(rt, 1);
611                 peer = rt->rt6i_peer;
612                 if (peer) {
613                         fhdr->identification = htonl(inet_getid(peer, 0));
614                         return;
615                 }
616         }
617         do {
618                 old = atomic_read(&ipv6_fragmentation_id);
619                 new = old + 1;
620                 if (!new)
621                         new = 1;
622         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
623         fhdr->identification = htonl(new);
624 }
625
626 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
627 {
628         struct sk_buff *frag;
629         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
630         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
631         struct ipv6hdr *tmp_hdr;
632         struct frag_hdr *fh;
633         unsigned int mtu, hlen, left, len;
634         __be32 frag_id = 0;
635         int ptr, offset = 0, err=0;
636         u8 *prevhdr, nexthdr = 0;
637         struct net *net = dev_net(skb_dst(skb)->dev);
638
639         hlen = ip6_find_1stfragopt(skb, &prevhdr);
640         nexthdr = *prevhdr;
641
642         mtu = ip6_skb_dst_mtu(skb);
643
644         /* We must not fragment if the socket is set to force MTU discovery
645          * or if the skb it not generated by a local socket.
646          */
647         if (!skb->local_df && skb->len > mtu) {
648                 skb->dev = skb_dst(skb)->dev;
649                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
650                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
651                               IPSTATS_MIB_FRAGFAILS);
652                 kfree_skb(skb);
653                 return -EMSGSIZE;
654         }
655
656         if (np && np->frag_size < mtu) {
657                 if (np->frag_size)
658                         mtu = np->frag_size;
659         }
660         mtu -= hlen + sizeof(struct frag_hdr);
661
662         if (skb_has_frag_list(skb)) {
663                 int first_len = skb_pagelen(skb);
664                 struct sk_buff *frag2;
665
666                 if (first_len - hlen > mtu ||
667                     ((first_len - hlen) & 7) ||
668                     skb_cloned(skb))
669                         goto slow_path;
670
671                 skb_walk_frags(skb, frag) {
672                         /* Correct geometry. */
673                         if (frag->len > mtu ||
674                             ((frag->len & 7) && frag->next) ||
675                             skb_headroom(frag) < hlen)
676                                 goto slow_path_clean;
677
678                         /* Partially cloned skb? */
679                         if (skb_shared(frag))
680                                 goto slow_path_clean;
681
682                         BUG_ON(frag->sk);
683                         if (skb->sk) {
684                                 frag->sk = skb->sk;
685                                 frag->destructor = sock_wfree;
686                         }
687                         skb->truesize -= frag->truesize;
688                 }
689
690                 err = 0;
691                 offset = 0;
692                 frag = skb_shinfo(skb)->frag_list;
693                 skb_frag_list_init(skb);
694                 /* BUILD HEADER */
695
696                 *prevhdr = NEXTHDR_FRAGMENT;
697                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
698                 if (!tmp_hdr) {
699                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
700                                       IPSTATS_MIB_FRAGFAILS);
701                         return -ENOMEM;
702                 }
703
704                 __skb_pull(skb, hlen);
705                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
706                 __skb_push(skb, hlen);
707                 skb_reset_network_header(skb);
708                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
709
710                 ipv6_select_ident(fh, rt);
711                 fh->nexthdr = nexthdr;
712                 fh->reserved = 0;
713                 fh->frag_off = htons(IP6_MF);
714                 frag_id = fh->identification;
715
716                 first_len = skb_pagelen(skb);
717                 skb->data_len = first_len - skb_headlen(skb);
718                 skb->len = first_len;
719                 ipv6_hdr(skb)->payload_len = htons(first_len -
720                                                    sizeof(struct ipv6hdr));
721
722                 dst_hold(&rt->dst);
723
724                 for (;;) {
725                         /* Prepare header of the next frame,
726                          * before previous one went down. */
727                         if (frag) {
728                                 frag->ip_summed = CHECKSUM_NONE;
729                                 skb_reset_transport_header(frag);
730                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731                                 __skb_push(frag, hlen);
732                                 skb_reset_network_header(frag);
733                                 memcpy(skb_network_header(frag), tmp_hdr,
734                                        hlen);
735                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
736                                 fh->nexthdr = nexthdr;
737                                 fh->reserved = 0;
738                                 fh->frag_off = htons(offset);
739                                 if (frag->next != NULL)
740                                         fh->frag_off |= htons(IP6_MF);
741                                 fh->identification = frag_id;
742                                 ipv6_hdr(frag)->payload_len =
743                                                 htons(frag->len -
744                                                       sizeof(struct ipv6hdr));
745                                 ip6_copy_metadata(frag, skb);
746                         }
747
748                         err = output(skb);
749                         if(!err)
750                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
751                                               IPSTATS_MIB_FRAGCREATES);
752
753                         if (err || !frag)
754                                 break;
755
756                         skb = frag;
757                         frag = skb->next;
758                         skb->next = NULL;
759                 }
760
761                 kfree(tmp_hdr);
762
763                 if (err == 0) {
764                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
765                                       IPSTATS_MIB_FRAGOKS);
766                         dst_release(&rt->dst);
767                         return 0;
768                 }
769
770                 while (frag) {
771                         skb = frag->next;
772                         kfree_skb(frag);
773                         frag = skb;
774                 }
775
776                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
777                               IPSTATS_MIB_FRAGFAILS);
778                 dst_release(&rt->dst);
779                 return err;
780
781 slow_path_clean:
782                 skb_walk_frags(skb, frag2) {
783                         if (frag2 == frag)
784                                 break;
785                         frag2->sk = NULL;
786                         frag2->destructor = NULL;
787                         skb->truesize += frag2->truesize;
788                 }
789         }
790
791 slow_path:
792         left = skb->len - hlen;         /* Space per frame */
793         ptr = hlen;                     /* Where to start from */
794
795         /*
796          *      Fragment the datagram.
797          */
798
799         *prevhdr = NEXTHDR_FRAGMENT;
800
801         /*
802          *      Keep copying data until we run out.
803          */
804         while(left > 0) {
805                 len = left;
806                 /* IF: it doesn't fit, use 'mtu' - the data space left */
807                 if (len > mtu)
808                         len = mtu;
809                 /* IF: we are not sending up to and including the packet end
810                    then align the next start on an eight byte boundary */
811                 if (len < left) {
812                         len &= ~7;
813                 }
814                 /*
815                  *      Allocate buffer.
816                  */
817
818                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
819                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
820                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
821                                       IPSTATS_MIB_FRAGFAILS);
822                         err = -ENOMEM;
823                         goto fail;
824                 }
825
826                 /*
827                  *      Set up data on packet
828                  */
829
830                 ip6_copy_metadata(frag, skb);
831                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
832                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
833                 skb_reset_network_header(frag);
834                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
835                 frag->transport_header = (frag->network_header + hlen +
836                                           sizeof(struct frag_hdr));
837
838                 /*
839                  *      Charge the memory for the fragment to any owner
840                  *      it might possess
841                  */
842                 if (skb->sk)
843                         skb_set_owner_w(frag, skb->sk);
844
845                 /*
846                  *      Copy the packet header into the new buffer.
847                  */
848                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
849
850                 /*
851                  *      Build fragment header.
852                  */
853                 fh->nexthdr = nexthdr;
854                 fh->reserved = 0;
855                 if (!frag_id) {
856                         ipv6_select_ident(fh, rt);
857                         frag_id = fh->identification;
858                 } else
859                         fh->identification = frag_id;
860
861                 /*
862                  *      Copy a block of the IP datagram.
863                  */
864                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
865                         BUG();
866                 left -= len;
867
868                 fh->frag_off = htons(offset);
869                 if (left > 0)
870                         fh->frag_off |= htons(IP6_MF);
871                 ipv6_hdr(frag)->payload_len = htons(frag->len -
872                                                     sizeof(struct ipv6hdr));
873
874                 ptr += len;
875                 offset += len;
876
877                 /*
878                  *      Put this fragment into the sending queue.
879                  */
880                 err = output(frag);
881                 if (err)
882                         goto fail;
883
884                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885                               IPSTATS_MIB_FRAGCREATES);
886         }
887         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888                       IPSTATS_MIB_FRAGOKS);
889         kfree_skb(skb);
890         return err;
891
892 fail:
893         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894                       IPSTATS_MIB_FRAGFAILS);
895         kfree_skb(skb);
896         return err;
897 }
898
899 static inline int ip6_rt_check(const struct rt6key *rt_key,
900                                const struct in6_addr *fl_addr,
901                                const struct in6_addr *addr_cache)
902 {
903         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
904                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
905 }
906
907 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
908                                           struct dst_entry *dst,
909                                           const struct flowi6 *fl6)
910 {
911         struct ipv6_pinfo *np = inet6_sk(sk);
912         struct rt6_info *rt;
913
914         if (!dst)
915                 goto out;
916
917         if (dst->ops->family != AF_INET6) {
918                 dst_release(dst);
919                 return NULL;
920         }
921
922         rt = (struct rt6_info *)dst;
923         /* Yes, checking route validity in not connected
924          * case is not very simple. Take into account,
925          * that we do not support routing by source, TOS,
926          * and MSG_DONTROUTE            --ANK (980726)
927          *
928          * 1. ip6_rt_check(): If route was host route,
929          *    check that cached destination is current.
930          *    If it is network route, we still may
931          *    check its validity using saved pointer
932          *    to the last used address: daddr_cache.
933          *    We do not want to save whole address now,
934          *    (because main consumer of this service
935          *    is tcp, which has not this problem),
936          *    so that the last trick works only on connected
937          *    sockets.
938          * 2. oif also should be the same.
939          */
940         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
941 #ifdef CONFIG_IPV6_SUBTREES
942             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
943 #endif
944             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
945                 dst_release(dst);
946                 dst = NULL;
947         }
948
949 out:
950         return dst;
951 }
952
953 static int ip6_dst_lookup_tail(struct sock *sk,
954                                struct dst_entry **dst, struct flowi6 *fl6)
955 {
956         struct net *net = sock_net(sk);
957 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
958         struct neighbour *n;
959 #endif
960         int err;
961
962         if (*dst == NULL)
963                 *dst = ip6_route_output(net, sk, fl6);
964
965         if ((err = (*dst)->error))
966                 goto out_err_release;
967
968         if (ipv6_addr_any(&fl6->saddr)) {
969                 struct rt6_info *rt = (struct rt6_info *) *dst;
970                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
971                                           sk ? inet6_sk(sk)->srcprefs : 0,
972                                           &fl6->saddr);
973                 if (err)
974                         goto out_err_release;
975         }
976
977 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
978         /*
979          * Here if the dst entry we've looked up
980          * has a neighbour entry that is in the INCOMPLETE
981          * state and the src address from the flow is
982          * marked as OPTIMISTIC, we release the found
983          * dst entry and replace it instead with the
984          * dst entry of the nexthop router
985          */
986         rcu_read_lock();
987         n = dst_get_neighbour(*dst);
988         if (n && !(n->nud_state & NUD_VALID)) {
989                 struct inet6_ifaddr *ifp;
990                 struct flowi6 fl_gw6;
991                 int redirect;
992
993                 rcu_read_unlock();
994                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
995                                       (*dst)->dev, 1);
996
997                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
998                 if (ifp)
999                         in6_ifa_put(ifp);
1000
1001                 if (redirect) {
1002                         /*
1003                          * We need to get the dst entry for the
1004                          * default router instead
1005                          */
1006                         dst_release(*dst);
1007                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1008                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1009                         *dst = ip6_route_output(net, sk, &fl_gw6);
1010                         if ((err = (*dst)->error))
1011                                 goto out_err_release;
1012                 }
1013         } else {
1014                 rcu_read_unlock();
1015         }
1016 #endif
1017
1018         return 0;
1019
1020 out_err_release:
1021         if (err == -ENETUNREACH)
1022                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1023         dst_release(*dst);
1024         *dst = NULL;
1025         return err;
1026 }
1027
1028 /**
1029  *      ip6_dst_lookup - perform route lookup on flow
1030  *      @sk: socket which provides route info
1031  *      @dst: pointer to dst_entry * for result
1032  *      @fl6: flow to lookup
1033  *
1034  *      This function performs a route lookup on the given flow.
1035  *
1036  *      It returns zero on success, or a standard errno code on error.
1037  */
1038 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1039 {
1040         *dst = NULL;
1041         return ip6_dst_lookup_tail(sk, dst, fl6);
1042 }
1043 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1044
1045 /**
1046  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1047  *      @sk: socket which provides route info
1048  *      @fl6: flow to lookup
1049  *      @final_dst: final destination address for ipsec lookup
1050  *      @can_sleep: we are in a sleepable context
1051  *
1052  *      This function performs a route lookup on the given flow.
1053  *
1054  *      It returns a valid dst pointer on success, or a pointer encoded
1055  *      error code.
1056  */
1057 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1058                                       const struct in6_addr *final_dst,
1059                                       bool can_sleep)
1060 {
1061         struct dst_entry *dst = NULL;
1062         int err;
1063
1064         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1065         if (err)
1066                 return ERR_PTR(err);
1067         if (final_dst)
1068                 ipv6_addr_copy(&fl6->daddr, final_dst);
1069         if (can_sleep)
1070                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1071
1072         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1073 }
1074 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1075
1076 /**
1077  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1078  *      @sk: socket which provides the dst cache and route info
1079  *      @fl6: flow to lookup
1080  *      @final_dst: final destination address for ipsec lookup
1081  *      @can_sleep: we are in a sleepable context
1082  *
1083  *      This function performs a route lookup on the given flow with the
1084  *      possibility of using the cached route in the socket if it is valid.
1085  *      It will take the socket dst lock when operating on the dst cache.
1086  *      As a result, this function can only be used in process context.
1087  *
1088  *      It returns a valid dst pointer on success, or a pointer encoded
1089  *      error code.
1090  */
1091 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1092                                          const struct in6_addr *final_dst,
1093                                          bool can_sleep)
1094 {
1095         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1096         int err;
1097
1098         dst = ip6_sk_dst_check(sk, dst, fl6);
1099
1100         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1101         if (err)
1102                 return ERR_PTR(err);
1103         if (final_dst)
1104                 ipv6_addr_copy(&fl6->daddr, final_dst);
1105         if (can_sleep)
1106                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1107
1108         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1109 }
1110 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1111
1112 static inline int ip6_ufo_append_data(struct sock *sk,
1113                         int getfrag(void *from, char *to, int offset, int len,
1114                         int odd, struct sk_buff *skb),
1115                         void *from, int length, int hh_len, int fragheaderlen,
1116                         int transhdrlen, int mtu,unsigned int flags,
1117                         struct rt6_info *rt)
1118
1119 {
1120         struct sk_buff *skb;
1121         int err;
1122
1123         /* There is support for UDP large send offload by network
1124          * device, so create one single skb packet containing complete
1125          * udp datagram
1126          */
1127         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1128                 struct frag_hdr fhdr;
1129
1130                 skb = sock_alloc_send_skb(sk,
1131                         hh_len + fragheaderlen + transhdrlen + 20,
1132                         (flags & MSG_DONTWAIT), &err);
1133                 if (skb == NULL)
1134                         return err;
1135
1136                 /* reserve space for Hardware header */
1137                 skb_reserve(skb, hh_len);
1138
1139                 /* create space for UDP/IP header */
1140                 skb_put(skb,fragheaderlen + transhdrlen);
1141
1142                 /* initialize network header pointer */
1143                 skb_reset_network_header(skb);
1144
1145                 /* initialize protocol header pointer */
1146                 skb->transport_header = skb->network_header + fragheaderlen;
1147
1148                 skb->ip_summed = CHECKSUM_PARTIAL;
1149                 skb->csum = 0;
1150
1151                 /* Specify the length of each IPv6 datagram fragment.
1152                  * It has to be a multiple of 8.
1153                  */
1154                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1155                                              sizeof(struct frag_hdr)) & ~7;
1156                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1157                 ipv6_select_ident(&fhdr, rt);
1158                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1159                 __skb_queue_tail(&sk->sk_write_queue, skb);
1160         }
1161
1162         return skb_append_datato_frags(sk, skb, getfrag, from,
1163                                        (length - transhdrlen));
1164 }
1165
1166 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1167                                                gfp_t gfp)
1168 {
1169         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1170 }
1171
1172 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1173                                                 gfp_t gfp)
1174 {
1175         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1176 }
1177
1178 static void ip6_append_data_mtu(unsigned int *mtu,
1179                                 int *maxfraglen,
1180                                 unsigned int fragheaderlen,
1181                                 struct sk_buff *skb,
1182                                 struct rt6_info *rt,
1183                                 bool pmtuprobe)
1184 {
1185         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1186                 if (skb == NULL) {
1187                         /* first fragment, reserve header_len */
1188                         *mtu = *mtu - rt->dst.header_len;
1189
1190                 } else {
1191                         /*
1192                          * this fragment is not first, the headers
1193                          * space is regarded as data space.
1194                          */
1195                         *mtu = min(*mtu, pmtuprobe ?
1196                                    rt->dst.dev->mtu :
1197                                    dst_mtu(rt->dst.path));
1198                 }
1199                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1200                               + fragheaderlen - sizeof(struct frag_hdr);
1201         }
1202 }
1203
1204 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1205         int offset, int len, int odd, struct sk_buff *skb),
1206         void *from, int length, int transhdrlen,
1207         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1208         struct rt6_info *rt, unsigned int flags, int dontfrag)
1209 {
1210         struct inet_sock *inet = inet_sk(sk);
1211         struct ipv6_pinfo *np = inet6_sk(sk);
1212         struct inet_cork *cork;
1213         struct sk_buff *skb, *skb_prev = NULL;
1214         unsigned int maxfraglen, fragheaderlen, mtu;
1215         int exthdrlen;
1216         int dst_exthdrlen;
1217         int hh_len;
1218         int copy;
1219         int err;
1220         int offset = 0;
1221         int csummode = CHECKSUM_NONE;
1222         __u8 tx_flags = 0;
1223
1224         if (flags&MSG_PROBE)
1225                 return 0;
1226         cork = &inet->cork.base;
1227         if (skb_queue_empty(&sk->sk_write_queue)) {
1228                 /*
1229                  * setup for corking
1230                  */
1231                 if (opt) {
1232                         if (WARN_ON(np->cork.opt))
1233                                 return -EINVAL;
1234
1235                         np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1236                         if (unlikely(np->cork.opt == NULL))
1237                                 return -ENOBUFS;
1238
1239                         np->cork.opt->tot_len = opt->tot_len;
1240                         np->cork.opt->opt_flen = opt->opt_flen;
1241                         np->cork.opt->opt_nflen = opt->opt_nflen;
1242
1243                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1244                                                             sk->sk_allocation);
1245                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1246                                 return -ENOBUFS;
1247
1248                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1249                                                             sk->sk_allocation);
1250                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1251                                 return -ENOBUFS;
1252
1253                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1254                                                            sk->sk_allocation);
1255                         if (opt->hopopt && !np->cork.opt->hopopt)
1256                                 return -ENOBUFS;
1257
1258                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1259                                                             sk->sk_allocation);
1260                         if (opt->srcrt && !np->cork.opt->srcrt)
1261                                 return -ENOBUFS;
1262
1263                         /* need source address above miyazawa*/
1264                 }
1265                 dst_hold(&rt->dst);
1266                 cork->dst = &rt->dst;
1267                 inet->cork.fl.u.ip6 = *fl6;
1268                 np->cork.hop_limit = hlimit;
1269                 np->cork.tclass = tclass;
1270                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1271                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1272                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1273                 else
1274                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1275                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1276                 if (np->frag_size < mtu) {
1277                         if (np->frag_size)
1278                                 mtu = np->frag_size;
1279                 }
1280                 cork->fragsize = mtu;
1281                 if (dst_allfrag(rt->dst.path))
1282                         cork->flags |= IPCORK_ALLFRAG;
1283                 cork->length = 0;
1284                 sk->sk_sndmsg_page = NULL;
1285                 sk->sk_sndmsg_off = 0;
1286                 exthdrlen = (opt ? opt->opt_flen : 0);
1287                 length += exthdrlen;
1288                 transhdrlen += exthdrlen;
1289                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1290         } else {
1291                 rt = (struct rt6_info *)cork->dst;
1292                 fl6 = &inet->cork.fl.u.ip6;
1293                 opt = np->cork.opt;
1294                 transhdrlen = 0;
1295                 exthdrlen = 0;
1296                 dst_exthdrlen = 0;
1297                 mtu = cork->fragsize;
1298         }
1299
1300         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1301
1302         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1303                         (opt ? opt->opt_nflen : 0);
1304         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1305
1306         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1307                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1308                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1309                         return -EMSGSIZE;
1310                 }
1311         }
1312
1313         /* For UDP, check if TX timestamp is enabled */
1314         if (sk->sk_type == SOCK_DGRAM) {
1315                 err = sock_tx_timestamp(sk, &tx_flags);
1316                 if (err)
1317                         goto error;
1318         }
1319
1320         /*
1321          * Let's try using as much space as possible.
1322          * Use MTU if total length of the message fits into the MTU.
1323          * Otherwise, we need to reserve fragment header and
1324          * fragment alignment (= 8-15 octects, in total).
1325          *
1326          * Note that we may need to "move" the data from the tail of
1327          * of the buffer to the new fragment when we split
1328          * the message.
1329          *
1330          * FIXME: It may be fragmented into multiple chunks
1331          *        at once if non-fragmentable extension headers
1332          *        are too large.
1333          * --yoshfuji
1334          */
1335
1336         if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
1337                                            sk->sk_protocol == IPPROTO_RAW)) {
1338                 ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1339                 return -EMSGSIZE;
1340         }
1341
1342         skb = skb_peek_tail(&sk->sk_write_queue);
1343         cork->length += length;
1344         if (((length > mtu) ||
1345              (skb && skb_has_frags(skb))) &&
1346             (sk->sk_protocol == IPPROTO_UDP) &&
1347             (rt->dst.dev->features & NETIF_F_UFO)) {
1348                 err = ip6_ufo_append_data(sk, getfrag, from, length,
1349                                           hh_len, fragheaderlen,
1350                                           transhdrlen, mtu, flags, rt);
1351                 if (err)
1352                         goto error;
1353                 return 0;
1354         }
1355
1356         if (!skb)
1357                 goto alloc_new_skb;
1358
1359         while (length > 0) {
1360                 /* Check if the remaining data fits into current packet. */
1361                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1362                 if (copy < length)
1363                         copy = maxfraglen - skb->len;
1364
1365                 if (copy <= 0) {
1366                         char *data;
1367                         unsigned int datalen;
1368                         unsigned int fraglen;
1369                         unsigned int fraggap;
1370                         unsigned int alloclen;
1371 alloc_new_skb:
1372                         /* There's no room in the current skb */
1373                         if (skb)
1374                                 fraggap = skb->len - maxfraglen;
1375                         else
1376                                 fraggap = 0;
1377                         /* update mtu and maxfraglen if necessary */
1378                         if (skb == NULL || skb_prev == NULL)
1379                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1380                                                     fragheaderlen, skb, rt,
1381                                                     np->pmtudisc ==
1382                                                     IPV6_PMTUDISC_PROBE);
1383
1384                         skb_prev = skb;
1385
1386                         /*
1387                          * If remaining data exceeds the mtu,
1388                          * we know we need more fragment(s).
1389                          */
1390                         datalen = length + fraggap;
1391
1392                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1393                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1394                         if ((flags & MSG_MORE) &&
1395                             !(rt->dst.dev->features&NETIF_F_SG))
1396                                 alloclen = mtu;
1397                         else
1398                                 alloclen = datalen + fragheaderlen;
1399
1400                         alloclen += dst_exthdrlen;
1401
1402                         if (datalen != length + fraggap) {
1403                                 /*
1404                                  * this is not the last fragment, the trailer
1405                                  * space is regarded as data space.
1406                                  */
1407                                 datalen += rt->dst.trailer_len;
1408                         }
1409
1410                         alloclen += rt->dst.trailer_len;
1411                         fraglen = datalen + fragheaderlen;
1412
1413                         /*
1414                          * We just reserve space for fragment header.
1415                          * Note: this may be overallocation if the message
1416                          * (without MSG_MORE) fits into the MTU.
1417                          */
1418                         alloclen += sizeof(struct frag_hdr);
1419
1420                         if (transhdrlen) {
1421                                 skb = sock_alloc_send_skb(sk,
1422                                                 alloclen + hh_len,
1423                                                 (flags & MSG_DONTWAIT), &err);
1424                         } else {
1425                                 skb = NULL;
1426                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1427                                     2 * sk->sk_sndbuf)
1428                                         skb = sock_wmalloc(sk,
1429                                                            alloclen + hh_len, 1,
1430                                                            sk->sk_allocation);
1431                                 if (unlikely(skb == NULL))
1432                                         err = -ENOBUFS;
1433                                 else {
1434                                         /* Only the initial fragment
1435                                          * is time stamped.
1436                                          */
1437                                         tx_flags = 0;
1438                                 }
1439                         }
1440                         if (skb == NULL)
1441                                 goto error;
1442                         /*
1443                          *      Fill in the control structures
1444                          */
1445                         skb->ip_summed = csummode;
1446                         skb->csum = 0;
1447                         /* reserve for fragmentation and ipsec header */
1448                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1449                                     dst_exthdrlen);
1450
1451                         if (sk->sk_type == SOCK_DGRAM)
1452                                 skb_shinfo(skb)->tx_flags = tx_flags;
1453
1454                         /*
1455                          *      Find where to start putting bytes
1456                          */
1457                         data = skb_put(skb, fraglen);
1458                         skb_set_network_header(skb, exthdrlen);
1459                         data += fragheaderlen;
1460                         skb->transport_header = (skb->network_header +
1461                                                  fragheaderlen);
1462                         if (fraggap) {
1463                                 skb->csum = skb_copy_and_csum_bits(
1464                                         skb_prev, maxfraglen,
1465                                         data + transhdrlen, fraggap, 0);
1466                                 skb_prev->csum = csum_sub(skb_prev->csum,
1467                                                           skb->csum);
1468                                 data += fraggap;
1469                                 pskb_trim_unique(skb_prev, maxfraglen);
1470                         }
1471                         copy = datalen - transhdrlen - fraggap;
1472
1473                         if (copy < 0) {
1474                                 err = -EINVAL;
1475                                 kfree_skb(skb);
1476                                 goto error;
1477                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1478                                 err = -EFAULT;
1479                                 kfree_skb(skb);
1480                                 goto error;
1481                         }
1482
1483                         offset += copy;
1484                         length -= datalen - fraggap;
1485                         transhdrlen = 0;
1486                         exthdrlen = 0;
1487                         dst_exthdrlen = 0;
1488                         csummode = CHECKSUM_NONE;
1489
1490                         /*
1491                          * Put the packet on the pending queue
1492                          */
1493                         __skb_queue_tail(&sk->sk_write_queue, skb);
1494                         continue;
1495                 }
1496
1497                 if (copy > length)
1498                         copy = length;
1499
1500                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1501                         unsigned int off;
1502
1503                         off = skb->len;
1504                         if (getfrag(from, skb_put(skb, copy),
1505                                                 offset, copy, off, skb) < 0) {
1506                                 __skb_trim(skb, off);
1507                                 err = -EFAULT;
1508                                 goto error;
1509                         }
1510                 } else {
1511                         int i = skb_shinfo(skb)->nr_frags;
1512                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1513                         struct page *page = sk->sk_sndmsg_page;
1514                         int off = sk->sk_sndmsg_off;
1515                         unsigned int left;
1516
1517                         if (page && (left = PAGE_SIZE - off) > 0) {
1518                                 if (copy >= left)
1519                                         copy = left;
1520                                 if (page != skb_frag_page(frag)) {
1521                                         if (i == MAX_SKB_FRAGS) {
1522                                                 err = -EMSGSIZE;
1523                                                 goto error;
1524                                         }
1525                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1526                                         skb_frag_ref(skb, i);
1527                                         frag = &skb_shinfo(skb)->frags[i];
1528                                 }
1529                         } else if(i < MAX_SKB_FRAGS) {
1530                                 if (copy > PAGE_SIZE)
1531                                         copy = PAGE_SIZE;
1532                                 page = alloc_pages(sk->sk_allocation, 0);
1533                                 if (page == NULL) {
1534                                         err = -ENOMEM;
1535                                         goto error;
1536                                 }
1537                                 sk->sk_sndmsg_page = page;
1538                                 sk->sk_sndmsg_off = 0;
1539
1540                                 skb_fill_page_desc(skb, i, page, 0, 0);
1541                                 frag = &skb_shinfo(skb)->frags[i];
1542                         } else {
1543                                 err = -EMSGSIZE;
1544                                 goto error;
1545                         }
1546                         if (getfrag(from,
1547                                     skb_frag_address(frag) + skb_frag_size(frag),
1548                                     offset, copy, skb->len, skb) < 0) {
1549                                 err = -EFAULT;
1550                                 goto error;
1551                         }
1552                         sk->sk_sndmsg_off += copy;
1553                         skb_frag_size_add(frag, copy);
1554                         skb->len += copy;
1555                         skb->data_len += copy;
1556                         skb->truesize += copy;
1557                         atomic_add(copy, &sk->sk_wmem_alloc);
1558                 }
1559                 offset += copy;
1560                 length -= copy;
1561         }
1562         return 0;
1563 error:
1564         cork->length -= length;
1565         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1566         return err;
1567 }
1568
1569 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1570 {
1571         if (np->cork.opt) {
1572                 kfree(np->cork.opt->dst0opt);
1573                 kfree(np->cork.opt->dst1opt);
1574                 kfree(np->cork.opt->hopopt);
1575                 kfree(np->cork.opt->srcrt);
1576                 kfree(np->cork.opt);
1577                 np->cork.opt = NULL;
1578         }
1579
1580         if (inet->cork.base.dst) {
1581                 dst_release(inet->cork.base.dst);
1582                 inet->cork.base.dst = NULL;
1583                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1584         }
1585         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1586 }
1587
1588 int ip6_push_pending_frames(struct sock *sk)
1589 {
1590         struct sk_buff *skb, *tmp_skb;
1591         struct sk_buff **tail_skb;
1592         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1593         struct inet_sock *inet = inet_sk(sk);
1594         struct ipv6_pinfo *np = inet6_sk(sk);
1595         struct net *net = sock_net(sk);
1596         struct ipv6hdr *hdr;
1597         struct ipv6_txoptions *opt = np->cork.opt;
1598         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1599         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1600         unsigned char proto = fl6->flowi6_proto;
1601         int err = 0;
1602
1603         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1604                 goto out;
1605         tail_skb = &(skb_shinfo(skb)->frag_list);
1606
1607         /* move skb->data to ip header from ext header */
1608         if (skb->data < skb_network_header(skb))
1609                 __skb_pull(skb, skb_network_offset(skb));
1610         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1611                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1612                 *tail_skb = tmp_skb;
1613                 tail_skb = &(tmp_skb->next);
1614                 skb->len += tmp_skb->len;
1615                 skb->data_len += tmp_skb->len;
1616                 skb->truesize += tmp_skb->truesize;
1617                 tmp_skb->destructor = NULL;
1618                 tmp_skb->sk = NULL;
1619         }
1620
1621         /* Allow local fragmentation. */
1622         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1623                 skb->local_df = 1;
1624
1625         ipv6_addr_copy(final_dst, &fl6->daddr);
1626         __skb_pull(skb, skb_network_header_len(skb));
1627         if (opt && opt->opt_flen)
1628                 ipv6_push_frag_opts(skb, opt, &proto);
1629         if (opt && opt->opt_nflen)
1630                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1631
1632         skb_push(skb, sizeof(struct ipv6hdr));
1633         skb_reset_network_header(skb);
1634         hdr = ipv6_hdr(skb);
1635
1636         *(__be32*)hdr = fl6->flowlabel |
1637                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1638
1639         hdr->hop_limit = np->cork.hop_limit;
1640         hdr->nexthdr = proto;
1641         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1642         ipv6_addr_copy(&hdr->daddr, final_dst);
1643
1644         skb->priority = sk->sk_priority;
1645         skb->mark = sk->sk_mark;
1646
1647         skb_dst_set(skb, dst_clone(&rt->dst));
1648         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1649         if (proto == IPPROTO_ICMPV6) {
1650                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1651
1652                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1653                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1654         }
1655
1656         err = ip6_local_out(skb);
1657         if (err) {
1658                 if (err > 0)
1659                         err = net_xmit_errno(err);
1660                 if (err)
1661                         goto error;
1662         }
1663
1664 out:
1665         ip6_cork_release(inet, np);
1666         return err;
1667 error:
1668         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1669         goto out;
1670 }
1671
1672 void ip6_flush_pending_frames(struct sock *sk)
1673 {
1674         struct sk_buff *skb;
1675
1676         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1677                 if (skb_dst(skb))
1678                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1679                                       IPSTATS_MIB_OUTDISCARDS);
1680                 kfree_skb(skb);
1681         }
1682
1683         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1684 }