ipv6: don't call addrconf_dst_alloc again when enable lo
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
/* Forward declaration: ip6_fragment() is defined further down in this file
 * but is already referenced by ip6_finish_output() above its definition.
 */
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *skb));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 *      Send a locally generated packet: run __ip6_local_out() and, when it
 *      reports 1, continue with dst_output().  Any other result from the
 *      hook stage is returned to the caller as is.
 */
int ip6_local_out(struct sk_buff *skb)
{
        int rc = __ip6_local_out(skb);

        if (unlikely(rc != 1))
                return rc;

        return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101         struct dst_entry *dst = skb_dst(skb);
102         struct net_device *dev = dst->dev;
103         struct neighbour *neigh;
104
105         skb->protocol = htons(ETH_P_IPV6);
106         skb->dev = dev;
107
108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112                     ((mroute6_socket(dev_net(dev), skb) &&
113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115                                          &ipv6_hdr(skb)->saddr))) {
116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118                         /* Do not check for IFF_ALLMULTI; multicast routing
119                            is not supported in any case.
120                          */
121                         if (newskb)
122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123                                         newskb, NULL, newskb->dev,
124                                         ip6_dev_loopback_xmit);
125
126                         if (ipv6_hdr(skb)->hop_limit == 0) {
127                                 IP6_INC_STATS(dev_net(dev), idev,
128                                               IPSTATS_MIB_OUTDISCARDS);
129                                 kfree_skb(skb);
130                                 return 0;
131                         }
132                 }
133
134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135                                 skb->len);
136         }
137
138         rcu_read_lock();
139         neigh = dst_get_neighbour(dst);
140         if (neigh) {
141                 int res = neigh_output(neigh, skb);
142
143                 rcu_read_unlock();
144                 return res;
145         }
146         rcu_read_unlock();
147         IP6_INC_STATS_BH(dev_net(dst->dev),
148                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149         kfree_skb(skb);
150         return -EINVAL;
151 }
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183              struct ipv6_txoptions *opt, int tclass)
184 {
185         struct net *net = sock_net(sk);
186         struct ipv6_pinfo *np = inet6_sk(sk);
187         struct in6_addr *first_hop = &fl6->daddr;
188         struct dst_entry *dst = skb_dst(skb);
189         struct ipv6hdr *hdr;
190         u8  proto = fl6->flowi6_proto;
191         int seg_len = skb->len;
192         int hlimit = -1;
193         u32 mtu;
194
195         if (opt) {
196                 unsigned int head_room;
197
198                 /* First: exthdrs may take lots of space (~8K for now)
199                    MAX_HEADER is not enough.
200                  */
201                 head_room = opt->opt_nflen + opt->opt_flen;
202                 seg_len += head_room;
203                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204
205                 if (skb_headroom(skb) < head_room) {
206                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207                         if (skb2 == NULL) {
208                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209                                               IPSTATS_MIB_OUTDISCARDS);
210                                 kfree_skb(skb);
211                                 return -ENOBUFS;
212                         }
213                         kfree_skb(skb);
214                         skb = skb2;
215                         skb_set_owner_w(skb, sk);
216                 }
217                 if (opt->opt_flen)
218                         ipv6_push_frag_opts(skb, opt, &proto);
219                 if (opt->opt_nflen)
220                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221         }
222
223         skb_push(skb, sizeof(struct ipv6hdr));
224         skb_reset_network_header(skb);
225         hdr = ipv6_hdr(skb);
226
227         /*
228          *      Fill in the IPv6 header
229          */
230         if (np)
231                 hlimit = np->hop_limit;
232         if (hlimit < 0)
233                 hlimit = ip6_dst_hoplimit(dst);
234
235         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236
237         hdr->payload_len = htons(seg_len);
238         hdr->nexthdr = proto;
239         hdr->hop_limit = hlimit;
240
241         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
242         ipv6_addr_copy(&hdr->daddr, first_hop);
243
244         skb->priority = sk->sk_priority;
245         skb->mark = sk->sk_mark;
246
247         mtu = dst_mtu(dst);
248         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250                               IPSTATS_MIB_OUT, skb->len);
251                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252                                dst->dev, dst_output);
253         }
254
255         if (net_ratelimit())
256                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
257         skb->dev = dst->dev;
258         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
260         kfree_skb(skb);
261         return -EMSGSIZE;
262 }
263
264 EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are send through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         ipv6_addr_copy(&hdr->saddr, saddr);
294         ipv6_addr_copy(&hdr->daddr, daddr);
295
296         return 0;
297 }
298
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
300 {
301         struct ip6_ra_chain *ra;
302         struct sock *last = NULL;
303
304         read_lock(&ip6_ra_lock);
305         for (ra = ip6_ra_chain; ra; ra = ra->next) {
306                 struct sock *sk = ra->sk;
307                 if (sk && ra->sel == sel &&
308                     (!sk->sk_bound_dev_if ||
309                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
310                         if (last) {
311                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
312                                 if (skb2)
313                                         rawv6_rcv(last, skb2);
314                         }
315                         last = sk;
316                 }
317         }
318
319         if (last) {
320                 rawv6_rcv(last, skb);
321                 read_unlock(&ip6_ra_lock);
322                 return 1;
323         }
324         read_unlock(&ip6_ra_lock);
325         return 0;
326 }
327
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
329 {
330         struct ipv6hdr *hdr = ipv6_hdr(skb);
331         u8 nexthdr = hdr->nexthdr;
332         int offset;
333
334         if (ipv6_ext_hdr(nexthdr)) {
335                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
336                 if (offset < 0)
337                         return 0;
338         } else
339                 offset = sizeof(struct ipv6hdr);
340
341         if (nexthdr == IPPROTO_ICMPV6) {
342                 struct icmp6hdr *icmp6;
343
344                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345                                          offset + 1 - skb->data)))
346                         return 0;
347
348                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350                 switch (icmp6->icmp6_type) {
351                 case NDISC_ROUTER_SOLICITATION:
352                 case NDISC_ROUTER_ADVERTISEMENT:
353                 case NDISC_NEIGHBOUR_SOLICITATION:
354                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355                 case NDISC_REDIRECT:
356                         /* For reaction involving unicast neighbor discovery
357                          * message destined to the proxied address, pass it to
358                          * input function.
359                          */
360                         return 1;
361                 default:
362                         break;
363                 }
364         }
365
366         /*
367          * The proxying router can't forward traffic sent to a link-local
368          * address, so signal the sender and discard the packet. This
369          * behavior is clarified by the MIPv6 specification.
370          */
371         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372                 dst_link_failure(skb);
373                 return -1;
374         }
375
376         return 0;
377 }
378
/* Continuation of the FORWARD hook in ip6_forward(): send via the route. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
        int rc = dst_output(skb);

        return rc;
}
383
384 int ip6_forward(struct sk_buff *skb)
385 {
386         struct dst_entry *dst = skb_dst(skb);
387         struct ipv6hdr *hdr = ipv6_hdr(skb);
388         struct inet6_skb_parm *opt = IP6CB(skb);
389         struct net *net = dev_net(dst->dev);
390         struct neighbour *n;
391         u32 mtu;
392
393         if (net->ipv6.devconf_all->forwarding == 0)
394                 goto error;
395
396         if (skb_warn_if_lro(skb))
397                 goto drop;
398
399         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401                 goto drop;
402         }
403
404         if (skb->pkt_type != PACKET_HOST)
405                 goto drop;
406
407         skb_forward_csum(skb);
408
409         /*
410          *      We DO NOT make any processing on
411          *      RA packets, pushing them to user level AS IS
412          *      without ane WARRANTY that application will be able
413          *      to interpret them. The reason is that we
414          *      cannot make anything clever here.
415          *
416          *      We are not end-node, so that if packet contains
417          *      AH/ESP, we cannot make anything.
418          *      Defragmentation also would be mistake, RA packets
419          *      cannot be fragmented, because there is no warranty
420          *      that different fragments will go along one path. --ANK
421          */
422         if (opt->ra) {
423                 u8 *ptr = skb_network_header(skb) + opt->ra;
424                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
425                         return 0;
426         }
427
428         /*
429          *      check and decrement ttl
430          */
431         if (hdr->hop_limit <= 1) {
432                 /* Force OUTPUT device used as source address */
433                 skb->dev = dst->dev;
434                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435                 IP6_INC_STATS_BH(net,
436                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
437
438                 kfree_skb(skb);
439                 return -ETIMEDOUT;
440         }
441
442         /* XXX: idev->cnf.proxy_ndp? */
443         if (net->ipv6.devconf_all->proxy_ndp &&
444             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445                 int proxied = ip6_forward_proxy_check(skb);
446                 if (proxied > 0)
447                         return ip6_input(skb);
448                 else if (proxied < 0) {
449                         IP6_INC_STATS(net, ip6_dst_idev(dst),
450                                       IPSTATS_MIB_INDISCARDS);
451                         goto drop;
452                 }
453         }
454
455         if (!xfrm6_route_forward(skb)) {
456                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
457                 goto drop;
458         }
459         dst = skb_dst(skb);
460
461         /* IPv6 specs say nothing about it, but it is clear that we cannot
462            send redirects to source routed frames.
463            We don't send redirects to frames decapsulated from IPsec.
464          */
465         n = dst_get_neighbour(dst);
466         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
467                 struct in6_addr *target = NULL;
468                 struct rt6_info *rt;
469
470                 /*
471                  *      incoming and outgoing devices are the same
472                  *      send a redirect.
473                  */
474
475                 rt = (struct rt6_info *) dst;
476                 if ((rt->rt6i_flags & RTF_GATEWAY))
477                         target = (struct in6_addr*)&n->primary_key;
478                 else
479                         target = &hdr->daddr;
480
481                 if (!rt->rt6i_peer)
482                         rt6_bind_peer(rt, 1);
483
484                 /* Limit redirects both by destination (here)
485                    and by source (inside ndisc_send_redirect)
486                  */
487                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
488                         ndisc_send_redirect(skb, n, target);
489         } else {
490                 int addrtype = ipv6_addr_type(&hdr->saddr);
491
492                 /* This check is security critical. */
493                 if (addrtype == IPV6_ADDR_ANY ||
494                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
495                         goto error;
496                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
497                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
498                                     ICMPV6_NOT_NEIGHBOUR, 0);
499                         goto error;
500                 }
501         }
502
503         mtu = dst_mtu(dst);
504         if (mtu < IPV6_MIN_MTU)
505                 mtu = IPV6_MIN_MTU;
506
507         if (skb->len > mtu && !skb_is_gso(skb)) {
508                 /* Again, force OUTPUT device used as source address */
509                 skb->dev = dst->dev;
510                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
511                 IP6_INC_STATS_BH(net,
512                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513                 IP6_INC_STATS_BH(net,
514                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
515                 kfree_skb(skb);
516                 return -EMSGSIZE;
517         }
518
519         if (skb_cow(skb, dst->dev->hard_header_len)) {
520                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
521                 goto drop;
522         }
523
524         hdr = ipv6_hdr(skb);
525
526         /* Mangling hops number delayed to point after skb COW */
527
528         hdr->hop_limit--;
529
530         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
532                        ip6_forward_finish);
533
534 error:
535         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
536 drop:
537         kfree_skb(skb);
538         return -EINVAL;
539 }
540
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543         to->pkt_type = from->pkt_type;
544         to->priority = from->priority;
545         to->protocol = from->protocol;
546         skb_dst_drop(to);
547         skb_dst_set(to, dst_clone(skb_dst(from)));
548         to->dev = from->dev;
549         to->mark = from->mark;
550
551 #ifdef CONFIG_NET_SCHED
552         to->tc_index = from->tc_index;
553 #endif
554         nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557         to->nf_trace = from->nf_trace;
558 #endif
559         skb_copy_secmark(to, from);
560 }
561
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
563 {
564         u16 offset = sizeof(struct ipv6hdr);
565         struct ipv6_opt_hdr *exthdr =
566                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567         unsigned int packet_len = skb->tail - skb->network_header;
568         int found_rhdr = 0;
569         *nexthdr = &ipv6_hdr(skb)->nexthdr;
570
571         while (offset + 1 <= packet_len) {
572
573                 switch (**nexthdr) {
574
575                 case NEXTHDR_HOP:
576                         break;
577                 case NEXTHDR_ROUTING:
578                         found_rhdr = 1;
579                         break;
580                 case NEXTHDR_DEST:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
583                                 break;
584 #endif
585                         if (found_rhdr)
586                                 return offset;
587                         break;
588                 default :
589                         return offset;
590                 }
591
592                 offset += ipv6_optlen(exthdr);
593                 *nexthdr = &exthdr->nexthdr;
594                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
595                                                  offset);
596         }
597
598         return offset;
599 }
600
601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
602 {
603         static atomic_t ipv6_fragmentation_id;
604         int old, new;
605
606         if (rt && !(rt->dst.flags & DST_NOPEER)) {
607                 struct inet_peer *peer;
608
609                 if (!rt->rt6i_peer)
610                         rt6_bind_peer(rt, 1);
611                 peer = rt->rt6i_peer;
612                 if (peer) {
613                         fhdr->identification = htonl(inet_getid(peer, 0));
614                         return;
615                 }
616         }
617         do {
618                 old = atomic_read(&ipv6_fragmentation_id);
619                 new = old + 1;
620                 if (!new)
621                         new = 1;
622         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
623         fhdr->identification = htonl(new);
624 }
625
626 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
627 {
628         struct sk_buff *frag;
629         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
630         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
631         struct ipv6hdr *tmp_hdr;
632         struct frag_hdr *fh;
633         unsigned int mtu, hlen, left, len;
634         __be32 frag_id = 0;
635         int ptr, offset = 0, err=0;
636         u8 *prevhdr, nexthdr = 0;
637         struct net *net = dev_net(skb_dst(skb)->dev);
638
639         hlen = ip6_find_1stfragopt(skb, &prevhdr);
640         nexthdr = *prevhdr;
641
642         mtu = ip6_skb_dst_mtu(skb);
643
644         /* We must not fragment if the socket is set to force MTU discovery
645          * or if the skb it not generated by a local socket.
646          */
647         if (!skb->local_df && skb->len > mtu) {
648                 skb->dev = skb_dst(skb)->dev;
649                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
650                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
651                               IPSTATS_MIB_FRAGFAILS);
652                 kfree_skb(skb);
653                 return -EMSGSIZE;
654         }
655
656         if (np && np->frag_size < mtu) {
657                 if (np->frag_size)
658                         mtu = np->frag_size;
659         }
660         mtu -= hlen + sizeof(struct frag_hdr);
661
662         if (skb_has_frag_list(skb)) {
663                 int first_len = skb_pagelen(skb);
664                 struct sk_buff *frag2;
665
666                 if (first_len - hlen > mtu ||
667                     ((first_len - hlen) & 7) ||
668                     skb_cloned(skb))
669                         goto slow_path;
670
671                 skb_walk_frags(skb, frag) {
672                         /* Correct geometry. */
673                         if (frag->len > mtu ||
674                             ((frag->len & 7) && frag->next) ||
675                             skb_headroom(frag) < hlen)
676                                 goto slow_path_clean;
677
678                         /* Partially cloned skb? */
679                         if (skb_shared(frag))
680                                 goto slow_path_clean;
681
682                         BUG_ON(frag->sk);
683                         if (skb->sk) {
684                                 frag->sk = skb->sk;
685                                 frag->destructor = sock_wfree;
686                         }
687                         skb->truesize -= frag->truesize;
688                 }
689
690                 err = 0;
691                 offset = 0;
692                 frag = skb_shinfo(skb)->frag_list;
693                 skb_frag_list_init(skb);
694                 /* BUILD HEADER */
695
696                 *prevhdr = NEXTHDR_FRAGMENT;
697                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
698                 if (!tmp_hdr) {
699                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
700                                       IPSTATS_MIB_FRAGFAILS);
701                         return -ENOMEM;
702                 }
703
704                 __skb_pull(skb, hlen);
705                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
706                 __skb_push(skb, hlen);
707                 skb_reset_network_header(skb);
708                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
709
710                 ipv6_select_ident(fh, rt);
711                 fh->nexthdr = nexthdr;
712                 fh->reserved = 0;
713                 fh->frag_off = htons(IP6_MF);
714                 frag_id = fh->identification;
715
716                 first_len = skb_pagelen(skb);
717                 skb->data_len = first_len - skb_headlen(skb);
718                 skb->len = first_len;
719                 ipv6_hdr(skb)->payload_len = htons(first_len -
720                                                    sizeof(struct ipv6hdr));
721
722                 dst_hold(&rt->dst);
723
724                 for (;;) {
725                         /* Prepare header of the next frame,
726                          * before previous one went down. */
727                         if (frag) {
728                                 frag->ip_summed = CHECKSUM_NONE;
729                                 skb_reset_transport_header(frag);
730                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731                                 __skb_push(frag, hlen);
732                                 skb_reset_network_header(frag);
733                                 memcpy(skb_network_header(frag), tmp_hdr,
734                                        hlen);
735                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
736                                 fh->nexthdr = nexthdr;
737                                 fh->reserved = 0;
738                                 fh->frag_off = htons(offset);
739                                 if (frag->next != NULL)
740                                         fh->frag_off |= htons(IP6_MF);
741                                 fh->identification = frag_id;
742                                 ipv6_hdr(frag)->payload_len =
743                                                 htons(frag->len -
744                                                       sizeof(struct ipv6hdr));
745                                 ip6_copy_metadata(frag, skb);
746                         }
747
748                         err = output(skb);
749                         if(!err)
750                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
751                                               IPSTATS_MIB_FRAGCREATES);
752
753                         if (err || !frag)
754                                 break;
755
756                         skb = frag;
757                         frag = skb->next;
758                         skb->next = NULL;
759                 }
760
761                 kfree(tmp_hdr);
762
763                 if (err == 0) {
764                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
765                                       IPSTATS_MIB_FRAGOKS);
766                         dst_release(&rt->dst);
767                         return 0;
768                 }
769
770                 while (frag) {
771                         skb = frag->next;
772                         kfree_skb(frag);
773                         frag = skb;
774                 }
775
776                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
777                               IPSTATS_MIB_FRAGFAILS);
778                 dst_release(&rt->dst);
779                 return err;
780
781 slow_path_clean:
782                 skb_walk_frags(skb, frag2) {
783                         if (frag2 == frag)
784                                 break;
785                         frag2->sk = NULL;
786                         frag2->destructor = NULL;
787                         skb->truesize += frag2->truesize;
788                 }
789         }
790
791 slow_path:
792         left = skb->len - hlen;         /* Space per frame */
793         ptr = hlen;                     /* Where to start from */
794
795         /*
796          *      Fragment the datagram.
797          */
798
799         *prevhdr = NEXTHDR_FRAGMENT;
800
801         /*
802          *      Keep copying data until we run out.
803          */
804         while(left > 0) {
805                 len = left;
806                 /* IF: it doesn't fit, use 'mtu' - the data space left */
807                 if (len > mtu)
808                         len = mtu;
809                 /* IF: we are not sending up to and including the packet end
810                    then align the next start on an eight byte boundary */
811                 if (len < left) {
812                         len &= ~7;
813                 }
814                 /*
815                  *      Allocate buffer.
816                  */
817
818                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
819                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
820                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
821                                       IPSTATS_MIB_FRAGFAILS);
822                         err = -ENOMEM;
823                         goto fail;
824                 }
825
826                 /*
827                  *      Set up data on packet
828                  */
829
830                 ip6_copy_metadata(frag, skb);
831                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
832                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
833                 skb_reset_network_header(frag);
834                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
835                 frag->transport_header = (frag->network_header + hlen +
836                                           sizeof(struct frag_hdr));
837
838                 /*
839                  *      Charge the memory for the fragment to any owner
840                  *      it might possess
841                  */
842                 if (skb->sk)
843                         skb_set_owner_w(frag, skb->sk);
844
845                 /*
846                  *      Copy the packet header into the new buffer.
847                  */
848                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
849
850                 /*
851                  *      Build fragment header.
852                  */
853                 fh->nexthdr = nexthdr;
854                 fh->reserved = 0;
855                 if (!frag_id) {
856                         ipv6_select_ident(fh, rt);
857                         frag_id = fh->identification;
858                 } else
859                         fh->identification = frag_id;
860
861                 /*
862                  *      Copy a block of the IP datagram.
863                  */
864                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
865                         BUG();
866                 left -= len;
867
868                 fh->frag_off = htons(offset);
869                 if (left > 0)
870                         fh->frag_off |= htons(IP6_MF);
871                 ipv6_hdr(frag)->payload_len = htons(frag->len -
872                                                     sizeof(struct ipv6hdr));
873
874                 ptr += len;
875                 offset += len;
876
877                 /*
878                  *      Put this fragment into the sending queue.
879                  */
880                 err = output(frag);
881                 if (err)
882                         goto fail;
883
884                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885                               IPSTATS_MIB_FRAGCREATES);
886         }
887         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888                       IPSTATS_MIB_FRAGOKS);
889         kfree_skb(skb);
890         return err;
891
892 fail:
893         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894                       IPSTATS_MIB_FRAGFAILS);
895         kfree_skb(skb);
896         return err;
897 }
898
899 static inline int ip6_rt_check(const struct rt6key *rt_key,
900                                const struct in6_addr *fl_addr,
901                                const struct in6_addr *addr_cache)
902 {
903         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
904                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
905 }
906
907 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
908                                           struct dst_entry *dst,
909                                           const struct flowi6 *fl6)
910 {
911         struct ipv6_pinfo *np = inet6_sk(sk);
912         struct rt6_info *rt = (struct rt6_info *)dst;
913
914         if (!dst)
915                 goto out;
916
917         /* Yes, checking route validity in not connected
918          * case is not very simple. Take into account,
919          * that we do not support routing by source, TOS,
920          * and MSG_DONTROUTE            --ANK (980726)
921          *
922          * 1. ip6_rt_check(): If route was host route,
923          *    check that cached destination is current.
924          *    If it is network route, we still may
925          *    check its validity using saved pointer
926          *    to the last used address: daddr_cache.
927          *    We do not want to save whole address now,
928          *    (because main consumer of this service
929          *    is tcp, which has not this problem),
930          *    so that the last trick works only on connected
931          *    sockets.
932          * 2. oif also should be the same.
933          */
934         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
935 #ifdef CONFIG_IPV6_SUBTREES
936             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
937 #endif
938             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
939                 dst_release(dst);
940                 dst = NULL;
941         }
942
943 out:
944         return dst;
945 }
946
947 static int ip6_dst_lookup_tail(struct sock *sk,
948                                struct dst_entry **dst, struct flowi6 *fl6)
949 {
950         struct net *net = sock_net(sk);
951 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
952         struct neighbour *n;
953 #endif
954         int err;
955
956         if (*dst == NULL)
957                 *dst = ip6_route_output(net, sk, fl6);
958
959         if ((err = (*dst)->error))
960                 goto out_err_release;
961
962         if (ipv6_addr_any(&fl6->saddr)) {
963                 struct rt6_info *rt = (struct rt6_info *) *dst;
964                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
965                                           sk ? inet6_sk(sk)->srcprefs : 0,
966                                           &fl6->saddr);
967                 if (err)
968                         goto out_err_release;
969         }
970
971 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
972         /*
973          * Here if the dst entry we've looked up
974          * has a neighbour entry that is in the INCOMPLETE
975          * state and the src address from the flow is
976          * marked as OPTIMISTIC, we release the found
977          * dst entry and replace it instead with the
978          * dst entry of the nexthop router
979          */
980         rcu_read_lock();
981         n = dst_get_neighbour(*dst);
982         if (n && !(n->nud_state & NUD_VALID)) {
983                 struct inet6_ifaddr *ifp;
984                 struct flowi6 fl_gw6;
985                 int redirect;
986
987                 rcu_read_unlock();
988                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
989                                       (*dst)->dev, 1);
990
991                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
992                 if (ifp)
993                         in6_ifa_put(ifp);
994
995                 if (redirect) {
996                         /*
997                          * We need to get the dst entry for the
998                          * default router instead
999                          */
1000                         dst_release(*dst);
1001                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1002                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1003                         *dst = ip6_route_output(net, sk, &fl_gw6);
1004                         if ((err = (*dst)->error))
1005                                 goto out_err_release;
1006                 }
1007         } else {
1008                 rcu_read_unlock();
1009         }
1010 #endif
1011
1012         return 0;
1013
1014 out_err_release:
1015         if (err == -ENETUNREACH)
1016                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1017         dst_release(*dst);
1018         *dst = NULL;
1019         return err;
1020 }
1021
/**
 *	ip6_dst_lookup - route lookup for a flow, without any cached route
 *	@sk: socket which provides route info
 *	@dst: where the resulting dst_entry pointer is stored
 *	@fl6: flow to look up
 *
 *	Clears *@dst and hands the request to ip6_dst_lookup_tail(), so a
 *	new route lookup is always performed.
 *
 *	Returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	int err;

	/* Never reuse whatever the caller left in the slot. */
	*dst = NULL;
	err = ip6_dst_lookup_tail(sk, dst, fl6);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1038
1039 /**
1040  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1041  *      @sk: socket which provides route info
1042  *      @fl6: flow to lookup
1043  *      @final_dst: final destination address for ipsec lookup
1044  *      @can_sleep: we are in a sleepable context
1045  *
1046  *      This function performs a route lookup on the given flow.
1047  *
1048  *      It returns a valid dst pointer on success, or a pointer encoded
1049  *      error code.
1050  */
1051 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1052                                       const struct in6_addr *final_dst,
1053                                       bool can_sleep)
1054 {
1055         struct dst_entry *dst = NULL;
1056         int err;
1057
1058         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1059         if (err)
1060                 return ERR_PTR(err);
1061         if (final_dst)
1062                 ipv6_addr_copy(&fl6->daddr, final_dst);
1063         if (can_sleep)
1064                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1065
1066         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1067 }
1068 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1069
1070 /**
1071  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1072  *      @sk: socket which provides the dst cache and route info
1073  *      @fl6: flow to lookup
1074  *      @final_dst: final destination address for ipsec lookup
1075  *      @can_sleep: we are in a sleepable context
1076  *
1077  *      This function performs a route lookup on the given flow with the
1078  *      possibility of using the cached route in the socket if it is valid.
1079  *      It will take the socket dst lock when operating on the dst cache.
1080  *      As a result, this function can only be used in process context.
1081  *
1082  *      It returns a valid dst pointer on success, or a pointer encoded
1083  *      error code.
1084  */
1085 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1086                                          const struct in6_addr *final_dst,
1087                                          bool can_sleep)
1088 {
1089         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1090         int err;
1091
1092         dst = ip6_sk_dst_check(sk, dst, fl6);
1093
1094         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1095         if (err)
1096                 return ERR_PTR(err);
1097         if (final_dst)
1098                 ipv6_addr_copy(&fl6->daddr, final_dst);
1099         if (can_sleep)
1100                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1101
1102         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1103 }
1104 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1105
1106 static inline int ip6_ufo_append_data(struct sock *sk,
1107                         int getfrag(void *from, char *to, int offset, int len,
1108                         int odd, struct sk_buff *skb),
1109                         void *from, int length, int hh_len, int fragheaderlen,
1110                         int transhdrlen, int mtu,unsigned int flags,
1111                         struct rt6_info *rt)
1112
1113 {
1114         struct sk_buff *skb;
1115         int err;
1116
1117         /* There is support for UDP large send offload by network
1118          * device, so create one single skb packet containing complete
1119          * udp datagram
1120          */
1121         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1122                 skb = sock_alloc_send_skb(sk,
1123                         hh_len + fragheaderlen + transhdrlen + 20,
1124                         (flags & MSG_DONTWAIT), &err);
1125                 if (skb == NULL)
1126                         return err;
1127
1128                 /* reserve space for Hardware header */
1129                 skb_reserve(skb, hh_len);
1130
1131                 /* create space for UDP/IP header */
1132                 skb_put(skb,fragheaderlen + transhdrlen);
1133
1134                 /* initialize network header pointer */
1135                 skb_reset_network_header(skb);
1136
1137                 /* initialize protocol header pointer */
1138                 skb->transport_header = skb->network_header + fragheaderlen;
1139
1140                 skb->ip_summed = CHECKSUM_PARTIAL;
1141                 skb->csum = 0;
1142         }
1143
1144         err = skb_append_datato_frags(sk,skb, getfrag, from,
1145                                       (length - transhdrlen));
1146         if (!err) {
1147                 struct frag_hdr fhdr;
1148
1149                 /* Specify the length of each IPv6 datagram fragment.
1150                  * It has to be a multiple of 8.
1151                  */
1152                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1153                                              sizeof(struct frag_hdr)) & ~7;
1154                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1155                 ipv6_select_ident(&fhdr, rt);
1156                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1157                 __skb_queue_tail(&sk->sk_write_queue, skb);
1158
1159                 return 0;
1160         }
1161         /* There is not enough support do UPD LSO,
1162          * so follow normal path
1163          */
1164         kfree_skb(skb);
1165
1166         return err;
1167 }
1168
1169 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1170                                                gfp_t gfp)
1171 {
1172         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1173 }
1174
1175 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1176                                                 gfp_t gfp)
1177 {
1178         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1179 }
1180
1181 static void ip6_append_data_mtu(int *mtu,
1182                                 int *maxfraglen,
1183                                 unsigned int fragheaderlen,
1184                                 struct sk_buff *skb,
1185                                 struct rt6_info *rt)
1186 {
1187         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1188                 if (skb == NULL) {
1189                         /* first fragment, reserve header_len */
1190                         *mtu = *mtu - rt->dst.header_len;
1191
1192                 } else {
1193                         /*
1194                          * this fragment is not first, the headers
1195                          * space is regarded as data space.
1196                          */
1197                         *mtu = dst_mtu(rt->dst.path);
1198                 }
1199                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1200                               + fragheaderlen - sizeof(struct frag_hdr);
1201         }
1202 }
1203
/*
 * ip6_append_data - append payload to the socket's pending (corked) queue
 *
 * Copies @length bytes, obtained through the @getfrag callback, onto skbs
 * queued on sk->sk_write_queue, starting new skbs as the current one
 * fills up.
 *
 * When the write queue is empty this call starts a new cork: @opt (if
 * any) is duplicated into np->cork.opt, a reference on @rt is taken and
 * stored in cork->dst, *@fl6, @hlimit and @tclass are saved, and the MTU
 * is computed and stored in cork->fragsize.  When the queue is not empty
 * the route, flow, options and MTU are taken from the cork instead of the
 * arguments, and @transhdrlen is treated as 0.
 *
 * Returns 0 on success (and immediately when MSG_PROBE is set in @flags),
 * otherwise a negative errno.  Failures that reach the "error" label
 * subtract the not yet consumed length from cork->length and increment
 * IPSTATS_MIB_OUTDISCARDS; the direct "return" statements do neither.
 */
1204 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1205         int offset, int len, int odd, struct sk_buff *skb),
1206         void *from, int length, int transhdrlen,
1207         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1208         struct rt6_info *rt, unsigned int flags, int dontfrag)
1209 {
1210         struct inet_sock *inet = inet_sk(sk);
1211         struct ipv6_pinfo *np = inet6_sk(sk);
1212         struct inet_cork *cork;
1213         struct sk_buff *skb, *skb_prev = NULL;
1214         unsigned int maxfraglen, fragheaderlen;
1215         int exthdrlen;
1216         int dst_exthdrlen;
1217         int hh_len;
1218         int mtu;
1219         int copy;
1220         int err;
1221         int offset = 0;
1222         int csummode = CHECKSUM_NONE;
1223         __u8 tx_flags = 0;
1224
1225         if (flags&MSG_PROBE)
1226                 return 0;
1227         cork = &inet->cork.base;
1228         if (skb_queue_empty(&sk->sk_write_queue)) {
1229                 /*
1230                  * setup for corking
1231                  */
1232                 if (opt) {
1233                         if (WARN_ON(np->cork.opt))
1234                                 return -EINVAL;
1235
1236                         np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1237                         if (unlikely(np->cork.opt == NULL))
1238                                 return -ENOBUFS;
1239
1240                         np->cork.opt->tot_len = opt->tot_len;
1241                         np->cork.opt->opt_flen = opt->opt_flen;
1242                         np->cork.opt->opt_nflen = opt->opt_nflen;
1243
                             /*
                              * NOTE(review): the -ENOBUFS returns below leave
                              * np->cork.opt partially populated; presumably the
                              * caller's pending-frames cleanup releases it --
                              * confirm against the callers.
                              */
1244                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1245                                                             sk->sk_allocation);
1246                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1247                                 return -ENOBUFS;
1248
1249                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1250                                                             sk->sk_allocation);
1251                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1252                                 return -ENOBUFS;
1253
1254                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1255                                                            sk->sk_allocation);
1256                         if (opt->hopopt && !np->cork.opt->hopopt)
1257                                 return -ENOBUFS;
1258
1259                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1260                                                             sk->sk_allocation);
1261                         if (opt->srcrt && !np->cork.opt->srcrt)
1262                                 return -ENOBUFS;
1263
1264                         /* need source address above miyazawa*/
1265                 }
1266                 dst_hold(&rt->dst);
1267                 cork->dst = &rt->dst;
1268                 inet->cork.fl.u.ip6 = *fl6;
1269                 np->cork.hop_limit = hlimit;
1270                 np->cork.tclass = tclass;
                     /*
                      * IPV6_PMTUDISC_PROBE uses the device MTU; otherwise the
                      * dst MTU is used (of the route itself for XFRM tunnel
                      * routes, of rt->dst.path for all others).
                      */
1271                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1272                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1273                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1274                 else
1275                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1276                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
                     /* A non-zero np->frag_size below the MTU caps it. */
1277                 if (np->frag_size < mtu) {
1278                         if (np->frag_size)
1279                                 mtu = np->frag_size;
1280                 }
1281                 cork->fragsize = mtu;
1282                 if (dst_allfrag(rt->dst.path))
1283                         cork->flags |= IPCORK_ALLFRAG;
1284                 cork->length = 0;
1285                 sk->sk_sndmsg_page = NULL;
1286                 sk->sk_sndmsg_off = 0;
1287                 exthdrlen = (opt ? opt->opt_flen : 0);
1288                 length += exthdrlen;
1289                 transhdrlen += exthdrlen;
1290                 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1291         } else {
                     /* Already corked: the saved state overrides the arguments. */
1292                 rt = (struct rt6_info *)cork->dst;
1293                 fl6 = &inet->cork.fl.u.ip6;
1294                 opt = np->cork.opt;
1295                 transhdrlen = 0;
1296                 exthdrlen = 0;
1297                 dst_exthdrlen = 0;
1298                 mtu = cork->fragsize;
1299         }
1300
1301         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1302
1303         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1304                         (opt ? opt->opt_nflen : 0);
1305         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1306
             /* Reject data that would exceed the largest IPv6 payload length. */
1307         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1308                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1309                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1310                         return -EMSGSIZE;
1311                 }
1312         }
1313
1314         /* For UDP, check if TX timestamp is enabled */
             /*
              * NOTE(review): a failure here jumps to "error", which subtracts
              * length from cork->length although it is only added further
              * below -- confirm whether sock_tx_timestamp() can fail.
              */
1315         if (sk->sk_type == SOCK_DGRAM) {
1316                 err = sock_tx_timestamp(sk, &tx_flags);
1317                 if (err)
1318                         goto error;
1319         }
1320
1321         /*
1322          * Let's try using as much space as possible.
1323          * Use MTU if total length of the message fits into the MTU.
1324          * Otherwise, we need to reserve fragment header and
1325          * fragment alignment (= 8-15 octets, in total).
1326          *
1327          * Note that we may need to "move" the data from the tail of
1328          * of the buffer to the new fragment when we split
1329          * the message.
1330          *
1331          * FIXME: It may be fragmented into multiple chunks
1332          *        at once if non-fragmentable extension headers
1333          *        are too large.
1334          * --yoshfuji
1335          */
1336
1337         cork->length += length;
1338         if (length > mtu) {
1339                 int proto = sk->sk_protocol;
1340                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1341                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1342                         return -EMSGSIZE;
1343                 }
1344
                     /* UDP over a device advertising NETIF_F_UFO: build one big skb. */
1345                 if (proto == IPPROTO_UDP &&
1346                     (rt->dst.dev->features & NETIF_F_UFO)) {
1347
1348                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1349                                                   hh_len, fragheaderlen,
1350                                                   transhdrlen, mtu, flags, rt);
1351                         if (err)
1352                                 goto error;
1353                         return 0;
1354                 }
1355         }
1356
             /* Empty queue: jump straight into the allocation branch of the loop. */
1357         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1358                 goto alloc_new_skb;
1359
1360         while (length > 0) {
1361                 /* Check if the remaining data fits into current packet. */
1362                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1363                 if (copy < length)
1364                         copy = maxfraglen - skb->len;
1365
1366                 if (copy <= 0) {
1367                         char *data;
1368                         unsigned int datalen;
1369                         unsigned int fraglen;
1370                         unsigned int fraggap;
1371                         unsigned int alloclen;
1372 alloc_new_skb:
1373                         /* There's no room in the current skb */
                             /*
                              * fraggap: bytes the current skb holds beyond
                              * maxfraglen; they are copied into the new skb and
                              * trimmed off the old one further down.
                              */
1374                         if (skb)
1375                                 fraggap = skb->len - maxfraglen;
1376                         else
1377                                 fraggap = 0;
1378                         /* update mtu and maxfraglen if necessary */
1379                         if (skb == NULL || skb_prev == NULL)
1380                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1381                                                     fragheaderlen, skb, rt);
1382
1383                         skb_prev = skb;
1384
1385                         /*
1386                          * If remaining data exceeds the mtu,
1387                          * we know we need more fragment(s).
1388                          */
1389                         datalen = length + fraggap;
1390
1391                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1392                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                             /*
                              * More data is announced (MSG_MORE) but the device
                              * lacks NETIF_F_SG, so later data must fit in the
                              * linear area: allocate a full MTU.
                              */
1393                         if ((flags & MSG_MORE) &&
1394                             !(rt->dst.dev->features&NETIF_F_SG))
1395                                 alloclen = mtu;
1396                         else
1397                                 alloclen = datalen + fragheaderlen;
1398
1399                         alloclen += dst_exthdrlen;
1400
1401                         if (datalen != length + fraggap) {
1402                                 /*
1403                                  * this is not the last fragment, the trailer
1404                                  * space is regarded as data space.
1405                                  */
1406                                 datalen += rt->dst.trailer_len;
1407                         }
1408
1409                         alloclen += rt->dst.trailer_len;
1410                         fraglen = datalen + fragheaderlen;
1411
1412                         /*
1413                          * We just reserve space for fragment header.
1414                          * Note: this may be overallocation if the message
1415                          * (without MSG_MORE) fits into the MTU.
1416                          */
1417                         alloclen += sizeof(struct frag_hdr);
1418
                             /*
                              * transhdrlen is non-zero only for the first skb of a
                              * cork; later skbs are allocated only while
                              * sk_wmem_alloc does not exceed twice sk_sndbuf.
                              */
1419                         if (transhdrlen) {
1420                                 skb = sock_alloc_send_skb(sk,
1421                                                 alloclen + hh_len,
1422                                                 (flags & MSG_DONTWAIT), &err);
1423                         } else {
1424                                 skb = NULL;
1425                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1426                                     2 * sk->sk_sndbuf)
1427                                         skb = sock_wmalloc(sk,
1428                                                            alloclen + hh_len, 1,
1429                                                            sk->sk_allocation);
1430                                 if (unlikely(skb == NULL))
1431                                         err = -ENOBUFS;
1432                                 else {
1433                                         /* Only the initial fragment
1434                                          * is time stamped.
1435                                          */
1436                                         tx_flags = 0;
1437                                 }
1438                         }
1439                         if (skb == NULL)
1440                                 goto error;
1441                         /*
1442                          *      Fill in the control structures
1443                          */
1444                         skb->ip_summed = csummode;
1445                         skb->csum = 0;
1446                         /* reserve for fragmentation and ipsec header */
1447                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1448                                     dst_exthdrlen);
1449
1450                         if (sk->sk_type == SOCK_DGRAM)
1451                                 skb_shinfo(skb)->tx_flags = tx_flags;
1452
1453                         /*
1454                          *      Find where to start putting bytes
1455                          */
1456                         data = skb_put(skb, fraglen);
1457                         skb_set_network_header(skb, exthdrlen);
1458                         data += fragheaderlen;
1459                         skb->transport_header = (skb->network_header +
1460                                                  fragheaderlen);
                             /*
                              * Move the overhang of the previous skb into this one,
                              * keeping both checksums consistent, then cut the
                              * previous skb back to maxfraglen.
                              */
1461                         if (fraggap) {
1462                                 skb->csum = skb_copy_and_csum_bits(
1463                                         skb_prev, maxfraglen,
1464                                         data + transhdrlen, fraggap, 0);
1465                                 skb_prev->csum = csum_sub(skb_prev->csum,
1466                                                           skb->csum);
1467                                 data += fraggap;
1468                                 pskb_trim_unique(skb_prev, maxfraglen);
1469                         }
1470                         copy = datalen - transhdrlen - fraggap;
1471
1472                         if (copy < 0) {
1473                                 err = -EINVAL;
1474                                 kfree_skb(skb);
1475                                 goto error;
1476                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1477                                 err = -EFAULT;
1478                                 kfree_skb(skb);
1479                                 goto error;
1480                         }
1481
1482                         offset += copy;
1483                         length -= datalen - fraggap;
                             /* Header lengths apply to the first skb only. */
1484                         transhdrlen = 0;
1485                         exthdrlen = 0;
1486                         dst_exthdrlen = 0;
1487                         csummode = CHECKSUM_NONE;
1488
1489                         /*
1490                          * Put the packet on the pending queue
1491                          */
1492                         __skb_queue_tail(&sk->sk_write_queue, skb);
1493                         continue;
1494                 }
1495
1496                 if (copy > length)
1497                         copy = length;
1498
                     /* Without scatter-gather the data goes into the linear area. */
1499                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1500                         unsigned int off;
1501
1502                         off = skb->len;
1503                         if (getfrag(from, skb_put(skb, copy),
1504                                                 offset, copy, off, skb) < 0) {
1505                                 __skb_trim(skb, off);
1506                                 err = -EFAULT;
1507                                 goto error;
1508                         }
1509                 } else {
                             /*
                              * Scatter-gather: append to the page remembered in
                              * sk->sk_sndmsg_page while it has room, else start a
                              * new page fragment.
                              * NOTE(review): with nr_frags == 0, frag points at
                              * frags[-1] and is compared against page below when
                              * sk_sndmsg_page is set -- confirm this is benign.
                              */
1510                         int i = skb_shinfo(skb)->nr_frags;
1511                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1512                         struct page *page = sk->sk_sndmsg_page;
1513                         int off = sk->sk_sndmsg_off;
1514                         unsigned int left;
1515
1516                         if (page && (left = PAGE_SIZE - off) > 0) {
1517                                 if (copy >= left)
1518                                         copy = left;
1519                                 if (page != skb_frag_page(frag)) {
1520                                         if (i == MAX_SKB_FRAGS) {
1521                                                 err = -EMSGSIZE;
1522                                                 goto error;
1523                                         }
1524                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1525                                         skb_frag_ref(skb, i);
1526                                         frag = &skb_shinfo(skb)->frags[i];
1527                                 }
1528                         } else if(i < MAX_SKB_FRAGS) {
1529                                 if (copy > PAGE_SIZE)
1530                                         copy = PAGE_SIZE;
1531                                 page = alloc_pages(sk->sk_allocation, 0);
1532                                 if (page == NULL) {
1533                                         err = -ENOMEM;
1534                                         goto error;
1535                                 }
1536                                 sk->sk_sndmsg_page = page;
1537                                 sk->sk_sndmsg_off = 0;
1538
1539                                 skb_fill_page_desc(skb, i, page, 0, 0);
1540                                 frag = &skb_shinfo(skb)->frags[i];
1541                         } else {
1542                                 err = -EMSGSIZE;
1543                                 goto error;
1544                         }
1545                         if (getfrag(from,
1546                                     skb_frag_address(frag) + skb_frag_size(frag),
1547                                     offset, copy, skb->len, skb) < 0) {
1548                                 err = -EFAULT;
1549                                 goto error;
1550                         }
                             /* Account the bytes just added to the page fragment. */
1551                         sk->sk_sndmsg_off += copy;
1552                         skb_frag_size_add(frag, copy);
1553                         skb->len += copy;
1554                         skb->data_len += copy;
1555                         skb->truesize += copy;
1556                         atomic_add(copy, &sk->sk_wmem_alloc);
1557                 }
1558                 offset += copy;
1559                 length -= copy;
1560         }
1561         return 0;
1562 error:
             /* Take back the part of the length that was not queued. */
1563         cork->length -= length;
1564         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1565         return err;
1566 }
1567
1568 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1569 {
1570         if (np->cork.opt) {
1571                 kfree(np->cork.opt->dst0opt);
1572                 kfree(np->cork.opt->dst1opt);
1573                 kfree(np->cork.opt->hopopt);
1574                 kfree(np->cork.opt->srcrt);
1575                 kfree(np->cork.opt);
1576                 np->cork.opt = NULL;
1577         }
1578
1579         if (inet->cork.base.dst) {
1580                 dst_release(inet->cork.base.dst);
1581                 inet->cork.base.dst = NULL;
1582                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1583         }
1584         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1585 }
1586
1587 int ip6_push_pending_frames(struct sock *sk)
1588 {
1589         struct sk_buff *skb, *tmp_skb;
1590         struct sk_buff **tail_skb;
1591         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1592         struct inet_sock *inet = inet_sk(sk);
1593         struct ipv6_pinfo *np = inet6_sk(sk);
1594         struct net *net = sock_net(sk);
1595         struct ipv6hdr *hdr;
1596         struct ipv6_txoptions *opt = np->cork.opt;
1597         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1598         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1599         unsigned char proto = fl6->flowi6_proto;
1600         int err = 0;
1601
1602         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1603                 goto out;
1604         tail_skb = &(skb_shinfo(skb)->frag_list);
1605
1606         /* move skb->data to ip header from ext header */
1607         if (skb->data < skb_network_header(skb))
1608                 __skb_pull(skb, skb_network_offset(skb));
1609         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1610                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1611                 *tail_skb = tmp_skb;
1612                 tail_skb = &(tmp_skb->next);
1613                 skb->len += tmp_skb->len;
1614                 skb->data_len += tmp_skb->len;
1615                 skb->truesize += tmp_skb->truesize;
1616                 tmp_skb->destructor = NULL;
1617                 tmp_skb->sk = NULL;
1618         }
1619
1620         /* Allow local fragmentation. */
1621         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1622                 skb->local_df = 1;
1623
1624         ipv6_addr_copy(final_dst, &fl6->daddr);
1625         __skb_pull(skb, skb_network_header_len(skb));
1626         if (opt && opt->opt_flen)
1627                 ipv6_push_frag_opts(skb, opt, &proto);
1628         if (opt && opt->opt_nflen)
1629                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1630
1631         skb_push(skb, sizeof(struct ipv6hdr));
1632         skb_reset_network_header(skb);
1633         hdr = ipv6_hdr(skb);
1634
1635         *(__be32*)hdr = fl6->flowlabel |
1636                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1637
1638         hdr->hop_limit = np->cork.hop_limit;
1639         hdr->nexthdr = proto;
1640         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1641         ipv6_addr_copy(&hdr->daddr, final_dst);
1642
1643         skb->priority = sk->sk_priority;
1644         skb->mark = sk->sk_mark;
1645
1646         skb_dst_set(skb, dst_clone(&rt->dst));
1647         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1648         if (proto == IPPROTO_ICMPV6) {
1649                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1650
1651                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1652                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1653         }
1654
1655         err = ip6_local_out(skb);
1656         if (err) {
1657                 if (err > 0)
1658                         err = net_xmit_errno(err);
1659                 if (err)
1660                         goto error;
1661         }
1662
1663 out:
1664         ip6_cork_release(inet, np);
1665         return err;
1666 error:
1667         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1668         goto out;
1669 }
1670
1671 void ip6_flush_pending_frames(struct sock *sk)
1672 {
1673         struct sk_buff *skb;
1674
1675         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1676                 if (skb_dst(skb))
1677                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1678                                       IPSTATS_MIB_OUTDISCARDS);
1679                 kfree_skb(skb);
1680         }
1681
1682         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1683 }