Linux 3.2.102
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
int ip6_local_out(struct sk_buff *skb)
{
	int err = __ip6_local_out(skb);

	/* A verdict of 1 from the LOCAL_OUT hook means "continue", so
	 * carry on to the route's output function. */
	if (likely(err == 1))
		return dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
/* Final IPv6 output step: loop multicast copies back to local listeners
 * when required, then resolve the neighbour under RCU and hand the
 * packet to the device layer.  Consumes the skb on every path.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back locally when the socket requested
		 * multicast loopback and either an mroute socket may consume
		 * it (and it was not already forwarded) or the device has a
		 * local member of the destination group. */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* hop_limit 0 means the sender only wanted the
			 * loopback copy; do not put it on the wire. */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* The neighbour pointer is RCU protected; transmit while holding
	 * the read-side lock. */
	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	/* No neighbour entry for this route: count as "no route" and drop. */
	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
/* Output entry point for routed packets: drop (and count) when IPv6 is
 * administratively disabled on the egress device, otherwise run the
 * POST_ROUTING netfilter hook and continue in ip6_finish_output().
 */
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	/* Skip the hook if the packet was already rerouted through it
	 * (IP6SKB_REROUTED). */
	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
/* Transmit a fully built upper-layer packet (used by TCP, SCTP, DCCP):
 * push any extension headers and the IPv6 header, then pass the packet
 * to the LOCAL_OUT netfilter hook.  Consumes the skb on error.
 *
 * Returns the hook verdict on success, -ENOBUFS when headroom
 * reallocation fails, or -EMSGSIZE when the packet exceeds the path
 * MTU and cannot be sent as-is.
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			/* Charge the replacement buffer to the socket. */
			skb_set_owner_w(skb, sk);
		}
		/* Push the fragmentable options first, then the
		 * unfragmentable part, which may rewrite the next hop
		 * (routing header). */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	/* Fall back to the route's hop limit if the socket set none. */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* Version 6, traffic class and flow label in one 32-bit word. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big: report PKT_TOOBIG to ourselves so the socket learns the
	 * path MTU, then drop. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are sent through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         ipv6_addr_copy(&hdr->saddr, saddr);
294         ipv6_addr_copy(&hdr->daddr, daddr);
295
296         return 0;
297 }
298
/* Deliver a packet carrying a Router Alert option to every raw socket
 * registered for the given alert value (sel).  Returns 1 when the
 * packet was consumed by at least one socket, 0 otherwise (the caller
 * keeps ownership of the skb in that case).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Match the alert value and any device binding. */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Every matching socket except the last gets a
			 * clone; the last receives the original skb. */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
327
/* Decide how to handle a packet whose destination is a proxied
 * neighbour (proxy NDP).  Returns 1 when it is a neighbour-discovery
 * ICMPv6 message that must go to local input, -1 when it must be
 * dropped (link-local destination; link failure already signalled),
 * and 0 to continue forwarding.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	/* Locate the transport header, skipping extension headers. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type octet is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
378
/* NF_INET_FORWARD okfn: hand the forwarded packet to dst_output(). */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
383
/* Forward a received packet out another interface: hop-limit handling,
 * Router Alert delivery, proxy-NDP checks, redirect generation and MTU
 * enforcement, finishing at the NF_INET_FORWARD hook.  Consumes the
 * skb on every error path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* LRO-merged skbs must not be forwarded. */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Only forward packets that were addressed to us at layer 2. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm may have replaced the route; reload it. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	/* Never advertise an MTU below the IPv6 minimum. */
	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Header must be writable before the hop limit is decremented. */
	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
540
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543         to->pkt_type = from->pkt_type;
544         to->priority = from->priority;
545         to->protocol = from->protocol;
546         skb_dst_drop(to);
547         skb_dst_set(to, dst_clone(skb_dst(from)));
548         to->dev = from->dev;
549         to->mark = from->mark;
550
551 #ifdef CONFIG_NET_SCHED
552         to->tc_index = from->tc_index;
553 #endif
554         nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557         to->nf_trace = from->nf_trace;
558 #endif
559         skb_copy_secmark(to, from);
560 }
561
/* Locate where a Fragment header must be inserted: just past the
 * per-fragment extension headers (hop-by-hop, routing, and any
 * destination-options header that precedes a routing header).  On
 * success *nexthdr points at the nexthdr byte that will be rewritten
 * and the byte offset is returned; a malformed or oversized header
 * chain yields -EINVAL.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	unsigned int offset = sizeof(struct ipv6hdr);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset <= packet_len) {
		struct ipv6_opt_hdr *exthdr;

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			/* A Home Address option keeps this dest-opts header
			 * in the unfragmentable part. */
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default :
			return offset;
		}

		/* Bounds-check before dereferencing the next ext header. */
		if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
			return -EINVAL;

		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
		offset += ipv6_optlen(exthdr);
		/* Cap the walk at the maximum legal payload length. */
		if (offset > IPV6_MAXPLEN)
			return -EINVAL;
		*nexthdr = &exthdr->nexthdr;
	}

	return -EINVAL;
}
604
/* Pick the 32-bit fragment identification for this flow by hashing the
 * route's source and destination prefixes into the shared identifier
 * pool (ip_idents_reserve) and storing it in the fragment header.
 */
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static u32 ip6_idents_hashrnd __read_mostly;
	static bool hashrnd_initialized = false;
	u32 hash, id;

	/* NOTE(review): this lazy seeding is not synchronized; two CPUs
	 * racing here could each call get_random_bytes() once.  Appears
	 * tolerated (worst case is an extra reseed) — confirm intent. */
	if (unlikely(!hashrnd_initialized)) {
		hashrnd_initialized = true;
		get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
	}
	hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
	hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);

	id = ip_idents_reserve(hash, 1);
	fhdr->identification = htonl(id);
}
621
622 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
623 {
624         struct sk_buff *frag;
625         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
626         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
627         struct ipv6hdr *tmp_hdr;
628         struct frag_hdr *fh;
629         unsigned int mtu, hlen, left, len;
630         int hroom, troom;
631         __be32 frag_id = 0;
632         int ptr, offset = 0, err=0;
633         u8 *prevhdr, nexthdr = 0;
634         struct net *net = dev_net(skb_dst(skb)->dev);
635
636         err = ip6_find_1stfragopt(skb, &prevhdr);
637         if (err < 0)
638                 goto fail;
639         hlen = err;
640         nexthdr = *prevhdr;
641
642         mtu = ip6_skb_dst_mtu(skb);
643
644         /* We must not fragment if the socket is set to force MTU discovery
645          * or if the skb it not generated by a local socket.
646          */
647         if (!skb->local_df && skb->len > mtu) {
648                 skb->dev = skb_dst(skb)->dev;
649                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
650                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
651                               IPSTATS_MIB_FRAGFAILS);
652                 kfree_skb(skb);
653                 return -EMSGSIZE;
654         }
655
656         if (np && np->frag_size < mtu) {
657                 if (np->frag_size)
658                         mtu = np->frag_size;
659         }
660         mtu -= hlen + sizeof(struct frag_hdr);
661
662         if (skb_has_frag_list(skb)) {
663                 int first_len = skb_pagelen(skb);
664                 struct sk_buff *frag2;
665
666                 if (first_len - hlen > mtu ||
667                     ((first_len - hlen) & 7) ||
668                     skb_cloned(skb))
669                         goto slow_path;
670
671                 skb_walk_frags(skb, frag) {
672                         /* Correct geometry. */
673                         if (frag->len > mtu ||
674                             ((frag->len & 7) && frag->next) ||
675                             skb_headroom(frag) < hlen)
676                                 goto slow_path_clean;
677
678                         /* Partially cloned skb? */
679                         if (skb_shared(frag))
680                                 goto slow_path_clean;
681
682                         BUG_ON(frag->sk);
683                         if (skb->sk) {
684                                 frag->sk = skb->sk;
685                                 frag->destructor = sock_wfree;
686                         }
687                         skb->truesize -= frag->truesize;
688                 }
689
690                 err = 0;
691                 offset = 0;
692                 frag = skb_shinfo(skb)->frag_list;
693                 skb_frag_list_init(skb);
694                 /* BUILD HEADER */
695
696                 *prevhdr = NEXTHDR_FRAGMENT;
697                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
698                 if (!tmp_hdr) {
699                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
700                                       IPSTATS_MIB_FRAGFAILS);
701                         return -ENOMEM;
702                 }
703
704                 __skb_pull(skb, hlen);
705                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
706                 __skb_push(skb, hlen);
707                 skb_reset_network_header(skb);
708                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
709
710                 ipv6_select_ident(fh, rt);
711                 fh->nexthdr = nexthdr;
712                 fh->reserved = 0;
713                 fh->frag_off = htons(IP6_MF);
714                 frag_id = fh->identification;
715
716                 first_len = skb_pagelen(skb);
717                 skb->data_len = first_len - skb_headlen(skb);
718                 skb->len = first_len;
719                 ipv6_hdr(skb)->payload_len = htons(first_len -
720                                                    sizeof(struct ipv6hdr));
721
722                 dst_hold(&rt->dst);
723
724                 for (;;) {
725                         /* Prepare header of the next frame,
726                          * before previous one went down. */
727                         if (frag) {
728                                 frag->ip_summed = CHECKSUM_NONE;
729                                 skb_reset_transport_header(frag);
730                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731                                 __skb_push(frag, hlen);
732                                 skb_reset_network_header(frag);
733                                 memcpy(skb_network_header(frag), tmp_hdr,
734                                        hlen);
735                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
736                                 fh->nexthdr = nexthdr;
737                                 fh->reserved = 0;
738                                 fh->frag_off = htons(offset);
739                                 if (frag->next != NULL)
740                                         fh->frag_off |= htons(IP6_MF);
741                                 fh->identification = frag_id;
742                                 ipv6_hdr(frag)->payload_len =
743                                                 htons(frag->len -
744                                                       sizeof(struct ipv6hdr));
745                                 ip6_copy_metadata(frag, skb);
746                         }
747
748                         err = output(skb);
749                         if(!err)
750                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
751                                               IPSTATS_MIB_FRAGCREATES);
752
753                         if (err || !frag)
754                                 break;
755
756                         skb = frag;
757                         frag = skb->next;
758                         skb->next = NULL;
759                 }
760
761                 kfree(tmp_hdr);
762
763                 if (err == 0) {
764                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
765                                       IPSTATS_MIB_FRAGOKS);
766                         dst_release(&rt->dst);
767                         return 0;
768                 }
769
770                 while (frag) {
771                         skb = frag->next;
772                         kfree_skb(frag);
773                         frag = skb;
774                 }
775
776                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
777                               IPSTATS_MIB_FRAGFAILS);
778                 dst_release(&rt->dst);
779                 return err;
780
781 slow_path_clean:
782                 skb_walk_frags(skb, frag2) {
783                         if (frag2 == frag)
784                                 break;
785                         frag2->sk = NULL;
786                         frag2->destructor = NULL;
787                         skb->truesize += frag2->truesize;
788                 }
789         }
790
791 slow_path:
792         left = skb->len - hlen;         /* Space per frame */
793         ptr = hlen;                     /* Where to start from */
794
795         /*
796          *      Fragment the datagram.
797          */
798
799         *prevhdr = NEXTHDR_FRAGMENT;
800         hroom = LL_RESERVED_SPACE(rt->dst.dev);
801         troom = rt->dst.dev->needed_tailroom;
802
803         /*
804          *      Keep copying data until we run out.
805          */
806         while(left > 0) {
807                 len = left;
808                 /* IF: it doesn't fit, use 'mtu' - the data space left */
809                 if (len > mtu)
810                         len = mtu;
811                 /* IF: we are not sending up to and including the packet end
812                    then align the next start on an eight byte boundary */
813                 if (len < left) {
814                         len &= ~7;
815                 }
816                 /*
817                  *      Allocate buffer.
818                  */
819
820                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
821                                       hroom + troom, GFP_ATOMIC)) == NULL) {
822                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
823                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
824                                       IPSTATS_MIB_FRAGFAILS);
825                         err = -ENOMEM;
826                         goto fail;
827                 }
828
829                 /*
830                  *      Set up data on packet
831                  */
832
833                 ip6_copy_metadata(frag, skb);
834                 skb_reserve(frag, hroom);
835                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
836                 skb_reset_network_header(frag);
837                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
838                 frag->transport_header = (frag->network_header + hlen +
839                                           sizeof(struct frag_hdr));
840
841                 /*
842                  *      Charge the memory for the fragment to any owner
843                  *      it might possess
844                  */
845                 if (skb->sk)
846                         skb_set_owner_w(frag, skb->sk);
847
848                 /*
849                  *      Copy the packet header into the new buffer.
850                  */
851                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
852
853                 /*
854                  *      Build fragment header.
855                  */
856                 fh->nexthdr = nexthdr;
857                 fh->reserved = 0;
858                 if (!frag_id) {
859                         ipv6_select_ident(fh, rt);
860                         frag_id = fh->identification;
861                 } else
862                         fh->identification = frag_id;
863
864                 /*
865                  *      Copy a block of the IP datagram.
866                  */
867                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
868                         BUG();
869                 left -= len;
870
871                 fh->frag_off = htons(offset);
872                 if (left > 0)
873                         fh->frag_off |= htons(IP6_MF);
874                 ipv6_hdr(frag)->payload_len = htons(frag->len -
875                                                     sizeof(struct ipv6hdr));
876
877                 ptr += len;
878                 offset += len;
879
880                 /*
881                  *      Put this fragment into the sending queue.
882                  */
883                 err = output(frag);
884                 if (err)
885                         goto fail;
886
887                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888                               IPSTATS_MIB_FRAGCREATES);
889         }
890         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
891                       IPSTATS_MIB_FRAGOKS);
892         kfree_skb(skb);
893         return err;
894
895 fail:
896         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
897                       IPSTATS_MIB_FRAGFAILS);
898         kfree_skb(skb);
899         return err;
900 }
901
902 static inline int ip6_rt_check(const struct rt6key *rt_key,
903                                const struct in6_addr *fl_addr,
904                                const struct in6_addr *addr_cache)
905 {
906         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
907                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
908 }
909
910 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
911                                           struct dst_entry *dst,
912                                           const struct flowi6 *fl6)
913 {
914         struct ipv6_pinfo *np = inet6_sk(sk);
915         struct rt6_info *rt;
916
917         if (!dst)
918                 goto out;
919
920         if (dst->ops->family != AF_INET6) {
921                 dst_release(dst);
922                 return NULL;
923         }
924
925         rt = (struct rt6_info *)dst;
926         /* Yes, checking route validity in not connected
927          * case is not very simple. Take into account,
928          * that we do not support routing by source, TOS,
929          * and MSG_DONTROUTE            --ANK (980726)
930          *
931          * 1. ip6_rt_check(): If route was host route,
932          *    check that cached destination is current.
933          *    If it is network route, we still may
934          *    check its validity using saved pointer
935          *    to the last used address: daddr_cache.
936          *    We do not want to save whole address now,
937          *    (because main consumer of this service
938          *    is tcp, which has not this problem),
939          *    so that the last trick works only on connected
940          *    sockets.
941          * 2. oif also should be the same.
942          */
943         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
944 #ifdef CONFIG_IPV6_SUBTREES
945             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
946 #endif
947             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
948                 dst_release(dst);
949                 dst = NULL;
950         }
951
952 out:
953         return dst;
954 }
955
/* Core of the dst lookup helpers: resolve *dst for @fl6 (doing a route
 * lookup if the caller supplied none), pick a source address when the
 * flow has none, and optionally re-route via the default router when
 * the chosen source address is an optimistic-DAD address.
 *
 * On failure releases *dst, sets it to NULL and returns a negative
 * errno; on success *dst holds a referenced route and 0 is returned.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	/* Only look up a route if the caller did not hand us one. */
	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	/* ip6_route_output() never returns NULL; errors are carried in
	 * the dst itself.
	 */
	if ((err = (*dst)->error))
		goto out_err_release;

	/* Unspecified source: derive one from the route and the
	 * socket's source-address preferences.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		/* NOTE(review): n is only valid under RCU; it is not
		 * dereferenced again after this unlock, only the flags
		 * of the separately-refcounted ifp are.
		 */
		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Zero daddr so the lookup matches the default
			 * route (the nexthop router).
			 */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1030
1031 /**
1032  *      ip6_dst_lookup - perform route lookup on flow
1033  *      @sk: socket which provides route info
1034  *      @dst: pointer to dst_entry * for result
1035  *      @fl6: flow to lookup
1036  *
1037  *      This function performs a route lookup on the given flow.
1038  *
1039  *      It returns zero on success, or a standard errno code on error.
1040  */
1041 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1042 {
1043         *dst = NULL;
1044         return ip6_dst_lookup_tail(sk, dst, fl6);
1045 }
1046 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1047
1048 /**
1049  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1050  *      @sk: socket which provides route info
1051  *      @fl6: flow to lookup
1052  *      @final_dst: final destination address for ipsec lookup
1053  *      @can_sleep: we are in a sleepable context
1054  *
1055  *      This function performs a route lookup on the given flow.
1056  *
1057  *      It returns a valid dst pointer on success, or a pointer encoded
1058  *      error code.
1059  */
1060 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1061                                       const struct in6_addr *final_dst,
1062                                       bool can_sleep)
1063 {
1064         struct dst_entry *dst = NULL;
1065         int err;
1066
1067         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1068         if (err)
1069                 return ERR_PTR(err);
1070         if (final_dst)
1071                 ipv6_addr_copy(&fl6->daddr, final_dst);
1072         if (can_sleep)
1073                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1074
1075         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1076 }
1077 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1078
1079 /**
1080  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1081  *      @sk: socket which provides the dst cache and route info
1082  *      @fl6: flow to lookup
1083  *      @final_dst: final destination address for ipsec lookup
1084  *      @can_sleep: we are in a sleepable context
1085  *
1086  *      This function performs a route lookup on the given flow with the
1087  *      possibility of using the cached route in the socket if it is valid.
1088  *      It will take the socket dst lock when operating on the dst cache.
1089  *      As a result, this function can only be used in process context.
1090  *
1091  *      It returns a valid dst pointer on success, or a pointer encoded
1092  *      error code.
1093  */
1094 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1095                                          const struct in6_addr *final_dst,
1096                                          bool can_sleep)
1097 {
1098         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1099         int err;
1100
1101         dst = ip6_sk_dst_check(sk, dst, fl6);
1102
1103         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1104         if (err)
1105                 return ERR_PTR(err);
1106         if (final_dst)
1107                 ipv6_addr_copy(&fl6->daddr, final_dst);
1108         if (can_sleep)
1109                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1110
1111         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1112 }
1113 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1114
/* Append data for a UDP socket whose device supports UFO: build (or
 * extend) one single large skb carrying the whole datagram and let the
 * device segment it.  Called from ip6_append_data(); returns 0 or a
 * negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		struct frag_hdr fhdr;

		/* NOTE(review): the extra 20 bytes of headroom look like
		 * slack for trailing headers — TODO confirm its origin.
		 */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* Device fills in the transport checksum. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		/* One fragment ID is shared by every segment the device
		 * will emit for this datagram.
		 */
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	/* Copy the payload (transport header bytes were already counted
	 * when the skb was created) into page fragments of the skb.
	 */
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1167
1168 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1169                                                gfp_t gfp)
1170 {
1171         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1172 }
1173
1174 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1175                                                 gfp_t gfp)
1176 {
1177         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1178 }
1179
1180 static void ip6_append_data_mtu(unsigned int *mtu,
1181                                 int *maxfraglen,
1182                                 unsigned int fragheaderlen,
1183                                 struct sk_buff *skb,
1184                                 struct rt6_info *rt,
1185                                 unsigned int orig_mtu)
1186 {
1187         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1188                 if (skb == NULL) {
1189                         /* first fragment, reserve header_len */
1190                         *mtu = orig_mtu - rt->dst.header_len;
1191
1192                 } else {
1193                         /*
1194                          * this fragment is not first, the headers
1195                          * space is regarded as data space.
1196                          */
1197                         *mtu = orig_mtu;
1198                 }
1199                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1200                               + fragheaderlen - sizeof(struct frag_hdr);
1201         }
1202 }
1203
/* Append data from @getfrag/@from to the socket's pending (corked)
 * write queue, building MTU-sized skbs with room for the fragment
 * header.  The first call on an empty queue sets up the cork state
 * (options, route, mtu); later calls reuse it.  Returns 0 or a
 * negative errno; on error the bytes of this call are removed from
 * the cork byte count and OUTDISCARDS is bumped.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* Deep-copy each extension header so the cork
			 * outlives the caller's ipv6_txoptions.  Partial
			 * copies are freed later by ip6_cork_release().
			 */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Pick the mtu for this cork: device mtu when probing
		 * PMTU, otherwise the path mtu (outer path for
		 * xfrm tunnels).
		 */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		/* A socket-configured frag_size may shrink it further. */
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Queue already corked: reuse the saved state and
		 * ignore the per-call route/options arguments.
		 */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Bytes of per-fragment header: IPv6 + non-fragmentable ext hdrs. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Largest skb->len that still fragments on an 8-byte boundary. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			/* NOTE(review): this goto reaches the error label
			 * which does cork->length -= length, but length
			 * has not yet been added to cork->length (that
			 * happens below) — looks like the count can go
			 * inconsistent on this rare path; verify against
			 * upstream.
			 */
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
					   sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	/* UFO fast path: hand the whole datagram to the device. */
	if (((length > mtu) ||
	     (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			/* Without scatter-gather, allocate a full mtu so
			 * later appends can fill it linearly.
			 */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			/* First skb may block; later ones only dip into
			 * up to twice the send buffer without sleeping.
			 */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/* Move the tail bytes past the 8-byte boundary
			 * from the previous skb into this one, keeping
			 * the previous skb's checksum consistent.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area,
			 * trimming back on a partial getfrag failure.
			 */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: copy into the per-socket spare
			 * page, attaching it to the skb as a new frag (or
			 * extending the last frag if it already points at
			 * this page).
			 */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Undo the byte count for the part of this call that failed. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1570
1571 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1572 {
1573         if (np->cork.opt) {
1574                 kfree(np->cork.opt->dst0opt);
1575                 kfree(np->cork.opt->dst1opt);
1576                 kfree(np->cork.opt->hopopt);
1577                 kfree(np->cork.opt->srcrt);
1578                 kfree(np->cork.opt);
1579                 np->cork.opt = NULL;
1580         }
1581
1582         if (inet->cork.base.dst) {
1583                 dst_release(inet->cork.base.dst);
1584                 inet->cork.base.dst = NULL;
1585                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1586         }
1587         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1588 }
1589
/*
 * Coalesce all skbs queued on sk->sk_write_queue by ip6_append_data()
 * into a single packet (head skb + frag_list), prepend the extension
 * headers and the IPv6 header saved in the cork state, and transmit it
 * via ip6_local_out().  Releases the cork state in all cases.
 *
 * Returns 0 on success (or when the queue was empty), or a negative
 * errno from the transmit path.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	/* Nothing queued: just release the cork state and return 0. */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain all remaining queued skbs onto the head skb's frag_list,
	 * stripping their (duplicate) network headers and transferring
	 * their length/truesize accounting to the head skb.  Clearing
	 * destructor/sk detaches them from per-skb socket accounting;
	 * the head skb now accounts for the whole packet.
	 */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* Save the flow's daddr; ipv6_push_nfrag_opts() may rewrite
	 * final_dst to the first routing-header hop.
	 */
	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	/* Push extension headers in reverse wire order: fragmentable
	 * options first, then the non-fragmentable ones.
	 */
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	/* Build the IPv6 header from the corked flow parameters. */
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* First 32 bits: version 6, traffic class, flow label. */
	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive values are qdisc congestion codes; map them
		 * to 0/-errno before deciding whether to count a discard.
		 */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1673
1674 void ip6_flush_pending_frames(struct sock *sk)
1675 {
1676         struct sk_buff *skb;
1677
1678         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1679                 if (skb_dst(skb))
1680                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1681                                       IPSTATS_MIB_OUTDISCARDS);
1682                 kfree_skb(skb);
1683         }
1684
1685         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1686 }